1 /*
2 URL: svn://svnanon.samba.org/samba/branches/SAMBA_4_0/source/lib/tdb/common
3 Rev: 23590
4 Last Changed Date: 2007-06-22 13:36:10 -0400 (Fri, 22 Jun 2007)
5 */
6 /*
7 trivial database library - standalone version
8
9 Copyright (C) Andrew Tridgell 1999-2005
10 Copyright (C) Jeremy Allison 2000-2006
11 Copyright (C) Paul `Rusty' Russell 2000
12
13 ** NOTE! The following LGPL license applies to the tdb
14 ** library. This does NOT imply that all of Samba is released
15 ** under the LGPL
16
17 This library is free software; you can redistribute it and/or
18 modify it under the terms of the GNU Lesser General Public
19 License as published by the Free Software Foundation; either
20 version 2 of the License, or (at your option) any later version.
21
22 This library is distributed in the hope that it will be useful,
23 but WITHOUT ANY WARRANTY; without even the implied warranty of
24 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
25 Lesser General Public License for more details.
26
27 You should have received a copy of the GNU Lesser General Public
28 License along with this library; if not, write to the Free Software
29 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30 */
31
32 #ifdef CONFIG_STAND_ALONE
33 #define HAVE_MMAP
34 #define HAVE_STRDUP
35 #define HAVE_SYS_MMAN_H
36 #define HAVE_UTIME_H
37 #define HAVE_UTIME
38 #endif
39 #define _XOPEN_SOURCE 500
40
41 #include <unistd.h>
42 #include <stdio.h>
43 #include <stdlib.h>
44 #include <stdarg.h>
45 #include <stddef.h>
46 #include <errno.h>
47 #include <string.h>
48 #ifdef HAVE_SYS_SELECT_H
49 #include <sys/select.h>
50 #endif
51 #include <sys/time.h>
52 #include <sys/types.h>
53 #include <time.h>
54 #ifdef HAVE_UTIME_H
55 #include <utime.h>
56 #endif
57 #include <sys/stat.h>
58 #include <sys/file.h>
59 #include <fcntl.h>
60
61 #ifdef HAVE_SYS_MMAN_H
62 #include <sys/mman.h>
63 #endif
64
65 #ifndef MAP_FILE
66 #define MAP_FILE 0
67 #endif
68
69 #ifndef MAP_FAILED
70 #define MAP_FAILED ((void *)-1)
71 #endif
72
73 #ifndef HAVE_STRDUP
74 #define strdup rep_strdup
75 static char *rep_strdup(const char *s)
76 {
77 char *ret;
78 int length;
79 if (!s)
80 return NULL;
81
82 length = strlen(s);
84
85 ret = malloc(length + 1);
86 if (ret) {
87 strncpy(ret, s, length);
88 ret[length] = '\0';
89 }
90 return ret;
91 }
92 #endif
93
94 #ifndef PRINTF_ATTRIBUTE
95 #if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))
96 /** Use gcc attribute to check printf fns. a1 is the 1-based index of
97 * the parameter containing the format, and a2 the index of the first
98 * argument. Note that some gcc 2.x versions don't handle this
99 * properly **/
100 #define PRINTF_ATTRIBUTE(a1, a2) __attribute__ ((format (__printf__, a1, a2)))
101 #else
102 #define PRINTF_ATTRIBUTE(a1, a2)
103 #endif
104 #endif
105
106 typedef int bool;
107
108 #include "tdb.h"
109
110 #ifndef u32
111 #define u32 unsigned
112 #endif
113
114 #ifndef HAVE_GETPAGESIZE
115 #define getpagesize() 0x2000
116 #endif
117
118 typedef u32 tdb_len_t;
119 typedef u32 tdb_off_t;
120
121 #ifndef offsetof
122 #define offsetof(t,f) ((unsigned int)&((t *)0)->f)
123 #endif
124
125 #define TDB_MAGIC_FOOD "TDB file\n"
126 #define TDB_VERSION (0x26011967 + 6)
127 #define TDB_MAGIC (0x26011999U)
128 #define TDB_FREE_MAGIC (~TDB_MAGIC)
129 #define TDB_DEAD_MAGIC (0xFEE1DEAD)
130 #define TDB_RECOVERY_MAGIC (0xf53bc0e7U)
131 #define TDB_ALIGNMENT 4
132 #define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
133 #define DEFAULT_HASH_SIZE 131
134 #define FREELIST_TOP (sizeof(struct tdb_header))
135 #define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
136 #define TDB_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))
137 #define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
138 #define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
139 #define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off_t))
140 #define TDB_HASHTABLE_SIZE(tdb) ((tdb->header.hash_size+1)*sizeof(tdb_off_t))
141 #define TDB_DATA_START(hash_size) TDB_HASH_TOP(hash_size-1)
142 #define TDB_RECOVERY_HEAD offsetof(struct tdb_header, recovery_start)
143 #define TDB_SEQNUM_OFS offsetof(struct tdb_header, sequence_number)
144 #define TDB_PAD_BYTE 0x42
145 #define TDB_PAD_U32 0x42424242
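/* Worked example (illustrative; the values follow directly from the macros
   above): with TDB_ALIGNMENT == 4, TDB_ALIGN(9, TDB_ALIGNMENT) == 12 and
   TDB_ALIGN(12, TDB_ALIGNMENT) == 12, i.e. lengths round up to the next
   multiple of 4. The hash chain heads sit immediately after the header:
   TDB_HASH_TOP(hash) == sizeof(struct tdb_header) + (BUCKET(hash)+1)*4,
   where the "+1" skips the freelist head stored at FREELIST_TOP. */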
146
147 /* NB assumes there is a local variable called "tdb" that is the
148 * current context, also takes doubly-parenthesized print-style
149 * argument. */
150 #define TDB_LOG(x) tdb->log.log_fn x
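/* Example of the doubly-parenthesized calling convention described above
   (illustrative only; it mirrors the TDB_LOG() calls made throughout this
   file and assumes a "tdb" variable is in scope):

	TDB_LOG((tdb, TDB_DEBUG_ERROR, "operation failed at offset %d\n",
		 (int)offset));
*/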
151
152 /* lock offsets */
153 #define GLOBAL_LOCK 0
154 #define ACTIVE_LOCK 4
155 #define TRANSACTION_LOCK 8
156
157 /* free memory if the pointer is valid and zero the pointer */
158 #ifndef SAFE_FREE
159 #define SAFE_FREE(x) do { if ((x) != NULL) {free(x); (x)=NULL;} } while(0)
160 #endif
161
162 #define BUCKET(hash) ((hash) % tdb->header.hash_size)
163
164 #define DOCONV() (tdb->flags & TDB_CONVERT)
165 #define CONVERT(x) (DOCONV() ? tdb_convert(&x, sizeof(x)) : &x)
166
167
168 /* the body of the database is made of one list_struct for the free space
169 plus a separate data list for each hash value */
170 struct list_struct {
171 tdb_off_t next; /* offset of the next record in the list */
172 tdb_len_t rec_len; /* total byte length of record */
173 tdb_len_t key_len; /* byte length of key */
174 tdb_len_t data_len; /* byte length of data */
175 u32 full_hash; /* the full 32 bit hash of the key */
176 u32 magic; /* try to catch errors */
177 /* the following union is implied:
178 union {
179 char record[rec_len];
180 struct {
181 char key[key_len];
182 char data[data_len];
183 }
184 u32 totalsize; (tailer)
185 }
186 */
187 };
188
189
190 /* this is stored at the front of every database */
191 struct tdb_header {
192 char magic_food[32]; /* for /etc/magic */
193 u32 version; /* version of the code */
194 u32 hash_size; /* number of hash entries */
195 tdb_off_t rwlocks; /* obsolete - kept to detect old formats */
196 tdb_off_t recovery_start; /* offset of transaction recovery region */
197 tdb_off_t sequence_number; /* used when TDB_SEQNUM is set */
198 tdb_off_t reserved[29];
199 };
200
201 struct tdb_lock_type {
202 int list;
203 u32 count;
204 u32 ltype;
205 };
206
207 struct tdb_traverse_lock {
208 struct tdb_traverse_lock *next;
209 u32 off;
210 u32 hash;
211 int lock_rw;
212 };
213
214
215 struct tdb_methods {
216 int (*tdb_read)(struct tdb_context *, tdb_off_t , void *, tdb_len_t , int );
217 int (*tdb_write)(struct tdb_context *, tdb_off_t, const void *, tdb_len_t);
218 void (*next_hash_chain)(struct tdb_context *, u32 *);
219 int (*tdb_oob)(struct tdb_context *, tdb_off_t , int );
220 int (*tdb_expand_file)(struct tdb_context *, tdb_off_t , tdb_off_t );
221 int (*tdb_brlock)(struct tdb_context *, tdb_off_t , int, int, int, size_t);
222 };
223
224 struct tdb_context {
225 char *name; /* the name of the database */
226 void *map_ptr; /* where it is currently mapped */
227 int fd; /* open file descriptor for the database */
228 tdb_len_t map_size; /* how much space has been mapped */
229 int read_only; /* opened read-only */
230 int traverse_read; /* read-only traversal */
231 struct tdb_lock_type global_lock;
232 int num_lockrecs;
233 struct tdb_lock_type *lockrecs; /* only real locks, all with count>0 */
234 enum TDB_ERROR ecode; /* error code for last tdb error */
235 struct tdb_header header; /* a cached copy of the header */
236 u32 flags; /* the flags passed to tdb_open */
237 struct tdb_traverse_lock travlocks; /* current traversal locks */
238 struct tdb_context *next; /* all tdbs to avoid multiple opens */
239 dev_t device; /* uniquely identifies this tdb */
240 ino_t inode; /* uniquely identifies this tdb */
241 struct tdb_logging_context log;
242 unsigned int (*hash_fn)(TDB_DATA *key);
243 int open_flags; /* flags used in the open - needed by reopen */
244 unsigned int num_locks; /* number of chain locks held */
245 const struct tdb_methods *methods;
246 struct tdb_transaction *transaction;
247 int page_size;
248 int max_dead_records;
249 bool have_transaction_lock;
250 };
251
252
253 /*
254 internal prototypes
255 */
256 static int tdb_munmap(struct tdb_context *tdb);
257 static void tdb_mmap(struct tdb_context *tdb);
258 static int tdb_lock(struct tdb_context *tdb, int list, int ltype);
259 static int tdb_unlock(struct tdb_context *tdb, int list, int ltype);
260 static int tdb_brlock(struct tdb_context *tdb, tdb_off_t offset, int rw_type, int lck_type, int probe, size_t len);
261 static int tdb_transaction_lock(struct tdb_context *tdb, int ltype);
262 static int tdb_transaction_unlock(struct tdb_context *tdb);
263 static int tdb_brlock_upgrade(struct tdb_context *tdb, tdb_off_t offset, size_t len);
264 static int tdb_write_lock_record(struct tdb_context *tdb, tdb_off_t off);
265 static int tdb_write_unlock_record(struct tdb_context *tdb, tdb_off_t off);
266 static int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d);
267 static int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d);
268 static void *tdb_convert(void *buf, u32 size);
269 static int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec);
270 static tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct list_struct *rec);
271 static int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d);
272 static int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d);
273 static int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off);
274 static int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off);
275 static int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec);
276 static int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec);
277 static int tdb_do_delete(struct tdb_context *tdb, tdb_off_t rec_ptr, struct list_struct *rec);
278 static unsigned char *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len);
279 static int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key,
280 tdb_off_t offset, tdb_len_t len,
281 int (*parser)(TDB_DATA key, TDB_DATA data,
282 void *private_data),
283 void *private_data);
284 static tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, u32 hash, int locktype,
285 struct list_struct *rec);
286 static void tdb_io_init(struct tdb_context *tdb);
287 static int tdb_expand(struct tdb_context *tdb, tdb_off_t size);
288 static int tdb_rec_free_read(struct tdb_context *tdb, tdb_off_t off,
289 struct list_struct *rec);
290
291
292 /* file: error.c */
293
294 enum TDB_ERROR tdb_error(struct tdb_context *tdb)
295 {
296 return tdb->ecode;
297 }
298
299 static struct tdb_errname {
300 enum TDB_ERROR ecode; const char *estring;
301 } emap[] = { {TDB_SUCCESS, "Success"},
302 {TDB_ERR_CORRUPT, "Corrupt database"},
303 {TDB_ERR_IO, "IO Error"},
304 {TDB_ERR_LOCK, "Locking error"},
305 {TDB_ERR_OOM, "Out of memory"},
306 {TDB_ERR_EXISTS, "Record exists"},
307 {TDB_ERR_NOLOCK, "Lock exists on other keys"},
308 {TDB_ERR_EINVAL, "Invalid parameter"},
309 {TDB_ERR_NOEXIST, "Record does not exist"},
310 {TDB_ERR_RDONLY, "write not permitted"} };
311
312 /* Error string for the last tdb error */
313 const char *tdb_errorstr(struct tdb_context *tdb)
314 {
315 u32 i;
316 for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
317 if (tdb->ecode == emap[i].ecode)
318 return emap[i].estring;
319 return "Invalid error code";
320 }
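/* Example caller-side use of the error API (an illustrative sketch only;
   tdb_store() and TDB_DATA come from tdb.h, and "tdb", "key" and "data"
   are assumed to exist in the caller):

	if (tdb_store(tdb, key, data, TDB_REPLACE) == -1) {
		fprintf(stderr, "store failed: %s\n", tdb_errorstr(tdb));
	}
*/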
321
322 /* file: lock.c */
323
324 #define TDB_MARK_LOCK 0x80000000
325
326 /* a byte range locking function - return 0 on success
327 this functions locks/unlocks 1 byte at the specified offset.
328
329 On error, errno is also set so that errors are passed back properly
330 through tdb_open().
331
332 note that a len of zero means lock to end of file
333 */
334 int tdb_brlock(struct tdb_context *tdb, tdb_off_t offset,
335 int rw_type, int lck_type, int probe, size_t len)
336 {
337 struct flock fl;
338 int ret;
339
340 if (tdb->flags & TDB_NOLOCK) {
341 return 0;
342 }
343
344 if ((rw_type == F_WRLCK) && (tdb->read_only || tdb->traverse_read)) {
345 tdb->ecode = TDB_ERR_RDONLY;
346 return -1;
347 }
348
349 fl.l_type = rw_type;
350 fl.l_whence = SEEK_SET;
351 fl.l_start = offset;
352 fl.l_len = len;
353 fl.l_pid = 0;
354
355 do {
356 ret = fcntl(tdb->fd,lck_type,&fl);
357 } while (ret == -1 && errno == EINTR);
358
359 if (ret == -1) {
360 /* Generic lock error. errno set by fcntl.
361 * EAGAIN is an expected return from non-blocking
362 * locks. */
363 if (!probe && lck_type != F_SETLK) {
364 /* Ensure error code is set for log fn to examine. */
365 tdb->ecode = TDB_ERR_LOCK;
366 TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d len=%d\n",
367 tdb->fd, offset, rw_type, lck_type, (int)len));
368 }
369 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
370 }
371 return 0;
372 }
373
374
375 /*
376 upgrade a read lock to a write lock. This needs to be handled in a
377 special way as some OSes (such as Solaris) have overly conservative
378 deadlock detection and claim a deadlock even when progress can be
379 made. For those OSes we may loop for a while.
380 */
381 int tdb_brlock_upgrade(struct tdb_context *tdb, tdb_off_t offset, size_t len)
382 {
383 int count = 1000;
384 while (count--) {
385 struct timeval tv;
386 if (tdb_brlock(tdb, offset, F_WRLCK, F_SETLKW, 1, len) == 0) {
387 return 0;
388 }
389 if (errno != EDEADLK) {
390 break;
391 }
392 /* sleep for as short a time as we can - more portable than usleep() */
393 tv.tv_sec = 0;
394 tv.tv_usec = 1;
395 select(0, NULL, NULL, NULL, &tv);
396 }
397 TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brlock_upgrade failed at offset %d\n", offset));
398 return -1;
399 }
400
401
402 /* lock a list in the database. list -1 is the alloc list */
403 static int _tdb_lock(struct tdb_context *tdb, int list, int ltype, int op)
404 {
405 struct tdb_lock_type *new_lck;
406 int i;
407 bool mark_lock = ((ltype & TDB_MARK_LOCK) == TDB_MARK_LOCK);
408
409 ltype &= ~TDB_MARK_LOCK;
410
411 /* a global lock allows us to avoid per chain locks */
412 if (tdb->global_lock.count &&
413 (ltype == tdb->global_lock.ltype || ltype == F_RDLCK)) {
414 return 0;
415 }
416
417 if (tdb->global_lock.count) {
418 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
419 }
420
421 if (list < -1 || list >= (int)tdb->header.hash_size) {
422 TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_lock: invalid list %d for ltype=%d\n",
423 list, ltype));
424 return -1;
425 }
426 if (tdb->flags & TDB_NOLOCK)
427 return 0;
428
429 for (i=0; i<tdb->num_lockrecs; i++) {
430 if (tdb->lockrecs[i].list == list) {
431 if (tdb->lockrecs[i].count == 0) {
432 /*
433 * Can't happen, see tdb_unlock(). It should
434 * be an assert.
435 */
436 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_lock: "
437 "lck->count == 0 for list %d", list));
438 }
439 /*
440 * Just increment the in-memory struct, posix locks
441 * don't stack.
442 */
443 tdb->lockrecs[i].count++;
444 return 0;
445 }
446 }
447
448 new_lck = (struct tdb_lock_type *)realloc(
449 tdb->lockrecs,
450 sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1));
451 if (new_lck == NULL) {
452 errno = ENOMEM;
453 return -1;
454 }
455 tdb->lockrecs = new_lck;
456
457 /* Since fcntl locks don't nest, we do a lock for the first one,
458 and simply bump the count for future ones */
459 if (!mark_lock &&
460 tdb->methods->tdb_brlock(tdb,FREELIST_TOP+4*list, ltype, op,
461 0, 1)) {
462 return -1;
463 }
464
465 tdb->num_locks++;
466
467 tdb->lockrecs[tdb->num_lockrecs].list = list;
468 tdb->lockrecs[tdb->num_lockrecs].count = 1;
469 tdb->lockrecs[tdb->num_lockrecs].ltype = ltype;
470 tdb->num_lockrecs += 1;
471
472 return 0;
473 }
474
475 /* lock a list in the database. list -1 is the alloc list */
476 int tdb_lock(struct tdb_context *tdb, int list, int ltype)
477 {
478 int ret;
479 ret = _tdb_lock(tdb, list, ltype, F_SETLKW);
480 if (ret) {
481 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_lock failed on list %d "
482 "ltype=%d (%s)\n", list, ltype, strerror(errno)));
483 }
484 return ret;
485 }
486
487 /* lock a list in the database. list -1 is the alloc list. non-blocking lock */
488 int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype)
489 {
490 return _tdb_lock(tdb, list, ltype, F_SETLK);
491 }
492
493
494 /* unlock the database: returns void because it's too late for errors. */
495 /* changed to return an int as it may be interesting to know whether
496 there has been an error --simo */
497 int tdb_unlock(struct tdb_context *tdb, int list, int ltype)
498 {
499 int ret = -1;
500 int i;
501 struct tdb_lock_type *lck = NULL;
502 bool mark_lock = ((ltype & TDB_MARK_LOCK) == TDB_MARK_LOCK);
503
504 ltype &= ~TDB_MARK_LOCK;
505
506 /* a global lock allows us to avoid per chain locks */
507 if (tdb->global_lock.count &&
508 (ltype == tdb->global_lock.ltype || ltype == F_RDLCK)) {
509 return 0;
510 }
511
512 if (tdb->global_lock.count) {
513 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
514 }
515
516 if (tdb->flags & TDB_NOLOCK)
517 return 0;
518
519 /* Sanity checks */
520 if (list < -1 || list >= (int)tdb->header.hash_size) {
521 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
522 return ret;
523 }
524
525 for (i=0; i<tdb->num_lockrecs; i++) {
526 if (tdb->lockrecs[i].list == list) {
527 lck = &tdb->lockrecs[i];
528 break;
529 }
530 }
531
532 if ((lck == NULL) || (lck->count == 0)) {
533 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: count is 0\n"));
534 return -1;
535 }
536
537 if (lck->count > 1) {
538 lck->count--;
539 return 0;
540 }
541
542 /*
543 * This lock has count==1 left, so we need to unlock it in the
544 * kernel. We don't bother with decrementing the in-memory array
545 * element, we're about to overwrite it with the last array element
546 * anyway.
547 */
548
549 if (mark_lock) {
550 ret = 0;
551 } else {
552 ret = tdb->methods->tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK,
553 F_SETLKW, 0, 1);
554 }
555 tdb->num_locks--;
556
557 /*
558 * Shrink the array by overwriting the element just unlocked with the
559 * last array element.
560 */
561
562 if (tdb->num_lockrecs > 1) {
563 *lck = tdb->lockrecs[tdb->num_lockrecs-1];
564 }
565 tdb->num_lockrecs -= 1;
566
567 /*
568 * We don't bother with realloc when the array shrinks, but if we have
569 * a completely idle tdb we should get rid of the locked array.
570 */
571
572 if (tdb->num_lockrecs == 0) {
573 SAFE_FREE(tdb->lockrecs);
574 }
575
576 if (ret)
577 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: An error occurred unlocking!\n"));
578 return ret;
579 }
580
581 /*
582 get the transaction lock
583 */
584 int tdb_transaction_lock(struct tdb_context *tdb, int ltype)
585 {
586 if (tdb->have_transaction_lock || tdb->global_lock.count) {
587 return 0;
588 }
589 if (tdb->methods->tdb_brlock(tdb, TRANSACTION_LOCK, ltype,
590 F_SETLKW, 0, 1) == -1) {
591 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_lock: failed to get transaction lock\n"));
592 tdb->ecode = TDB_ERR_LOCK;
593 return -1;
594 }
595 tdb->have_transaction_lock = 1;
596 return 0;
597 }
598
599 /*
600 release the transaction lock
601 */
602 int tdb_transaction_unlock(struct tdb_context *tdb)
603 {
604 int ret;
605 if (!tdb->have_transaction_lock) {
606 return 0;
607 }
608 ret = tdb->methods->tdb_brlock(tdb, TRANSACTION_LOCK, F_UNLCK, F_SETLKW, 0, 1);
609 if (ret == 0) {
610 tdb->have_transaction_lock = 0;
611 }
612 return ret;
613 }
614
615
616
617
618 /* lock/unlock entire database */
619 static int _tdb_lockall(struct tdb_context *tdb, int ltype, int op)
620 {
621 bool mark_lock = ((ltype & TDB_MARK_LOCK) == TDB_MARK_LOCK);
622
623 ltype &= ~TDB_MARK_LOCK;
624
625 /* There are no locks on read-only dbs */
626 if (tdb->read_only || tdb->traverse_read)
627 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
628
629 if (tdb->global_lock.count && tdb->global_lock.ltype == ltype) {
630 tdb->global_lock.count++;
631 return 0;
632 }
633
634 if (tdb->global_lock.count) {
635 /* a global lock of a different type exists */
636 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
637 }
638
639 if (tdb->num_locks != 0) {
640 /* can't combine global and chain locks */
641 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
642 }
643
644 if (!mark_lock &&
645 tdb->methods->tdb_brlock(tdb, FREELIST_TOP, ltype, op,
646 0, 4*tdb->header.hash_size)) {
647 if (op == F_SETLKW) {
648 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_lockall failed (%s)\n", strerror(errno)));
649 }
650 return -1;
651 }
652
653 tdb->global_lock.count = 1;
654 tdb->global_lock.ltype = ltype;
655
656 return 0;
657 }
658
659
660
661 /* unlock entire db */
662 static int _tdb_unlockall(struct tdb_context *tdb, int ltype)
663 {
664 bool mark_lock = ((ltype & TDB_MARK_LOCK) == TDB_MARK_LOCK);
665
666 ltype &= ~TDB_MARK_LOCK;
667
668 /* There are no locks on read-only dbs */
669 if (tdb->read_only || tdb->traverse_read) {
670 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
671 }
672
673 if (tdb->global_lock.ltype != ltype || tdb->global_lock.count == 0) {
674 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
675 }
676
677 if (tdb->global_lock.count > 1) {
678 tdb->global_lock.count--;
679 return 0;
680 }
681
682 if (!mark_lock &&
683 tdb->methods->tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW,
684 0, 4*tdb->header.hash_size)) {
685 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlockall failed (%s)\n", strerror(errno)));
686 return -1;
687 }
688
689 tdb->global_lock.count = 0;
690 tdb->global_lock.ltype = 0;
691
692 return 0;
693 }
694
695 /* lock entire database with write lock */
696 int tdb_lockall(struct tdb_context *tdb)
697 {
698 return _tdb_lockall(tdb, F_WRLCK, F_SETLKW);
699 }
700
701 /* lock entire database with write lock - mark only */
702 int tdb_lockall_mark(struct tdb_context *tdb)
703 {
704 return _tdb_lockall(tdb, F_WRLCK | TDB_MARK_LOCK, F_SETLKW);
705 }
706
707 /* unlock entire database with write lock - unmark only */
708 int tdb_lockall_unmark(struct tdb_context *tdb)
709 {
710 return _tdb_unlockall(tdb, F_WRLCK | TDB_MARK_LOCK);
711 }
712
713 /* lock entire database with write lock - non-blocking variant */
714 int tdb_lockall_nonblock(struct tdb_context *tdb)
715 {
716 return _tdb_lockall(tdb, F_WRLCK, F_SETLK);
717 }
718
719 /* unlock entire database with write lock */
720 int tdb_unlockall(struct tdb_context *tdb)
721 {
722 return _tdb_unlockall(tdb, F_WRLCK);
723 }
724
725 /* lock entire database with read lock */
726 int tdb_lockall_read(struct tdb_context *tdb)
727 {
728 return _tdb_lockall(tdb, F_RDLCK, F_SETLKW);
729 }
730
731 /* lock entire database with read lock - non-blocking variant */
732 int tdb_lockall_read_nonblock(struct tdb_context *tdb)
733 {
734 return _tdb_lockall(tdb, F_RDLCK, F_SETLK);
735 }
736
737 /* unlock entire database with read lock */
738 int tdb_unlockall_read(struct tdb_context *tdb)
739 {
740 return _tdb_unlockall(tdb, F_RDLCK);
741 }
742
743 /* lock/unlock one hash chain. This is meant to be used to reduce
744 contention - it cannot guarantee how many records will be locked */
745 int tdb_chainlock(struct tdb_context *tdb, TDB_DATA key)
746 {
747 return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
748 }
749
750 /* lock/unlock one hash chain, non-blocking. This is meant to be used
751 to reduce contention - it cannot guarantee how many records will be
752 locked */
753 int tdb_chainlock_nonblock(struct tdb_context *tdb, TDB_DATA key)
754 {
755 return tdb_lock_nonblock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
756 }
757
758 /* mark a chain as locked without actually locking it. Warning! use with great caution! */
759 int tdb_chainlock_mark(struct tdb_context *tdb, TDB_DATA key)
760 {
761 return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK | TDB_MARK_LOCK);
762 }
763
764 /* unmark a chain as locked without actually locking it. Warning! use with great caution! */
765 int tdb_chainlock_unmark(struct tdb_context *tdb, TDB_DATA key)
766 {
767 return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK | TDB_MARK_LOCK);
768 }
769
770 int tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key)
771 {
772 return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
773 }
774
775 int tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key)
776 {
777 return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
778 }
779
780 int tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key)
781 {
782 return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
783 }
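/* Example of the chainlock pattern for an atomic read-modify-write of one
   key (an illustrative sketch; tdb_fetch()/tdb_store() are the public
   functions declared in tdb.h, error handling elided):

	tdb_chainlock(tdb, key);
	data = tdb_fetch(tdb, key);
	... modify data.dptr / data.dsize ...
	tdb_store(tdb, key, data, TDB_REPLACE);
	free(data.dptr);
	tdb_chainunlock(tdb, key);
*/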
784
785
786
787 /* record lock stops delete underneath */
788 int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off)
789 {
790 return off ? tdb->methods->tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0, 1) : 0;
791 }
792
793 /*
794 Write locks override our own fcntl readlocks, so check it here.
795 Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
796 an error to fail to get the lock here.
797 */
798 int tdb_write_lock_record(struct tdb_context *tdb, tdb_off_t off)
799 {
800 struct tdb_traverse_lock *i;
801 for (i = &tdb->travlocks; i; i = i->next)
802 if (i->off == off)
803 return -1;
804 return tdb->methods->tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1, 1);
805 }
806
807 /*
808 Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
809 an error to fail to get the lock here.
810 */
811 int tdb_write_unlock_record(struct tdb_context *tdb, tdb_off_t off)
812 {
813 return tdb->methods->tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0, 1);
814 }
815
816 /* fcntl locks don't stack: avoid unlocking someone else's */
817 int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off)
818 {
819 struct tdb_traverse_lock *i;
820 u32 count = 0;
821
822 if (off == 0)
823 return 0;
824 for (i = &tdb->travlocks; i; i = i->next)
825 if (i->off == off)
826 count++;
827 return (count == 1 ? tdb->methods->tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0, 1) : 0);
828 }
829
830 /* file: io.c */
831
832 /* check for an out of bounds access - if it is out of bounds then
833 see if the database has been expanded by someone else and expand
834 if necessary
835 note that "len" is the minimum length needed for the db
836 */
837 static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, int probe)
838 {
839 struct stat st;
840 if (len <= tdb->map_size)
841 return 0;
842 if (tdb->flags & TDB_INTERNAL) {
843 if (!probe) {
844 /* Ensure ecode is set for log fn. */
845 tdb->ecode = TDB_ERR_IO;
846 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob len %d beyond internal malloc size %d\n",
847 (int)len, (int)tdb->map_size));
848 }
849 return TDB_ERRCODE(TDB_ERR_IO, -1);
850 }
851
852 if (fstat(tdb->fd, &st) == -1) {
853 return TDB_ERRCODE(TDB_ERR_IO, -1);
854 }
855
856 if (st.st_size < (size_t)len) {
857 if (!probe) {
858 /* Ensure ecode is set for log fn. */
859 tdb->ecode = TDB_ERR_IO;
860 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob len %d beyond eof at %d\n",
861 (int)len, (int)st.st_size));
862 }
863 return TDB_ERRCODE(TDB_ERR_IO, -1);
864 }
865
866 /* Unmap, update size, remap */
867 if (tdb_munmap(tdb) == -1)
868 return TDB_ERRCODE(TDB_ERR_IO, -1);
869 tdb->map_size = st.st_size;
870 tdb_mmap(tdb);
871 return 0;
872 }
873
874 /* write a lump of data at a specified offset */
875 static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
876 const void *buf, tdb_len_t len)
877 {
878 if (len == 0) {
879 return 0;
880 }
881
882 if (tdb->read_only || tdb->traverse_read) {
883 tdb->ecode = TDB_ERR_RDONLY;
884 return -1;
885 }
886
887 if (tdb->methods->tdb_oob(tdb, off + len, 0) != 0)
888 return -1;
889
890 if (tdb->map_ptr) {
891 memcpy(off + (char *)tdb->map_ptr, buf, len);
892 } else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
893 /* Ensure ecode is set for log fn. */
894 tdb->ecode = TDB_ERR_IO;
895 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_write failed at %d len=%d (%s)\n",
896 off, len, strerror(errno)));
897 return TDB_ERRCODE(TDB_ERR_IO, -1);
898 }
899 return 0;
900 }
901
902 /* Endian conversion: we only ever deal with 4 byte quantities */
903 void *tdb_convert(void *buf, u32 size)
904 {
905 u32 i, *p = (u32 *)buf;
906 for (i = 0; i < size / 4; i++)
907 p[i] = TDB_BYTEREV(p[i]);
908 return buf;
909 }
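/* Worked example: TDB_BYTEREV(0x12345678) == 0x78563412, so when
   TDB_CONVERT is set in tdb->flags every 4 byte quantity is reversed in
   place by tdb_convert() before it is used. */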
910
911
912 /* read a lump of data at a specified offset, maybe convert */
913 static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
914 tdb_len_t len, int cv)
915 {
916 if (tdb->methods->tdb_oob(tdb, off + len, 0) != 0) {
917 return -1;
918 }
919
920 if (tdb->map_ptr) {
921 memcpy(buf, off + (char *)tdb->map_ptr, len);
922 } else {
923 ssize_t ret = pread(tdb->fd, buf, len, off);
924 if (ret != (ssize_t)len) {
925 /* Ensure ecode is set for log fn. */
926 tdb->ecode = TDB_ERR_IO;
927 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_read failed at %d "
928 "len=%d ret=%d (%s) map_size=%d\n",
929 (int)off, (int)len, (int)ret, strerror(errno),
930 (int)tdb->map_size));
931 return TDB_ERRCODE(TDB_ERR_IO, -1);
932 }
933 }
934 if (cv) {
935 tdb_convert(buf, len);
936 }
937 return 0;
938 }
939
940
941
942 /*
943 do an unlocked scan of the hash table heads to find the next non-zero head. The value
944 will then be confirmed with the lock held
945 */
946 static void tdb_next_hash_chain(struct tdb_context *tdb, u32 *chain)
947 {
948 u32 h = *chain;
949 if (tdb->map_ptr) {
950 for (;h < tdb->header.hash_size;h++) {
951 if (0 != *(u32 *)(TDB_HASH_TOP(h) + (unsigned char *)tdb->map_ptr)) {
952 break;
953 }
954 }
955 } else {
956 u32 off=0;
957 for (;h < tdb->header.hash_size;h++) {
958 if (tdb_ofs_read(tdb, TDB_HASH_TOP(h), &off) != 0 || off != 0) {
959 break;
960 }
961 }
962 }
963 (*chain) = h;
964 }
965
966
967 int tdb_munmap(struct tdb_context *tdb)
968 {
969 if (tdb->flags & TDB_INTERNAL)
970 return 0;
971
972 #ifdef HAVE_MMAP
973 if (tdb->map_ptr) {
974 int ret = munmap(tdb->map_ptr, tdb->map_size);
975 if (ret != 0)
976 return ret;
977 }
978 #endif
979 tdb->map_ptr = NULL;
980 return 0;
981 }
982
983 void tdb_mmap(struct tdb_context *tdb)
984 {
985 if (tdb->flags & TDB_INTERNAL)
986 return;
987
988 #ifdef HAVE_MMAP
989 if (!(tdb->flags & TDB_NOMMAP)) {
990 tdb->map_ptr = mmap(NULL, tdb->map_size,
991 PROT_READ|(tdb->read_only? 0:PROT_WRITE),
992 MAP_SHARED|MAP_FILE, tdb->fd, 0);
993
994 /*
995 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
996 */
997
998 if (tdb->map_ptr == MAP_FAILED) {
999 tdb->map_ptr = NULL;
1000 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_mmap failed for size %d (%s)\n",
1001 tdb->map_size, strerror(errno)));
1002 }
1003 } else {
1004 tdb->map_ptr = NULL;
1005 }
1006 #else
1007 tdb->map_ptr = NULL;
1008 #endif
1009 }
1010
1011 /* expand a file. we prefer to use ftruncate, as that is what posix
1012 says to use for mmap expansion */
1013 static int tdb_expand_file(struct tdb_context *tdb, tdb_off_t size, tdb_off_t addition)
1014 {
1015 char buf[1024];
1016
1017 if (tdb->read_only || tdb->traverse_read) {
1018 tdb->ecode = TDB_ERR_RDONLY;
1019 return -1;
1020 }
1021
1022 if (ftruncate(tdb->fd, size+addition) == -1) {
1023 char b = 0;
1024 if (pwrite(tdb->fd, &b, 1, (size+addition) - 1) != 1) {
1025 TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file to %d failed (%s)\n",
1026 size+addition, strerror(errno)));
1027 return -1;
1028 }
1029 }
1030
1031 /* now fill the file with something. This ensures that the
1032 file isn't sparse, which would be very bad if we ran out of
1033 disk. This must be done with write, not via mmap */
1034 memset(buf, TDB_PAD_BYTE, sizeof(buf));
1035 while (addition) {
1036 int n = addition>sizeof(buf)?sizeof(buf):addition;
1037 int ret = pwrite(tdb->fd, buf, n, size);
1038 if (ret != n) {
1039 TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write of %d failed (%s)\n",
1040 n, strerror(errno)));
1041 return -1;
1042 }
1043 addition -= n;
1044 size += n;
1045 }
1046 return 0;
1047 }
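/* Worked example: for addition == 2500 the loop above issues pwrite()s of
   1024, 1024 and 452 bytes of TDB_PAD_BYTE, so the new region is written
   in full and never left sparse. */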
1048
1049
1050 /* expand the database by at least size bytes by expanding the underlying
1051 file and doing the mmap again if necessary */
1052 int tdb_expand(struct tdb_context *tdb, tdb_off_t size)
1053 {
1054 struct list_struct rec;
1055 tdb_off_t offset;
1056
1057 if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
1058 TDB_LOG((tdb, TDB_DEBUG_ERROR, "lock failed in tdb_expand\n"));
1059 return -1;
1060 }
1061
1062 /* must know about any previous expansions by another process */
1063 tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);
1064
1065 /* always make room for at least 10 more records, and round
1066 the database up to a multiple of the page size */
1067 size = TDB_ALIGN(tdb->map_size + size*10, tdb->page_size) - tdb->map_size;
1068
1069 if (!(tdb->flags & TDB_INTERNAL))
1070 tdb_munmap(tdb);
1071
1072 /*
1073 * We must ensure the file is unmapped before doing this
1074 * to ensure consistency with systems like OpenBSD where
1075 * writes and mmaps are not consistent.
1076 */
1077
1078 /* expand the file itself */
1079 if (!(tdb->flags & TDB_INTERNAL)) {
1080 if (tdb->methods->tdb_expand_file(tdb, tdb->map_size, size) != 0)
1081 goto fail;
1082 }
1083
1084 tdb->map_size += size;
1085
1086 if (tdb->flags & TDB_INTERNAL) {
1087 char *new_map_ptr = (char *)realloc(tdb->map_ptr,
1088 tdb->map_size);
1089 if (!new_map_ptr) {
1090 tdb->map_size -= size;
1091 goto fail;
1092 }
1093 tdb->map_ptr = new_map_ptr;
1094 } else {
1095 /*
1096 * We must ensure the file is remapped before adding the space
1097 * to ensure consistency with systems like OpenBSD where
1098 * writes and mmaps are not consistent.
1099 */
1100
1101 /* We're ok if the mmap fails as we'll fall back to read/write */
1102 tdb_mmap(tdb);
1103 }
1104
1105 /* form a new freelist record */
1106 memset(&rec,'\0',sizeof(rec));
1107 rec.rec_len = size - sizeof(rec);
1108
1109 /* link it into the free list */
1110 offset = tdb->map_size - size;
1111 if (tdb_free(tdb, offset, &rec) == -1)
1112 goto fail;
1113
1114 tdb_unlock(tdb, -1, F_WRLCK);
1115 return 0;
1116 fail:
1117 tdb_unlock(tdb, -1, F_WRLCK);
1118 return -1;
1119 }
1120
1121 /* read/write a tdb_off_t */
1122 int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
1123 {
1124 return tdb->methods->tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
1125 }
1126
1127 int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
1128 {
1129 tdb_off_t off = *d;
1130 return tdb->methods->tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
1131 }
1132
1133
1134 /* read a lump of data, allocating the space for it */
1135 unsigned char *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
1136 {
1137 unsigned char *buf;
1138
1139 /* some systems don't like zero length malloc */
1140 if (len == 0) {
1141 len = 1;
1142 }
1143
1144 if (!(buf = (unsigned char *)malloc(len))) {
1145 /* Ensure ecode is set for log fn. */
1146 tdb->ecode = TDB_ERR_OOM;
1147 TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_alloc_read malloc failed len=%d (%s)\n",
1148 len, strerror(errno)));
1149 return TDB_ERRCODE(TDB_ERR_OOM, buf);
1150 }
1151 if (tdb->methods->tdb_read(tdb, offset, buf, len, 0) == -1) {
1152 SAFE_FREE(buf);
1153 return NULL;
1154 }
1155 return buf;
1156 }
1157
1158 /* Give a piece of tdb data to a parser */
1159
1160 int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key,
1161 tdb_off_t offset, tdb_len_t len,
1162 int (*parser)(TDB_DATA key, TDB_DATA data,
1163 void *private_data),
1164 void *private_data)
1165 {
1166 TDB_DATA data;
1167 int result;
1168
1169 data.dsize = len;
1170
1171 if ((tdb->transaction == NULL) && (tdb->map_ptr != NULL)) {
1172 /*
1173 * Optimize by avoiding the malloc/memcpy/free, point the
1174 * parser directly at the mmap area.
1175 */
1176 if (tdb->methods->tdb_oob(tdb, offset+len, 0) != 0) {
1177 return -1;
1178 }
1179 data.dptr = offset + (unsigned char *)tdb->map_ptr;
1180 return parser(key, data, private_data);
1181 }
1182
1183 if (!(data.dptr = tdb_alloc_read(tdb, offset, len))) {
1184 return -1;
1185 }
1186
1187 result = parser(key, data, private_data);
1188 free(data.dptr);
1189 return result;
1190 }
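/* Example of a parser callback of the shape consumed by tdb_parse_data()
   (an illustrative sketch; the public entry point tdb_parse_record() in
   tdb.h ends up here):

	static int count_bytes(TDB_DATA key, TDB_DATA data, void *private_data)
	{
		size_t *total = (size_t *)private_data;
		*total += data.dsize;
		return 0;
	}

   The callback may read data.dptr directly - it can point straight into
   the mmap'd file - but must not keep the pointer after returning. */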
1191
1192 /* read/write a record */
1193 int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec)
1194 {
1195 if (tdb->methods->tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
1196 return -1;
1197 if (TDB_BAD_MAGIC(rec)) {
1198 /* Ensure ecode is set for log fn. */
1199 tdb->ecode = TDB_ERR_CORRUPT;
1200 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
1201 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
1202 }
1203 return tdb->methods->tdb_oob(tdb, rec->next+sizeof(*rec), 0);
1204 }
1205
1206 int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec)
1207 {
1208 struct list_struct r = *rec;
1209 return tdb->methods->tdb_write(tdb, offset, CONVERT(r), sizeof(r));
1210 }
1211
1212 static const struct tdb_methods io_methods = {
1213 tdb_read,
1214 tdb_write,
1215 tdb_next_hash_chain,
1216 tdb_oob,
1217 tdb_expand_file,
1218 tdb_brlock
1219 };
1220
1221 /*
1222 initialise the default methods table
1223 */
1224 void tdb_io_init(struct tdb_context *tdb)
1225 {
1226 tdb->methods = &io_methods;
1227 }
1228
1229 /* file: transaction.c */
1230
1231 /*
1232 transaction design:
1233
1234 - only allow a single transaction at a time per database. This makes
1235 using the transaction API simpler, as otherwise the caller would
1236 have to cope with temporary failures in transactions that conflict
1237 with other current transactions
1238
1239 - keep the transaction recovery information in the same file as the
1240 database, using a special 'transaction recovery' record pointed at
1241 by the header. This removes the need for extra journal files as
1242 used by some other databases
1243
1244 - dynamically allocate the transaction recovery record, re-using it
1245 for subsequent transactions. If a larger record is needed then
1246 tdb_free() the old record to place it on the normal tdb freelist
1247 before allocating the new record
1248
1249 - during transactions, keep a linked list of all writes that have
1250 been performed by intercepting all tdb_write() calls. The hooked
1251 transaction versions of tdb_read() and tdb_write() check this
1252 linked list and try to use the elements of the list in preference
1253 to the real database.
1254
1255 - don't allow any locks to be held when a transaction starts,
1256 otherwise we can end up with deadlock (plus lack of lock nesting
1257 in posix locks would mean the lock is lost)
1258
1259 - if the caller gains a lock during the transaction but doesn't
1260 release it then fail the commit
1261
1262 - allow for nested calls to tdb_transaction_start(), re-using the
1263 existing transaction record. If the inner transaction is cancelled
1264 then a subsequent commit will fail
1265
1266 - keep a mirrored copy of the tdb hash chain heads to allow for the
1267 fast hash heads scan on traverse, updating the mirrored copy in
1268 the transaction version of tdb_write
1269
1270 - allow callers to mix transaction and non-transaction use of tdb,
1271 although once a transaction is started then an exclusive lock is
1272 gained until the transaction is committed or cancelled
1273
1274 - the commit strategy involves first saving away all modified data
1275 into a linearised buffer in the transaction recovery area, then
1276 marking the transaction recovery area with a magic value to
1277 indicate a valid recovery record. In total 4 fsync/msync calls are
1278 needed per commit to prevent race conditions. It might be possible
1279 to reduce this to 3 or even 2 with some more work.
1280
1281 - check for a valid recovery record on open of the tdb, while the
1282 global lock is held. Automatically recover from the transaction
1283 recovery area if needed, then continue with the open as
1284 usual. This allows for smooth crash recovery with no administrator
1285 intervention.
1286
1287 - if TDB_NOSYNC is passed to flags in tdb_open then transactions are
1288 still available, but no transaction recovery area is used and no
1289 fsync/msync calls are made.
1290
1291 */
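/* Example of the public transaction API built on this design (an
   illustrative sketch; tdb_transaction_commit() is declared in tdb.h and
   error handling is abbreviated):

	if (tdb_transaction_start(tdb) == -1)
		return -1;
	if (tdb_store(tdb, key1, data1, TDB_REPLACE) == -1 ||
	    tdb_store(tdb, key2, data2, TDB_REPLACE) == -1) {
		tdb_transaction_cancel(tdb);
		return -1;
	}
	return tdb_transaction_commit(tdb);
*/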
1292
1293 struct tdb_transaction_el {
1294 struct tdb_transaction_el *next, *prev;
1295 tdb_off_t offset;
1296 tdb_len_t length;
1297 unsigned char *data;
1298 };
1299
1300 /*
1301 hold the context of any current transaction
1302 */
1303 struct tdb_transaction {
1304 /* we keep a mirrored copy of the tdb hash heads here so
1305 tdb_next_hash_chain() can operate efficiently */
1306 u32 *hash_heads;
1307
1308 /* the original io methods - used to do IOs to the real db */
1309 const struct tdb_methods *io_methods;
1310
1311 /* the list of transaction elements. We use a doubly linked
1312 list with a last pointer to allow us to keep the list
1313 ordered, with first element at the front of the list. It
1314 needs to be doubly linked as the read/write traversals need
1315 to be backwards, while the commit needs to be forwards */
1316 struct tdb_transaction_el *elements, *elements_last;
1317
1318 /* non-zero when an internal transaction error has
1319 occurred. All write operations will then fail until the
1320 transaction is ended */
1321 int transaction_error;
1322
1323 /* when inside a transaction we need to keep track of any
1324 nested tdb_transaction_start() calls, as these are allowed,
1325 but don't create a new transaction */
1326 int nesting;
1327
1328 /* old file size before transaction */
1329 tdb_len_t old_map_size;
1330 };
1331
1332
1333 /*
1334 read while in a transaction. We need to check first if the data is in our list
1335 of transaction elements, then if not do a real read
1336 */
1337 static int transaction_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
1338 tdb_len_t len, int cv)
1339 {
1340 struct tdb_transaction_el *el;
1341
1342 /* we need to walk the list backwards to get the most recent data */
1343 for (el=tdb->transaction->elements_last;el;el=el->prev) {
1344 tdb_len_t partial;
1345
1346 if (off+len <= el->offset) {
1347 continue;
1348 }
1349 if (off >= el->offset + el->length) {
1350 continue;
1351 }
1352
1353 /* an overlapping read - needs to be split into up to
1354 2 reads and a memcpy */
1355 if (off < el->offset) {
1356 partial = el->offset - off;
1357 if (transaction_read(tdb, off, buf, partial, cv) != 0) {
1358 goto fail;
1359 }
1360 len -= partial;
1361 off += partial;
1362 buf = (void *)(partial + (char *)buf);
1363 }
1364 if (off + len <= el->offset + el->length) {
1365 partial = len;
1366 } else {
1367 partial = el->offset + el->length - off;
1368 }
1369 memcpy(buf, el->data + (off - el->offset), partial);
1370 if (cv) {
1371 tdb_convert(buf, len);
1372 }
1373 len -= partial;
1374 off += partial;
1375 buf = (void *)(partial + (char *)buf);
1376
1377 if (len != 0 && transaction_read(tdb, off, buf, len, cv) != 0) {
1378 goto fail;
1379 }
1380
1381 return 0;
1382 }
1383
1384 /* it's not in the transaction elements - do a real read */
1385 return tdb->transaction->io_methods->tdb_read(tdb, off, buf, len, cv);
1386
1387 fail:
1388 TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_read: failed at off=%d len=%d\n", off, len));
1389 tdb->ecode = TDB_ERR_IO;
1390 tdb->transaction->transaction_error = 1;
1391 return -1;
1392 }
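/* Worked example of the split above: if a transaction element covers
   offsets [100, 120) and the caller reads [90, 130), the code first
   recurses to read [90, 100) (from older elements or the real file), then
   memcpy()s [100, 120) out of el->data, then recurses again for the
   remaining [120, 130). */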
1393
1394
1395 /*
1396 write while in a transaction
1397 */
1398 static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
1399 const void *buf, tdb_len_t len)
1400 {
1401 struct tdb_transaction_el *el, *best_el=NULL;
1402
1403 if (len == 0) {
1404 return 0;
1405 }
1406
1407 /* if the write is to a hash head, then update the transaction
1408 hash heads */
1409 if (len == sizeof(tdb_off_t) && off >= FREELIST_TOP &&
1410 off < FREELIST_TOP+TDB_HASHTABLE_SIZE(tdb)) {
1411 u32 chain = (off-FREELIST_TOP) / sizeof(tdb_off_t);
1412 memcpy(&tdb->transaction->hash_heads[chain], buf, len);
1413 }
1414
1415 /* first see if we can replace an existing entry */
1416 for (el=tdb->transaction->elements_last;el;el=el->prev) {
1417 tdb_len_t partial;
1418
1419 if (best_el == NULL && off == el->offset+el->length) {
1420 best_el = el;
1421 }
1422
1423 if (off+len <= el->offset) {
1424 continue;
1425 }
1426 if (off >= el->offset + el->length) {
1427 continue;
1428 }
1429
1430 /* an overlapping write - needs to be split into up to
1431 2 writes and a memcpy */
1432 if (off < el->offset) {
1433 partial = el->offset - off;
1434 if (transaction_write(tdb, off, buf, partial) != 0) {
1435 goto fail;
1436 }
1437 len -= partial;
1438 off += partial;
1439 buf = (const void *)(partial + (const char *)buf);
1440 }
1441 if (off + len <= el->offset + el->length) {
1442 partial = len;
1443 } else {
1444 partial = el->offset + el->length - off;
1445 }
1446 memcpy(el->data + (off - el->offset), buf, partial);
1447 len -= partial;
1448 off += partial;
1449 buf = (const void *)(partial + (const char *)buf);
1450
1451 if (len != 0 && transaction_write(tdb, off, buf, len) != 0) {
1452 goto fail;
1453 }
1454
1455 return 0;
1456 }
1457
1458 /* see if we can append the new entry to an existing entry */
1459 if (best_el && best_el->offset + best_el->length == off &&
1460 (off+len < tdb->transaction->old_map_size ||
1461 off > tdb->transaction->old_map_size)) {
1462 unsigned char *data = best_el->data;
1463 el = best_el;
1464 el->data = (unsigned char *)realloc(el->data,
1465 el->length + len);
1466 if (el->data == NULL) {
1467 tdb->ecode = TDB_ERR_OOM;
1468 tdb->transaction->transaction_error = 1;
1469 el->data = data;
1470 return -1;
1471 }
1472 if (buf) {
1473 memcpy(el->data + el->length, buf, len);
1474 } else {
1475 memset(el->data + el->length, TDB_PAD_BYTE, len);
1476 }
1477 el->length += len;
1478 return 0;
1479 }
1480
1481 /* add a new entry at the end of the list */
1482 el = (struct tdb_transaction_el *)malloc(sizeof(*el));
1483 if (el == NULL) {
1484 tdb->ecode = TDB_ERR_OOM;
1485 tdb->transaction->transaction_error = 1;
1486 return -1;
1487 }
1488 el->next = NULL;
1489 el->prev = tdb->transaction->elements_last;
1490 el->offset = off;
1491 el->length = len;
1492 el->data = (unsigned char *)malloc(len);
1493 if (el->data == NULL) {
1494 free(el);
1495 tdb->ecode = TDB_ERR_OOM;
1496 tdb->transaction->transaction_error = 1;
1497 return -1;
1498 }
1499 if (buf) {
1500 memcpy(el->data, buf, len);
1501 } else {
1502 memset(el->data, TDB_PAD_BYTE, len);
1503 }
1504 if (el->prev) {
1505 el->prev->next = el;
1506 } else {
1507 tdb->transaction->elements = el;
1508 }
1509 tdb->transaction->elements_last = el;
1510 return 0;
1511
1512 fail:
1513 TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_write: failed at off=%d len=%d\n", off, len));
1514 tdb->ecode = TDB_ERR_IO;
1515 tdb->transaction->transaction_error = 1;
1516 return -1;
1517 }
1518
1519 /*
1520 accelerated hash chain head search, using the cached hash heads
1521 */
1522 static void transaction_next_hash_chain(struct tdb_context *tdb, u32 *chain)
1523 {
1524 u32 h = *chain;
1525 for (;h < tdb->header.hash_size;h++) {
1526 /* the +1 takes account of the freelist */
1527 if (0 != tdb->transaction->hash_heads[h+1]) {
1528 break;
1529 }
1530 }
1531 (*chain) = h;
1532 }
1533
1534 /*
1535 out of bounds check during a transaction
1536 */
1537 static int transaction_oob(struct tdb_context *tdb, tdb_off_t len, int probe)
1538 {
1539 if (len <= tdb->map_size) {
1540 return 0;
1541 }
1542 return TDB_ERRCODE(TDB_ERR_IO, -1);
1543 }
1544
1545 /*
1546 transaction version of tdb_expand().
1547 */
1548 static int transaction_expand_file(struct tdb_context *tdb, tdb_off_t size,
1549 tdb_off_t addition)
1550 {
1551 /* add a write to the transaction elements, so subsequent
1552 reads see the zero data */
1553 if (transaction_write(tdb, size, NULL, addition) != 0) {
1554 return -1;
1555 }
1556
1557 return 0;
1558 }
1559
1560 /*
1561 brlock during a transaction - ignore them
1562 */
1563 static int transaction_brlock(struct tdb_context *tdb, tdb_off_t offset,
1564 int rw_type, int lck_type, int probe, size_t len)
1565 {
1566 return 0;
1567 }
1568
1569 static const struct tdb_methods transaction_methods = {
1570 transaction_read,
1571 transaction_write,
1572 transaction_next_hash_chain,
1573 transaction_oob,
1574 transaction_expand_file,
1575 transaction_brlock
1576 };
1577
1578
1579 /*
1580 start a tdb transaction. No token is returned, as only a single
1581 transaction is allowed to be pending per tdb_context
1582 */
1583 int tdb_transaction_start(struct tdb_context *tdb)
1584 {
1585 /* some sanity checks */
1586 if (tdb->read_only || (tdb->flags & TDB_INTERNAL) || tdb->traverse_read) {
1587 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction on a read-only or internal db\n"));
1588 tdb->ecode = TDB_ERR_EINVAL;
1589 return -1;
1590 }
1591
1592 /* cope with nested tdb_transaction_start() calls */
1593 if (tdb->transaction != NULL) {
1594 tdb->transaction->nesting++;
1595 TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_start: nesting %d\n",
1596 tdb->transaction->nesting));
1597 return 0;
1598 }
1599
1600 if (tdb->num_locks != 0 || tdb->global_lock.count) {
1601 /* the caller must not have any locks when starting a
1602 transaction as otherwise we'll be screwed by lack
1603 of nested locks in posix */
1604 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction with locks held\n"));
1605 tdb->ecode = TDB_ERR_LOCK;
1606 return -1;
1607 }
1608
1609 if (tdb->travlocks.next != NULL) {
1610 /* you cannot use transactions inside a traverse (although you can use
1611 traverse inside a transaction) as otherwise you can end up with
1612 deadlock */
1613 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction within a traverse\n"));
1614 tdb->ecode = TDB_ERR_LOCK;
1615 return -1;
1616 }
1617
1618 tdb->transaction = (struct tdb_transaction *)
1619 calloc(sizeof(struct tdb_transaction), 1);
1620 if (tdb->transaction == NULL) {
1621 tdb->ecode = TDB_ERR_OOM;
1622 return -1;
1623 }
1624
1625 /* get the transaction write lock. This is a blocking lock. As
1626 discussed with Volker, there are a number of ways we could
1627 make this async, which we will probably do in the future */
1628 if (tdb_transaction_lock(tdb, F_WRLCK) == -1) {
1629 SAFE_FREE(tdb->transaction);
1630 return -1;
1631 }
1632
1633 /* get a read lock from the freelist to the end of file. This
1634 is upgraded to a write lock during the commit */
1635 if (tdb_brlock(tdb, FREELIST_TOP, F_RDLCK, F_SETLKW, 0, 0) == -1) {
1636 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: failed to get hash locks\n"));
1637 tdb->ecode = TDB_ERR_LOCK;
1638 goto fail;
1639 }
1640
1641 /* setup a copy of the hash table heads so the hash scan in
1642 traverse can be fast */
1643 tdb->transaction->hash_heads = (u32 *)
1644 calloc(tdb->header.hash_size+1, sizeof(u32));
1645 if (tdb->transaction->hash_heads == NULL) {
1646 tdb->ecode = TDB_ERR_OOM;
1647 goto fail;
1648 }
1649 if (tdb->methods->tdb_read(tdb, FREELIST_TOP, tdb->transaction->hash_heads,
1650 TDB_HASHTABLE_SIZE(tdb), 0) != 0) {
1651 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_start: failed to read hash heads\n"));
1652 tdb->ecode = TDB_ERR_IO;
1653 goto fail;
1654 }
1655
1656 /* make sure we know about any file expansions already done by
1657 anyone else */
1658 tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);
1659 tdb->transaction->old_map_size = tdb->map_size;
1660
1661 /* finally hook the io methods, replacing them with
1662 transaction specific methods */
1663 tdb->transaction->io_methods = tdb->methods;
1664 tdb->methods = &transaction_methods;
1665
1666 /* by calling this transaction write here, we ensure that we don't grow the
1667 transaction linked list due to hash table updates */
1668 if (transaction_write(tdb, FREELIST_TOP, tdb->transaction->hash_heads,
1669 TDB_HASHTABLE_SIZE(tdb)) != 0) {
1670 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_start: failed to prime hash table\n"));
1671 tdb->ecode = TDB_ERR_IO;
1672 tdb->methods = tdb->transaction->io_methods;
1673 goto fail;
1674 }
1675
1676 return 0;
1677
1678 fail:
1679 tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 0);
1680 tdb_transaction_unlock(tdb);
1681 SAFE_FREE(tdb->transaction->hash_heads);
1682 SAFE_FREE(tdb->transaction);
1683 return -1;
1684 }
1685
1686
1687 /*
1688 cancel the current transaction
1689 */
1690 int tdb_transaction_cancel(struct tdb_context *tdb)
1691 {
1692 if (tdb->transaction == NULL) {
1693 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_cancel: no transaction\n"));
1694 return -1;
1695 }
1696
1697 if (tdb->transaction->nesting != 0) {
1698 tdb->transaction->transaction_error = 1;
1699 tdb->transaction->nesting--;
1700 return 0;
1701 }
1702
1703 tdb->map_size = tdb->transaction->old_map_size;
1704
1705 /* free all the transaction elements */
1706 while (tdb->transaction->elements) {
1707 struct tdb_transaction_el *el = tdb->transaction->elements;
1708 tdb->transaction->elements = el->next;
1709 free(el->data);
1710 free(el);
1711 }
1712
1713 /* remove any global lock created during the transaction */
1714 if (tdb->global_lock.count != 0) {
1715 tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 4*tdb->header.hash_size);
1716 tdb->global_lock.count = 0;
1717 }
1718
1719 /* remove any locks created during the transaction */
1720 if (tdb->num_locks != 0) {
1721 int i;
1722 for (i=0;i<tdb->num_lockrecs;i++) {
1723 tdb_brlock(tdb,FREELIST_TOP+4*tdb->lockrecs[i].list,
1724 F_UNLCK,F_SETLKW, 0, 1);
1725 }
1726 tdb->num_locks = 0;
1727 tdb->num_lockrecs = 0;
1728 SAFE_FREE(tdb->lockrecs);
1729 }
1730
1731 /* restore the normal io methods */
1732 tdb->methods = tdb->transaction->io_methods;
1733
1734 tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 0);
1735 tdb_transaction_unlock(tdb);
1736 SAFE_FREE(tdb->transaction->hash_heads);
1737 SAFE_FREE(tdb->transaction);
1738
1739 return 0;
1740 }
1741
1742 /*
1743 sync to disk
1744 */
1745 static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t length)
1746 {
1747 if (fsync(tdb->fd) != 0) {
1748 tdb->ecode = TDB_ERR_IO;
1749 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: fsync failed\n"));
1750 return -1;
1751 }
1752 #ifdef MS_SYNC
1753 if (tdb->map_ptr) {
1754 tdb_off_t moffset = offset & ~(tdb->page_size-1);
1755 if (msync(moffset + (char *)tdb->map_ptr,
1756 length + (offset - moffset), MS_SYNC) != 0) {
1757 tdb->ecode = TDB_ERR_IO;
1758 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: msync failed - %s\n",
1759 strerror(errno)));
1760 return -1;
1761 }
1762 }
1763 #endif
1764 return 0;
1765 }
1766
1767
1768 /*
1769 work out how much space the linearised recovery data will consume
1770 */
1771 static tdb_len_t tdb_recovery_size(struct tdb_context *tdb)
1772 {
1773 struct tdb_transaction_el *el;
1774 tdb_len_t recovery_size = 0;
1775
1776 recovery_size = sizeof(u32);
1777 for (el=tdb->transaction->elements;el;el=el->next) {
1778 if (el->offset >= tdb->transaction->old_map_size) {
1779 continue;
1780 }
1781 recovery_size += 2*sizeof(tdb_off_t) + el->length;
1782 }
1783
1784 return recovery_size;
1785 }
1786
1787 /*
1788 allocate the recovery area, or use an existing recovery area if it is
1789 large enough
1790 */
1791 static int tdb_recovery_allocate(struct tdb_context *tdb,
1792 tdb_len_t *recovery_size,
1793 tdb_off_t *recovery_offset,
1794 tdb_len_t *recovery_max_size)
1795 {
1796 struct list_struct rec;
1797 const struct tdb_methods *methods = tdb->transaction->io_methods;
1798 tdb_off_t recovery_head;
1799
1800 if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
1801 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to read recovery head\n"));
1802 return -1;
1803 }
1804
1805 rec.rec_len = 0;
1806
1807 if (recovery_head != 0 &&
1808 methods->tdb_read(tdb, recovery_head, &rec, sizeof(rec), DOCONV()) == -1) {
1809 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to read recovery record\n"));
1810 return -1;
1811 }
1812
1813 *recovery_size = tdb_recovery_size(tdb);
1814
1815 if (recovery_head != 0 && *recovery_size <= rec.rec_len) {
1816 /* it fits in the existing area */
1817 *recovery_max_size = rec.rec_len;
1818 *recovery_offset = recovery_head;
1819 return 0;
1820 }
1821
1822 /* we need to free up the old recovery area, then allocate a
1823 new one at the end of the file. Note that we cannot use
1824 tdb_allocate() to allocate the new one as that might return
1825 us an area that is being currently used (as of the start of
1826 the transaction) */
1827 if (recovery_head != 0) {
1828 if (tdb_free(tdb, recovery_head, &rec) == -1) {
1829 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to free previous recovery area\n"));
1830 return -1;
1831 }
1832 }
1833
1834 /* the tdb_free() call might have increased the recovery size */
1835 *recovery_size = tdb_recovery_size(tdb);
1836
1837 /* round up to a multiple of page size */
1838 *recovery_max_size = TDB_ALIGN(sizeof(rec) + *recovery_size, tdb->page_size) - sizeof(rec);
1839 *recovery_offset = tdb->map_size;
1840 recovery_head = *recovery_offset;
1841
1842 if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size,
1843 (tdb->map_size - tdb->transaction->old_map_size) +
1844 sizeof(rec) + *recovery_max_size) == -1) {
1845 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to create recovery area\n"));
1846 return -1;
1847 }
1848
1849 /* remap the file (if using mmap) */
1850 methods->tdb_oob(tdb, tdb->map_size + 1, 1);
1851
1852 /* we have to reset the old map size so that we don't try to expand the file
1853 again in the transaction commit, which would destroy the recovery area */
1854 tdb->transaction->old_map_size = tdb->map_size;
1855
1856 /* write the recovery header offset and sync - we can sync without a race here
1857 as the magic ptr in the recovery record has not been set */
1858 CONVERT(recovery_head);
1859 if (methods->tdb_write(tdb, TDB_RECOVERY_HEAD,
1860 &recovery_head, sizeof(tdb_off_t)) == -1) {
1861 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to write recovery head\n"));
1862 return -1;
1863 }
1864
1865 return 0;
1866 }
1867
1868
1869 /*
1870 setup the recovery data that will be used on a crash during commit
1871 */
1872 static int transaction_setup_recovery(struct tdb_context *tdb,
1873 tdb_off_t *magic_offset)
1874 {
1875 struct tdb_transaction_el *el;
1876 tdb_len_t recovery_size;
1877 unsigned char *data, *p;
1878 const struct tdb_methods *methods = tdb->transaction->io_methods;
1879 struct list_struct *rec;
1880 tdb_off_t recovery_offset, recovery_max_size;
1881 tdb_off_t old_map_size = tdb->transaction->old_map_size;
1882 u32 magic, tailer;
1883
1884 /*
1885 check that the recovery area has enough space
1886 */
1887 if (tdb_recovery_allocate(tdb, &recovery_size,
1888 &recovery_offset, &recovery_max_size) == -1) {
1889 return -1;
1890 }
1891
1892 data = (unsigned char *)malloc(recovery_size + sizeof(*rec));
1893 if (data == NULL) {
1894 tdb->ecode = TDB_ERR_OOM;
1895 return -1;
1896 }
1897
1898 rec = (struct list_struct *)data;
1899 memset(rec, 0, sizeof(*rec));
1900
1901 rec->magic = 0;
1902 rec->data_len = recovery_size;
1903 rec->rec_len = recovery_max_size;
1904 rec->key_len = old_map_size;
1905 CONVERT(rec);
1906
1907 /* build the recovery data into a single blob to allow us to do a single
1908 large write, which should be more efficient */
1909 p = data + sizeof(*rec);
1910 for (el=tdb->transaction->elements;el;el=el->next) {
1911 if (el->offset >= old_map_size) {
1912 continue;
1913 }
1914 if (el->offset + el->length > tdb->transaction->old_map_size) {
1915 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: transaction data over new region boundary\n"));
1916 free(data);
1917 tdb->ecode = TDB_ERR_CORRUPT;
1918 return -1;
1919 }
1920 memcpy(p, &el->offset, 4);
1921 memcpy(p+4, &el->length, 4);
1922 if (DOCONV()) {
1923 tdb_convert(p, 8);
1924 }
1925 /* the recovery area contains the old data, not the
1926 new data, so we have to call the original tdb_read
1927 method to get it */
1928 if (methods->tdb_read(tdb, el->offset, p + 8, el->length, 0) != 0) {
1929 free(data);
1930 tdb->ecode = TDB_ERR_IO;
1931 return -1;
1932 }
1933 p += 8 + el->length;
1934 }
1935
1936 /* and the tailer */
1937 tailer = sizeof(*rec) + recovery_max_size;
1938 memcpy(p, &tailer, 4);
1939 CONVERT(p);
1940
1941 /* write the recovery data to the recovery area */
1942 if (methods->tdb_write(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) {
1943 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery data\n"));
1944 free(data);
1945 tdb->ecode = TDB_ERR_IO;
1946 return -1;
1947 }
1948
1949 /* as we don't have ordered writes, we have to sync the recovery
1950 data before we update the magic to indicate that the recovery
1951 data is present */
1952 if (transaction_sync(tdb, recovery_offset, sizeof(*rec) + recovery_size) == -1) {
1953 free(data);
1954 return -1;
1955 }
1956
1957 free(data);
1958
1959 magic = TDB_RECOVERY_MAGIC;
1960 CONVERT(magic);
1961
1962 *magic_offset = recovery_offset + offsetof(struct list_struct, magic);
1963
1964 if (methods->tdb_write(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
1965 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery magic\n"));
1966 tdb->ecode = TDB_ERR_IO;
1967 return -1;
1968 }
1969
1970 /* ensure the recovery magic marker is on disk */
1971 if (transaction_sync(tdb, *magic_offset, sizeof(magic)) == -1) {
1972 return -1;
1973 }
1974
1975 return 0;
1976 }
1977
1978 /*
1979 commit the current transaction
1980 */
1981 int tdb_transaction_commit(struct tdb_context *tdb)
1982 {
1983 const struct tdb_methods *methods;
1984 tdb_off_t magic_offset = 0;
1985 u32 zero = 0;
1986
1987 if (tdb->transaction == NULL) {
1988 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: no transaction\n"));
1989 return -1;
1990 }
1991
1992 if (tdb->transaction->transaction_error) {
1993 tdb->ecode = TDB_ERR_IO;
1994 tdb_transaction_cancel(tdb);
1995 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: transaction error pending\n"));
1996 return -1;
1997 }
1998
1999 if (tdb->transaction->nesting != 0) {
2000 tdb->transaction->nesting--;
2001 return 0;
2002 }
2003
2004 /* check for a null transaction */
2005 if (tdb->transaction->elements == NULL) {
2006 tdb_transaction_cancel(tdb);
2007 return 0;
2008 }
2009
2010 methods = tdb->transaction->io_methods;
2011
2012 /* if there are any locks pending then the caller has not
2013 nested their locks properly, so fail the transaction */
2014 if (tdb->num_locks || tdb->global_lock.count) {
2015 tdb->ecode = TDB_ERR_LOCK;
2016 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: locks pending on commit\n"));
2017 tdb_transaction_cancel(tdb);
2018 return -1;
2019 }
2020
2021 /* upgrade the main transaction lock region to a write lock */
2022 if (tdb_brlock_upgrade(tdb, FREELIST_TOP, 0) == -1) {
2023 		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: failed to upgrade hash locks\n"));
2024 tdb->ecode = TDB_ERR_LOCK;
2025 tdb_transaction_cancel(tdb);
2026 return -1;
2027 }
2028
2029 /* get the global lock - this prevents new users attaching to the database
2030 during the commit */
2031 if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0, 1) == -1) {
2032 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: failed to get global lock\n"));
2033 tdb->ecode = TDB_ERR_LOCK;
2034 tdb_transaction_cancel(tdb);
2035 return -1;
2036 }
2037
2038 if (!(tdb->flags & TDB_NOSYNC)) {
2039 /* write the recovery data to the end of the file */
2040 if (transaction_setup_recovery(tdb, &magic_offset) == -1) {
2041 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: failed to setup recovery data\n"));
2042 tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
2043 tdb_transaction_cancel(tdb);
2044 return -1;
2045 }
2046 }
2047
2048 /* expand the file to the new size if needed */
2049 if (tdb->map_size != tdb->transaction->old_map_size) {
2050 if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size,
2051 tdb->map_size -
2052 tdb->transaction->old_map_size) == -1) {
2053 tdb->ecode = TDB_ERR_IO;
2054 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: expansion failed\n"));
2055 tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
2056 tdb_transaction_cancel(tdb);
2057 return -1;
2058 }
2059 tdb->map_size = tdb->transaction->old_map_size;
2060 methods->tdb_oob(tdb, tdb->map_size + 1, 1);
2061 }
2062
2063 /* perform all the writes */
2064 while (tdb->transaction->elements) {
2065 struct tdb_transaction_el *el = tdb->transaction->elements;
2066
2067 if (methods->tdb_write(tdb, el->offset, el->data, el->length) == -1) {
2068 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed during commit\n"));
2069
2070 /* we've overwritten part of the data and
2071 possibly expanded the file, so we need to
2072 run the crash recovery code */
2073 tdb->methods = methods;
2074 tdb_transaction_recover(tdb);
2075
2076 tdb_transaction_cancel(tdb);
2077 tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
2078
2079 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed\n"));
2080 return -1;
2081 }
2082 tdb->transaction->elements = el->next;
2083 free(el->data);
2084 free(el);
2085 }
2086
2087 if (!(tdb->flags & TDB_NOSYNC)) {
2088 /* ensure the new data is on disk */
2089 if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
2090 return -1;
2091 }
2092
2093 /* remove the recovery marker */
2094 if (methods->tdb_write(tdb, magic_offset, &zero, 4) == -1) {
2095 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: failed to remove recovery magic\n"));
2096 return -1;
2097 }
2098
2099 /* ensure the recovery marker has been removed on disk */
2100 if (transaction_sync(tdb, magic_offset, 4) == -1) {
2101 return -1;
2102 }
2103 }
2104
2105 tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
2106
2107 /*
2108 TODO: maybe write to some dummy hdr field, or write to magic
2109 offset without mmap, before the last sync, instead of the
2110 utime() call
2111 */
2112
2113 /* on some systems (like Linux 2.6.x) changes via mmap/msync
2114 don't change the mtime of the file, this means the file may
2115 not be backed up (as tdb rounding to block sizes means that
2116 file size changes are quite rare too). The following forces
2117 mtime changes when a transaction completes */
2118 #ifdef HAVE_UTIME
2119 utime(tdb->name, NULL);
2120 #endif
2121
2122 /* use a transaction cancel to free memory and remove the
2123 transaction locks */
2124 tdb_transaction_cancel(tdb);
2125 return 0;
2126 }
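
#ifdef TDB_EXAMPLES
/* Illustrative usage sketch, not part of the original library: the
   TDB_EXAMPLES guard and the example_* name are hypothetical, used only
   to keep the sketch out of normal builds. It shows the intended calling
   pattern for the transaction API above: group several stores between
   tdb_transaction_start() and tdb_transaction_commit(), and cancel on any
   failure so that either all writes land or none do. */
static int example_store_pair(struct tdb_context *tdb,
			      TDB_DATA k1, TDB_DATA v1,
			      TDB_DATA k2, TDB_DATA v2)
{
	if (tdb_transaction_start(tdb) == -1) {
		return -1;
	}
	if (tdb_store(tdb, k1, v1, TDB_INSERT) == -1 ||
	    tdb_store(tdb, k2, v2, TDB_INSERT) == -1) {
		tdb_transaction_cancel(tdb);
		return -1;
	}
	return tdb_transaction_commit(tdb);
}
#endif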
2127
2128
2129 /*
2130 recover from an aborted transaction. Must be called with exclusive
2131 database write access already established (including the global
2132 lock to prevent new processes attaching)
2133 */
2134 int tdb_transaction_recover(struct tdb_context *tdb)
2135 {
2136 tdb_off_t recovery_head, recovery_eof;
2137 unsigned char *data, *p;
2138 u32 zero = 0;
2139 struct list_struct rec;
2140
2141 /* find the recovery area */
2142 if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
2143 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery head\n"));
2144 tdb->ecode = TDB_ERR_IO;
2145 return -1;
2146 }
2147
2148 if (recovery_head == 0) {
2149 /* we have never allocated a recovery record */
2150 return 0;
2151 }
2152
2153 /* read the recovery record */
2154 if (tdb->methods->tdb_read(tdb, recovery_head, &rec,
2155 sizeof(rec), DOCONV()) == -1) {
2156 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery record\n"));
2157 tdb->ecode = TDB_ERR_IO;
2158 return -1;
2159 }
2160
2161 if (rec.magic != TDB_RECOVERY_MAGIC) {
2162 /* there is no valid recovery data */
2163 return 0;
2164 }
2165
2166 if (tdb->read_only) {
2167 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: attempt to recover read only database\n"));
2168 tdb->ecode = TDB_ERR_CORRUPT;
2169 return -1;
2170 }
2171
2172 recovery_eof = rec.key_len;
2173
2174 data = (unsigned char *)malloc(rec.data_len);
2175 if (data == NULL) {
2176 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to allocate recovery data\n"));
2177 tdb->ecode = TDB_ERR_OOM;
2178 return -1;
2179 }
2180
2181 /* read the full recovery data */
2182 if (tdb->methods->tdb_read(tdb, recovery_head + sizeof(rec), data,
2183 rec.data_len, 0) == -1) {
2184 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery data\n"));
2185 tdb->ecode = TDB_ERR_IO;
2186 return -1;
2187 }
2188
2189 /* recover the file data */
2190 p = data;
2191 while (p+8 < data + rec.data_len) {
2192 u32 ofs, len;
2193 if (DOCONV()) {
2194 tdb_convert(p, 8);
2195 }
2196 memcpy(&ofs, p, 4);
2197 memcpy(&len, p+4, 4);
2198
2199 if (tdb->methods->tdb_write(tdb, ofs, p+8, len) == -1) {
2200 free(data);
2201 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to recover %d bytes at offset %d\n", len, ofs));
2202 tdb->ecode = TDB_ERR_IO;
2203 return -1;
2204 }
2205 p += 8 + len;
2206 }
2207
2208 free(data);
2209
2210 if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
2211 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync recovery\n"));
2212 tdb->ecode = TDB_ERR_IO;
2213 return -1;
2214 }
2215
2216 /* if the recovery area is after the recovered eof then remove it */
2217 if (recovery_eof <= recovery_head) {
2218 if (tdb_ofs_write(tdb, TDB_RECOVERY_HEAD, &zero) == -1) {
2219 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery head\n"));
2220 tdb->ecode = TDB_ERR_IO;
2221 return -1;
2222 }
2223 }
2224
2225 /* remove the recovery magic */
2226 if (tdb_ofs_write(tdb, recovery_head + offsetof(struct list_struct, magic),
2227 &zero) == -1) {
2228 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery magic\n"));
2229 tdb->ecode = TDB_ERR_IO;
2230 return -1;
2231 }
2232
2233 /* reduce the file size to the old size */
2234 tdb_munmap(tdb);
2235 if (ftruncate(tdb->fd, recovery_eof) != 0) {
2236 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to reduce to recovery size\n"));
2237 tdb->ecode = TDB_ERR_IO;
2238 return -1;
2239 }
2240 tdb->map_size = recovery_eof;
2241 tdb_mmap(tdb);
2242
2243 if (transaction_sync(tdb, 0, recovery_eof) == -1) {
2244 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync2 recovery\n"));
2245 tdb->ecode = TDB_ERR_IO;
2246 return -1;
2247 }
2248
2249 TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_recover: recovered %d byte database\n",
2250 recovery_eof));
2251
2252 /* all done */
2253 return 0;
2254 }
2255
2256 /* file: freelist.c */
2257
2258 /* read a freelist record and check for simple errors */
2259 static int tdb_rec_free_read(struct tdb_context *tdb, tdb_off_t off, struct list_struct *rec)
2260 {
2261 if (tdb->methods->tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
2262 return -1;
2263
2264 if (rec->magic == TDB_MAGIC) {
2265 		/* this happens when an app is shut down while deleting a record - we should
2266 		   not completely fail when this happens */
2267 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_rec_free_read non-free magic 0x%x at offset=%d - fixing\n",
2268 rec->magic, off));
2269 rec->magic = TDB_FREE_MAGIC;
2270 if (tdb->methods->tdb_write(tdb, off, rec, sizeof(*rec)) == -1)
2271 return -1;
2272 }
2273
2274 if (rec->magic != TDB_FREE_MAGIC) {
2275 /* Ensure ecode is set for log fn. */
2276 tdb->ecode = TDB_ERR_CORRUPT;
2277 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_rec_free_read bad magic 0x%x at offset=%d\n",
2278 rec->magic, off));
2279 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
2280 }
2281 if (tdb->methods->tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
2282 return -1;
2283 return 0;
2284 }
2285
2286
2287
2288 /* Remove an element from the freelist. Must have alloc lock. */
2289 static int remove_from_freelist(struct tdb_context *tdb, tdb_off_t off, tdb_off_t next)
2290 {
2291 tdb_off_t last_ptr, i;
2292
2293 /* read in the freelist top */
2294 last_ptr = FREELIST_TOP;
2295 while (tdb_ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
2296 if (i == off) {
2297 /* We've found it! */
2298 return tdb_ofs_write(tdb, last_ptr, &next);
2299 }
2300 /* Follow chain (next offset is at start of record) */
2301 last_ptr = i;
2302 }
2303 TDB_LOG((tdb, TDB_DEBUG_FATAL,"remove_from_freelist: not on list at off=%d\n", off));
2304 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
2305 }
2306
2307
2308 /* update a record tailer (must hold allocation lock) */
2309 static int update_tailer(struct tdb_context *tdb, tdb_off_t offset,
2310 const struct list_struct *rec)
2311 {
2312 tdb_off_t totalsize;
2313
2314 /* Offset of tailer from record header */
2315 totalsize = sizeof(*rec) + rec->rec_len;
2316 return tdb_ofs_write(tdb, offset + totalsize - sizeof(tdb_off_t),
2317 &totalsize);
2318 }
2319
2320 /* Add an element into the freelist. Merge adjacent records if
2321    necessary. */
2322 int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec)
2323 {
2324 tdb_off_t right, left;
2325
2326 /* Allocation and tailer lock */
2327 if (tdb_lock(tdb, -1, F_WRLCK) != 0)
2328 return -1;
2329
2330 /* set an initial tailer, so if we fail we don't leave a bogus record */
2331 if (update_tailer(tdb, offset, rec) != 0) {
2332 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_tailer failed!\n"));
2333 goto fail;
2334 }
2335
2336 /* Look right first (I'm an Australian, dammit) */
2337 right = offset + sizeof(*rec) + rec->rec_len;
2338 if (right + sizeof(*rec) <= tdb->map_size) {
2339 struct list_struct r;
2340
2341 if (tdb->methods->tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
2342 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: right read failed at %u\n", right));
2343 goto left;
2344 }
2345
2346 /* If it's free, expand to include it. */
2347 if (r.magic == TDB_FREE_MAGIC) {
2348 if (remove_from_freelist(tdb, right, r.next) == -1) {
2349 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: right free failed at %u\n", right));
2350 goto left;
2351 }
2352 rec->rec_len += sizeof(r) + r.rec_len;
2353 }
2354 }
2355
2356 left:
2357 /* Look left */
2358 left = offset - sizeof(tdb_off_t);
2359 if (left > TDB_DATA_START(tdb->header.hash_size)) {
2360 struct list_struct l;
2361 tdb_off_t leftsize;
2362
2363 /* Read in tailer and jump back to header */
2364 if (tdb_ofs_read(tdb, left, &leftsize) == -1) {
2365 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: left offset read failed at %u\n", left));
2366 goto update;
2367 }
2368
2369 /* it could be uninitialised data */
2370 if (leftsize == 0 || leftsize == TDB_PAD_U32) {
2371 goto update;
2372 }
2373
2374 left = offset - leftsize;
2375
2376 /* Now read in record */
2377 if (tdb->methods->tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
2378 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
2379 goto update;
2380 }
2381
2382 /* If it's free, expand to include it. */
2383 if (l.magic == TDB_FREE_MAGIC) {
2384 if (remove_from_freelist(tdb, left, l.next) == -1) {
2385 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: left free failed at %u\n", left));
2386 goto update;
2387 } else {
2388 offset = left;
2389 rec->rec_len += leftsize;
2390 }
2391 }
2392 }
2393
2394 update:
2395 if (update_tailer(tdb, offset, rec) == -1) {
2396 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_tailer failed at %u\n", offset));
2397 goto fail;
2398 }
2399
2400 /* Now, prepend to free list */
2401 rec->magic = TDB_FREE_MAGIC;
2402
2403 if (tdb_ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
2404 tdb_rec_write(tdb, offset, rec) == -1 ||
2405 tdb_ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
2406 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free record write failed at offset=%d\n", offset));
2407 goto fail;
2408 }
2409
2410 /* And we're done. */
2411 tdb_unlock(tdb, -1, F_WRLCK);
2412 return 0;
2413
2414 fail:
2415 tdb_unlock(tdb, -1, F_WRLCK);
2416 return -1;
2417 }
2418
2419
2420 /*
2421 the core of tdb_allocate - called when we have decided which
2422 free list entry to use
2423 */
2424 static tdb_off_t tdb_allocate_ofs(struct tdb_context *tdb, tdb_len_t length, tdb_off_t rec_ptr,
2425 struct list_struct *rec, tdb_off_t last_ptr)
2426 {
2427 struct list_struct newrec;
2428 tdb_off_t newrec_ptr;
2429
2430 memset(&newrec, '\0', sizeof(newrec));
2431
2432 /* found it - now possibly split it up */
2433 if (rec->rec_len > length + MIN_REC_SIZE) {
2434 /* Length of left piece */
2435 length = TDB_ALIGN(length, TDB_ALIGNMENT);
2436
2437 /* Right piece to go on free list */
2438 newrec.rec_len = rec->rec_len - (sizeof(*rec) + length);
2439 newrec_ptr = rec_ptr + sizeof(*rec) + length;
2440
2441 /* And left record is shortened */
2442 rec->rec_len = length;
2443 } else {
2444 newrec_ptr = 0;
2445 }
2446
2447 /* Remove allocated record from the free list */
2448 if (tdb_ofs_write(tdb, last_ptr, &rec->next) == -1) {
2449 return 0;
2450 }
2451
2452 /* Update header: do this before we drop alloc
2453 lock, otherwise tdb_free() might try to
2454 merge with us, thinking we're free.
2455 (Thanks Jeremy Allison). */
2456 rec->magic = TDB_MAGIC;
2457 if (tdb_rec_write(tdb, rec_ptr, rec) == -1) {
2458 return 0;
2459 }
2460
2461 /* Did we create new block? */
2462 if (newrec_ptr) {
2463 /* Update allocated record tailer (we
2464 shortened it). */
2465 if (update_tailer(tdb, rec_ptr, rec) == -1) {
2466 return 0;
2467 }
2468
2469 /* Free new record */
2470 if (tdb_free(tdb, newrec_ptr, &newrec) == -1) {
2471 return 0;
2472 }
2473 }
2474
2475 /* all done - return the new record offset */
2476 return rec_ptr;
2477 }
2478
2479 /* allocate some space from the free list. The offset returned points
2480    to an unconnected list_struct within the database with room for at
2481 least length bytes of total data
2482
2483 0 is returned if the space could not be allocated
2484 */
2485 tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct list_struct *rec)
2486 {
2487 tdb_off_t rec_ptr, last_ptr, newrec_ptr;
2488 struct {
2489 tdb_off_t rec_ptr, last_ptr;
2490 tdb_len_t rec_len;
2491 } bestfit;
2492
2493 if (tdb_lock(tdb, -1, F_WRLCK) == -1)
2494 return 0;
2495
2496 /* Extra bytes required for tailer */
2497 length += sizeof(tdb_off_t);
2498
2499 again:
2500 last_ptr = FREELIST_TOP;
2501
2502 /* read in the freelist top */
2503 if (tdb_ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
2504 goto fail;
2505
2506 bestfit.rec_ptr = 0;
2507 bestfit.last_ptr = 0;
2508 bestfit.rec_len = 0;
2509
2510 /*
2511 this is a best fit allocation strategy. Originally we used
2512 a first fit strategy, but it suffered from massive fragmentation
2513 issues when faced with a slowly increasing record size.
2514 */
2515 while (rec_ptr) {
2516 if (tdb_rec_free_read(tdb, rec_ptr, rec) == -1) {
2517 goto fail;
2518 }
2519
2520 if (rec->rec_len >= length) {
2521 if (bestfit.rec_ptr == 0 ||
2522 rec->rec_len < bestfit.rec_len) {
2523 bestfit.rec_len = rec->rec_len;
2524 bestfit.rec_ptr = rec_ptr;
2525 bestfit.last_ptr = last_ptr;
2526 /* consider a fit to be good enough if
2527 we aren't wasting more than half
2528 the space */
2529 if (bestfit.rec_len < 2*length) {
2530 break;
2531 }
2532 }
2533 }
2534
2535 /* move to the next record */
2536 last_ptr = rec_ptr;
2537 rec_ptr = rec->next;
2538 }
2539
2540 if (bestfit.rec_ptr != 0) {
2541 if (tdb_rec_free_read(tdb, bestfit.rec_ptr, rec) == -1) {
2542 goto fail;
2543 }
2544
2545 newrec_ptr = tdb_allocate_ofs(tdb, length, bestfit.rec_ptr, rec, bestfit.last_ptr);
2546 tdb_unlock(tdb, -1, F_WRLCK);
2547 return newrec_ptr;
2548 }
2549
2550 /* we didn't find enough space. See if we can expand the
2551 database and if we can then try again */
2552 if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
2553 goto again;
2554 fail:
2555 tdb_unlock(tdb, -1, F_WRLCK);
2556 return 0;
2557 }
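
#ifdef TDB_EXAMPLES
/* Illustrative helper, not part of the original library: the guard macro
   and the name are hypothetical. It restates the space accounting used
   above: tdb_allocate() adds sizeof(tdb_off_t) to the caller's length for
   the tailer, and tdb_allocate_ofs() rounds the split-off piece up to
   TDB_ALIGNMENT, so the smallest record that can hold key_len + data_len
   bytes occupies this many bytes on disk (an unsplit free block may of
   course be larger). */
static tdb_len_t example_min_record_space(tdb_len_t key_len, tdb_len_t data_len)
{
	tdb_len_t payload = key_len + data_len + sizeof(tdb_off_t);
	return sizeof(struct list_struct) + TDB_ALIGN(payload, TDB_ALIGNMENT);
}
#endif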
2558
2559 /* file: freelistcheck.c */
2560
2561 /* Check the freelist is good and contains no loops.
2562 Very memory intensive - only do this as a consistency
2563 checker. Heh heh - uses an in memory tdb as the storage
2564 for the "seen" record list. For some reason this strikes
2565 me as extremely clever as I don't have to write another tree
2566 data structure implementation :-).
2567 */
2568
2569 static int seen_insert(struct tdb_context *mem_tdb, tdb_off_t rec_ptr)
2570 {
2571 TDB_DATA key, data;
2572
2573 memset(&data, '\0', sizeof(data));
2574 key.dptr = (unsigned char *)&rec_ptr;
2575 key.dsize = sizeof(rec_ptr);
2576 return tdb_store(mem_tdb, key, data, TDB_INSERT);
2577 }
2578
2579 int tdb_validate_freelist(struct tdb_context *tdb, int *pnum_entries)
2580 {
2581 struct tdb_context *mem_tdb = NULL;
2582 struct list_struct rec;
2583 tdb_off_t rec_ptr, last_ptr;
2584 int ret = -1;
2585
2586 *pnum_entries = 0;
2587
2588 mem_tdb = tdb_open("flval", tdb->header.hash_size,
2589 TDB_INTERNAL, O_RDWR, 0600);
2590 if (!mem_tdb) {
2591 return -1;
2592 }
2593
2594 if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
2595 tdb_close(mem_tdb);
2596 return 0;
2597 }
2598
2599 last_ptr = FREELIST_TOP;
2600
2601 /* Store the FREELIST_TOP record. */
2602 if (seen_insert(mem_tdb, last_ptr) == -1) {
2603 ret = TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
2604 goto fail;
2605 }
2606
2607 /* read in the freelist top */
2608 if (tdb_ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1) {
2609 goto fail;
2610 }
2611
2612 while (rec_ptr) {
2613
2614 /* If we can't store this record (we've seen it
2615 before) then the free list has a loop and must
2616 be corrupt. */
2617
2618 if (seen_insert(mem_tdb, rec_ptr)) {
2619 ret = TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
2620 goto fail;
2621 }
2622
2623 if (tdb_rec_free_read(tdb, rec_ptr, &rec) == -1) {
2624 goto fail;
2625 }
2626
2627 /* move to the next record */
2628 last_ptr = rec_ptr;
2629 rec_ptr = rec.next;
2630 *pnum_entries += 1;
2631 }
2632
2633 ret = 0;
2634
2635 fail:
2636
2637 tdb_close(mem_tdb);
2638 tdb_unlock(tdb, -1, F_WRLCK);
2639 return ret;
2640 }
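
#ifdef TDB_EXAMPLES
/* Illustrative usage sketch, not part of the original library: guard macro
   and name are hypothetical. The freelist validator above is expensive, so
   a caller would typically run it only from a standalone check tool. */
static int example_check_freelist(struct tdb_context *tdb)
{
	int entries = 0;

	if (tdb_validate_freelist(tdb, &entries) == -1) {
		printf("freelist is corrupt or looped\n");
		return -1;
	}
	printf("freelist OK: %d entries\n", entries);
	return 0;
}
#endif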
2641
2642 /* file: traverse.c */
2643
2644 /* Uses traverse lock: 0 = finish, -1 = error, other = record offset */
2645 static int tdb_next_lock(struct tdb_context *tdb, struct tdb_traverse_lock *tlock,
2646 struct list_struct *rec)
2647 {
2648 int want_next = (tlock->off != 0);
2649
2650 /* Lock each chain from the start one. */
2651 for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
2652 if (!tlock->off && tlock->hash != 0) {
2653 /* this is an optimisation for the common case where
2654 the hash chain is empty, which is particularly
2655 common for the use of tdb with ldb, where large
2656 hashes are used. In that case we spend most of our
2657 time in tdb_brlock(), locking empty hash chains.
2658
2659 To avoid this, we do an unlocked pre-check to see
2660 if the hash chain is empty before starting to look
2661 inside it. If it is empty then we can avoid that
2662 hash chain. If it isn't empty then we can't believe
2663 the value we get back, as we read it without a
2664 lock, so instead we get the lock and re-fetch the
2665 value below.
2666
2667 Notice that not doing this optimisation on the
2668 first hash chain is critical. We must guarantee
2669 that we have done at least one fcntl lock at the
2670 start of a search to guarantee that memory is
2671 coherent on SMP systems. If records are added by
2672 			   others during the search then that's OK, and we
2673 could possibly miss those with this trick, but we
2674 could miss them anyway without this trick, so the
2675 semantics don't change.
2676
2677 With a non-indexed ldb search this trick gains us a
2678 factor of around 80 in speed on a linux 2.6.x
2679 system (testing using ldbtest).
2680 */
2681 tdb->methods->next_hash_chain(tdb, &tlock->hash);
2682 if (tlock->hash == tdb->header.hash_size) {
2683 continue;
2684 }
2685 }
2686
2687 if (tdb_lock(tdb, tlock->hash, tlock->lock_rw) == -1)
2688 return -1;
2689
2690 /* No previous record? Start at top of chain. */
2691 if (!tlock->off) {
2692 if (tdb_ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
2693 &tlock->off) == -1)
2694 goto fail;
2695 } else {
2696 /* Otherwise unlock the previous record. */
2697 if (tdb_unlock_record(tdb, tlock->off) != 0)
2698 goto fail;
2699 }
2700
2701 if (want_next) {
2702 /* We have offset of old record: grab next */
2703 if (tdb_rec_read(tdb, tlock->off, rec) == -1)
2704 goto fail;
2705 tlock->off = rec->next;
2706 }
2707
2708 /* Iterate through chain */
2709 while( tlock->off) {
2710 tdb_off_t current;
2711 if (tdb_rec_read(tdb, tlock->off, rec) == -1)
2712 goto fail;
2713
2714 /* Detect infinite loops. From "Shlomi Yaakobovich" <Shlomi@exanet.com>. */
2715 if (tlock->off == rec->next) {
2716 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_next_lock: loop detected.\n"));
2717 goto fail;
2718 }
2719
2720 if (!TDB_DEAD(rec)) {
2721 /* Woohoo: we found one! */
2722 if (tdb_lock_record(tdb, tlock->off) != 0)
2723 goto fail;
2724 return tlock->off;
2725 }
2726
2727 /* Try to clean dead ones from old traverses */
2728 current = tlock->off;
2729 tlock->off = rec->next;
2730 if (!(tdb->read_only || tdb->traverse_read) &&
2731 tdb_do_delete(tdb, current, rec) != 0)
2732 goto fail;
2733 }
2734 tdb_unlock(tdb, tlock->hash, tlock->lock_rw);
2735 want_next = 0;
2736 }
2737 /* We finished iteration without finding anything */
2738 return TDB_ERRCODE(TDB_SUCCESS, 0);
2739
2740 fail:
2741 tlock->off = 0;
2742 if (tdb_unlock(tdb, tlock->hash, tlock->lock_rw) != 0)
2743 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_next_lock: On error unlock failed!\n"));
2744 return -1;
2745 }
2746
2747 /* traverse the entire database - calling fn(tdb, key, data, private_data) on each element.
2748 return -1 on error or the record count traversed
2749 if fn is NULL then it is not called
2750 a non-zero return value from fn() indicates that the traversal should stop
2751 */
2752 static int tdb_traverse_internal(struct tdb_context *tdb,
2753 tdb_traverse_func fn, void *private_data,
2754 struct tdb_traverse_lock *tl)
2755 {
2756 TDB_DATA key, dbuf;
2757 struct list_struct rec;
2758 int ret, count = 0;
2759
2760 	/* This was in the initialization, above, but the IRIX compiler
2761 * did not like it. crh
2762 */
2763 tl->next = tdb->travlocks.next;
2764
2765 /* fcntl locks don't stack: beware traverse inside traverse */
2766 tdb->travlocks.next = tl;
2767
2768 /* tdb_next_lock places locks on the record returned, and its chain */
2769 while ((ret = tdb_next_lock(tdb, tl, &rec)) > 0) {
2770 count++;
2771 /* now read the full record */
2772 key.dptr = tdb_alloc_read(tdb, tl->off + sizeof(rec),
2773 rec.key_len + rec.data_len);
2774 if (!key.dptr) {
2775 ret = -1;
2776 if (tdb_unlock(tdb, tl->hash, tl->lock_rw) != 0)
2777 goto out;
2778 if (tdb_unlock_record(tdb, tl->off) != 0)
2779 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
2780 goto out;
2781 }
2782 key.dsize = rec.key_len;
2783 dbuf.dptr = key.dptr + rec.key_len;
2784 dbuf.dsize = rec.data_len;
2785
2786 /* Drop chain lock, call out */
2787 if (tdb_unlock(tdb, tl->hash, tl->lock_rw) != 0) {
2788 ret = -1;
2789 SAFE_FREE(key.dptr);
2790 goto out;
2791 }
2792 if (fn && fn(tdb, key, dbuf, private_data)) {
2793 /* They want us to terminate traversal */
2794 ret = count;
2795 if (tdb_unlock_record(tdb, tl->off) != 0) {
2796 				TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_traverse: unlock_record failed!\n"));
2797 ret = -1;
2798 }
2799 SAFE_FREE(key.dptr);
2800 goto out;
2801 }
2802 SAFE_FREE(key.dptr);
2803 }
2804 out:
2805 tdb->travlocks.next = tl->next;
2806 if (ret < 0)
2807 return -1;
2808 else
2809 return count;
2810 }
2811
2812
2813 /*
2814   a read style traverse - temporarily marks the db read only
2815 */
2816 int tdb_traverse_read(struct tdb_context *tdb,
2817 tdb_traverse_func fn, void *private_data)
2818 {
2819 struct tdb_traverse_lock tl = { NULL, 0, 0, F_RDLCK };
2820 int ret;
2821
2822 /* we need to get a read lock on the transaction lock here to
2823 cope with the lock ordering semantics of solaris10 */
2824 if (tdb_transaction_lock(tdb, F_RDLCK)) {
2825 return -1;
2826 }
2827
2828 tdb->traverse_read++;
2829 ret = tdb_traverse_internal(tdb, fn, private_data, &tl);
2830 tdb->traverse_read--;
2831
2832 tdb_transaction_unlock(tdb);
2833
2834 return ret;
2835 }
2836
2837 /*
2838 a write style traverse - needs to get the transaction lock to
2839 prevent deadlocks
2840 */
2841 int tdb_traverse(struct tdb_context *tdb,
2842 tdb_traverse_func fn, void *private_data)
2843 {
2844 struct tdb_traverse_lock tl = { NULL, 0, 0, F_WRLCK };
2845 int ret;
2846
2847 if (tdb->read_only || tdb->traverse_read) {
2848 return tdb_traverse_read(tdb, fn, private_data);
2849 }
2850
2851 if (tdb_transaction_lock(tdb, F_WRLCK)) {
2852 return -1;
2853 }
2854
2855 ret = tdb_traverse_internal(tdb, fn, private_data, &tl);
2856
2857 tdb_transaction_unlock(tdb);
2858
2859 return ret;
2860 }
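
#ifdef TDB_EXAMPLES
/* Illustrative usage sketch, not part of the original library: guard macro
   and names are hypothetical. It counts records with a read-only traverse;
   per the comment above, returning non-zero from the callback would stop
   the traversal early, and tdb_traverse_read() itself returns the number
   of records visited (or -1 on error). */
static int example_count_fn(struct tdb_context *tdb, TDB_DATA key,
			    TDB_DATA data, void *private_data)
{
	int *count = (int *)private_data;

	(*count)++;
	return 0;	/* keep traversing */
}

static int example_count_records(struct tdb_context *tdb)
{
	int count = 0;

	if (tdb_traverse_read(tdb, example_count_fn, &count) == -1) {
		return -1;
	}
	return count;
}
#endif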
2861
2862
2863 /* find the first entry in the database and return its key */
2864 TDB_DATA tdb_firstkey(struct tdb_context *tdb)
2865 {
2866 TDB_DATA key;
2867 struct list_struct rec;
2868
2869 /* release any old lock */
2870 if (tdb_unlock_record(tdb, tdb->travlocks.off) != 0)
2871 return tdb_null;
2872 tdb->travlocks.off = tdb->travlocks.hash = 0;
2873 tdb->travlocks.lock_rw = F_RDLCK;
2874
2875 /* Grab first record: locks chain and returned record. */
2876 if (tdb_next_lock(tdb, &tdb->travlocks, &rec) <= 0)
2877 return tdb_null;
2878 /* now read the key */
2879 key.dsize = rec.key_len;
2880 key.dptr =tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),key.dsize);
2881
2882 /* Unlock the hash chain of the record we just read. */
2883 if (tdb_unlock(tdb, tdb->travlocks.hash, tdb->travlocks.lock_rw) != 0)
2884 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
2885 return key;
2886 }
2887
2888 /* find the next entry in the database, returning its key */
2889 TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA oldkey)
2890 {
2891 u32 oldhash;
2892 TDB_DATA key = tdb_null;
2893 struct list_struct rec;
2894 unsigned char *k = NULL;
2895
2896 /* Is locked key the old key? If so, traverse will be reliable. */
2897 if (tdb->travlocks.off) {
2898 if (tdb_lock(tdb,tdb->travlocks.hash,tdb->travlocks.lock_rw))
2899 return tdb_null;
2900 if (tdb_rec_read(tdb, tdb->travlocks.off, &rec) == -1
2901 || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
2902 rec.key_len))
2903 || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
2904 /* No, it wasn't: unlock it and start from scratch */
2905 if (tdb_unlock_record(tdb, tdb->travlocks.off) != 0) {
2906 SAFE_FREE(k);
2907 return tdb_null;
2908 }
2909 if (tdb_unlock(tdb, tdb->travlocks.hash, tdb->travlocks.lock_rw) != 0) {
2910 SAFE_FREE(k);
2911 return tdb_null;
2912 }
2913 tdb->travlocks.off = 0;
2914 }
2915
2916 SAFE_FREE(k);
2917 }
2918
2919 if (!tdb->travlocks.off) {
2920 /* No previous element: do normal find, and lock record */
2921 tdb->travlocks.off = tdb_find_lock_hash(tdb, oldkey, tdb->hash_fn(&oldkey), tdb->travlocks.lock_rw, &rec);
2922 if (!tdb->travlocks.off)
2923 return tdb_null;
2924 tdb->travlocks.hash = BUCKET(rec.full_hash);
2925 if (tdb_lock_record(tdb, tdb->travlocks.off) != 0) {
2926 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
2927 return tdb_null;
2928 }
2929 }
2930 oldhash = tdb->travlocks.hash;
2931
2932 /* Grab next record: locks chain and returned record,
2933 unlocks old record */
2934 if (tdb_next_lock(tdb, &tdb->travlocks, &rec) > 0) {
2935 key.dsize = rec.key_len;
2936 key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
2937 key.dsize);
2938 /* Unlock the chain of this new record */
2939 if (tdb_unlock(tdb, tdb->travlocks.hash, tdb->travlocks.lock_rw) != 0)
2940 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
2941 }
2942 /* Unlock the chain of old record */
2943 if (tdb_unlock(tdb, BUCKET(oldhash), tdb->travlocks.lock_rw) != 0)
2944 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
2945 return key;
2946 }
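
#ifdef TDB_EXAMPLES
/* Illustrative usage sketch, not part of the original library: guard macro
   and name are hypothetical. It shows the firstkey/nextkey cursor style of
   iteration. Each returned key.dptr comes from tdb_alloc_read(), so the
   caller owns it and must free it; a NULL dptr marks the end. */
static void example_list_keys(struct tdb_context *tdb)
{
	TDB_DATA key, next;

	for (key = tdb_firstkey(tdb); key.dptr != NULL; key = next) {
		printf("key of %u bytes\n", (unsigned int)key.dsize);
		next = tdb_nextkey(tdb, key);
		free(key.dptr);
	}
}
#endif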
2947
2948 /* file: dump.c */
2949
2950 static tdb_off_t tdb_dump_record(struct tdb_context *tdb, int hash,
2951 tdb_off_t offset)
2952 {
2953 struct list_struct rec;
2954 tdb_off_t tailer_ofs, tailer;
2955
2956 if (tdb->methods->tdb_read(tdb, offset, (char *)&rec,
2957 sizeof(rec), DOCONV()) == -1) {
2958 printf("ERROR: failed to read record at %u\n", offset);
2959 return 0;
2960 }
2961
2962 printf(" rec: hash=%d offset=0x%08x next=0x%08x rec_len=%d "
2963 "key_len=%d data_len=%d full_hash=0x%x magic=0x%x\n",
2964 hash, offset, rec.next, rec.rec_len, rec.key_len, rec.data_len,
2965 rec.full_hash, rec.magic);
2966
2967 tailer_ofs = offset + sizeof(rec) + rec.rec_len - sizeof(tdb_off_t);
2968
2969 if (tdb_ofs_read(tdb, tailer_ofs, &tailer) == -1) {
2970 printf("ERROR: failed to read tailer at %u\n", tailer_ofs);
2971 return rec.next;
2972 }
2973
2974 if (tailer != rec.rec_len + sizeof(rec)) {
2975 printf("ERROR: tailer does not match record! tailer=%u totalsize=%u\n",
2976 (unsigned int)tailer, (unsigned int)(rec.rec_len + sizeof(rec)));
2977 }
2978 return rec.next;
2979 }
2980
2981 static int tdb_dump_chain(struct tdb_context *tdb, int i)
2982 {
2983 tdb_off_t rec_ptr, top;
2984
2985 top = TDB_HASH_TOP(i);
2986
2987 if (tdb_lock(tdb, i, F_WRLCK) != 0)
2988 return -1;
2989
2990 if (tdb_ofs_read(tdb, top, &rec_ptr) == -1)
2991 return tdb_unlock(tdb, i, F_WRLCK);
2992
2993 if (rec_ptr)
2994 printf("hash=%d\n", i);
2995
2996 while (rec_ptr) {
2997 rec_ptr = tdb_dump_record(tdb, i, rec_ptr);
2998 }
2999
3000 return tdb_unlock(tdb, i, F_WRLCK);
3001 }
3002
3003 void tdb_dump_all(struct tdb_context *tdb)
3004 {
3005 int i;
3006 for (i=0;i<tdb->header.hash_size;i++) {
3007 tdb_dump_chain(tdb, i);
3008 }
3009 printf("freelist:\n");
3010 tdb_dump_chain(tdb, -1);
3011 }
3012
3013 int tdb_printfreelist(struct tdb_context *tdb)
3014 {
3015 int ret;
3016 long total_free = 0;
3017 tdb_off_t offset, rec_ptr;
3018 struct list_struct rec;
3019
3020 if ((ret = tdb_lock(tdb, -1, F_WRLCK)) != 0)
3021 return ret;
3022
3023 offset = FREELIST_TOP;
3024
3025 /* read in the freelist top */
3026 if (tdb_ofs_read(tdb, offset, &rec_ptr) == -1) {
3027 tdb_unlock(tdb, -1, F_WRLCK);
3028 return 0;
3029 }
3030
3031 printf("freelist top=[0x%08x]\n", rec_ptr );
3032 while (rec_ptr) {
3033 if (tdb->methods->tdb_read(tdb, rec_ptr, (char *)&rec,
3034 sizeof(rec), DOCONV()) == -1) {
3035 tdb_unlock(tdb, -1, F_WRLCK);
3036 return -1;
3037 }
3038
3039 if (rec.magic != TDB_FREE_MAGIC) {
3040 printf("bad magic 0x%08x in free list\n", rec.magic);
3041 tdb_unlock(tdb, -1, F_WRLCK);
3042 return -1;
3043 }
3044
3045 printf("entry offset=[0x%08x], rec.rec_len = [0x%08x (%d)] (end = 0x%08x)\n",
3046 rec_ptr, rec.rec_len, rec.rec_len, rec_ptr + rec.rec_len);
3047 total_free += rec.rec_len;
3048
3049 /* move to the next record */
3050 rec_ptr = rec.next;
3051 }
3052 printf("total rec_len = [0x%08x (%d)]\n", (int)total_free,
3053 (int)total_free);
3054
3055 return tdb_unlock(tdb, -1, F_WRLCK);
3056 }
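
#ifdef TDB_EXAMPLES
/* Illustrative usage sketch, not part of the original library: guard macro
   and name are hypothetical. The dump helpers above print straight to
   stdout and are meant as debugging aids, for instance from a small
   diagnostic program linked against the library. */
static void example_debug_dump(struct tdb_context *tdb)
{
	tdb_dump_all(tdb);	/* every hash chain, then the freelist chain */
	tdb_printfreelist(tdb);	/* freelist entries plus the total free space */
}
#endif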
3057
3058 /* file: tdb.c */
3059
3060 TDB_DATA tdb_null;
3061
3062 /*
3063 non-blocking increment of the tdb sequence number if the tdb has been opened using
3064 the TDB_SEQNUM flag
3065 */
3066 void tdb_increment_seqnum_nonblock(struct tdb_context *tdb)
3067 {
3068 tdb_off_t seqnum=0;
3069
3070 if (!(tdb->flags & TDB_SEQNUM)) {
3071 return;
3072 }
3073
3074 /* we ignore errors from this, as we have no sane way of
3075 dealing with them.
3076 */
3077 tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
3078 seqnum++;
3079 tdb_ofs_write(tdb, TDB_SEQNUM_OFS, &seqnum);
3080 }
3081
3082 /*
3083 increment the tdb sequence number if the tdb has been opened using
3084 the TDB_SEQNUM flag
3085 */
3086 static void tdb_increment_seqnum(struct tdb_context *tdb)
3087 {
3088 if (!(tdb->flags & TDB_SEQNUM)) {
3089 return;
3090 }
3091
3092 if (tdb_brlock(tdb, TDB_SEQNUM_OFS, F_WRLCK, F_SETLKW, 1, 1) != 0) {
3093 return;
3094 }
3095
3096 tdb_increment_seqnum_nonblock(tdb);
3097
3098 tdb_brlock(tdb, TDB_SEQNUM_OFS, F_UNLCK, F_SETLKW, 1, 1);
3099 }
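
#ifdef TDB_EXAMPLES
/* Illustrative usage sketch, not part of the original library: guard macro
   and name are hypothetical. The sequence number above is only maintained
   when the database is opened with the TDB_SEQNUM flag; watchers can then
   poll it (for instance via the tdb_get_seqnum() accessor declared in
   tdb.h - assumed here) to cheaply detect that something changed. */
static struct tdb_context *example_open_with_seqnum(const char *path)
{
	/* 0 selects the default hash size */
	return tdb_open(path, 0, TDB_SEQNUM, O_RDWR | O_CREAT, 0600);
}
#endif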
3100
3101 static int tdb_key_compare(TDB_DATA key, TDB_DATA data, void *private_data)
3102 {
3103 return memcmp(data.dptr, key.dptr, data.dsize);
3104 }
3105
3106 /* Returns 0 on fail. On success, return offset of record, and fills
3107 in rec */
3108 static tdb_off_t tdb_find(struct tdb_context *tdb, TDB_DATA key, u32 hash,
3109 struct list_struct *r)
3110 {
3111 tdb_off_t rec_ptr;
3112
3113 /* read in the hash top */
3114 if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
3115 return 0;
3116
3117 /* keep looking until we find the right record */
3118 while (rec_ptr) {
3119 if (tdb_rec_read(tdb, rec_ptr, r) == -1)
3120 return 0;
3121
3122 if (!TDB_DEAD(r) && hash==r->full_hash
3123 && key.dsize==r->key_len
3124 && tdb_parse_data(tdb, key, rec_ptr + sizeof(*r),
3125 r->key_len, tdb_key_compare,
3126 NULL) == 0) {
3127 return rec_ptr;
3128 }
3129 rec_ptr = r->next;
3130 }
3131 return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
3132 }
3133
3134 /* As tdb_find, but if you succeed, keep the lock */
3135 tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, u32 hash, int locktype,
3136 struct list_struct *rec)
3137 {
3138 u32 rec_ptr;
3139
3140 if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
3141 return 0;
3142 if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
3143 tdb_unlock(tdb, BUCKET(hash), locktype);
3144 return rec_ptr;
3145 }
3146
3147
3148 /* update an entry in place - this only works if the new data size
3149 is <= the old data size and the key exists.
3150 on failure return -1.
3151 */
3152 static int tdb_update_hash(struct tdb_context *tdb, TDB_DATA key, u32 hash, TDB_DATA dbuf)
3153 {
3154 struct list_struct rec;
3155 tdb_off_t rec_ptr;
3156
3157 /* find entry */
3158 if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
3159 return -1;
3160
3161 /* must be long enough key, data and tailer */
3162 if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off_t)) {
3163 tdb->ecode = TDB_SUCCESS; /* Not really an error */
3164 return -1;
3165 }
3166
3167 if (tdb->methods->tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
3168 dbuf.dptr, dbuf.dsize) == -1)
3169 return -1;
3170
3171 if (dbuf.dsize != rec.data_len) {
3172 /* update size */
3173 rec.data_len = dbuf.dsize;
3174 return tdb_rec_write(tdb, rec_ptr, &rec);
3175 }
3176
3177 return 0;
3178 }
3179
3180 /* find an entry in the database given a key */
3181 /* If an entry doesn't exist tdb_err will be set to
3182 * TDB_ERR_NOEXIST. If a key has no data attached
3183 * then the TDB_DATA will have zero length but
3184 * a non-zero pointer
3185 */
3186 TDB_DATA tdb_fetch(struct tdb_context *tdb, TDB_DATA key)
3187 {
3188 tdb_off_t rec_ptr;
3189 struct list_struct rec;
3190 TDB_DATA ret;
3191 u32 hash;
3192
3193 /* find which hash bucket it is in */
3194 hash = tdb->hash_fn(&key);
3195 if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
3196 return tdb_null;
3197
3198 ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
3199 rec.data_len);
3200 ret.dsize = rec.data_len;
3201 tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
3202 return ret;
3203 }
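
#ifdef TDB_EXAMPLES
/* Illustrative usage sketch, not part of the original library: guard macro
   and name are hypothetical. tdb_fetch() returns tdb_null (dptr == NULL)
   when the key is absent, and otherwise hands back a malloc'ed buffer the
   caller must free. */
static int example_fetch_int(struct tdb_context *tdb, TDB_DATA key, int *out)
{
	TDB_DATA val = tdb_fetch(tdb, key);

	if (val.dptr == NULL || val.dsize != sizeof(int)) {
		SAFE_FREE(val.dptr);
		return -1;
	}
	memcpy(out, val.dptr, sizeof(int));
	free(val.dptr);
	return 0;
}
#endif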
3204
3205 /*
3206 * Find an entry in the database and hand the record's data to a parsing
3207 * function. The parsing function is executed under the chain read lock, so it
3208 * should be fast and should not block on other syscalls.
3209 *
3210 * DONT CALL OTHER TDB CALLS FROM THE PARSER, THIS MIGHT LEAD TO SEGFAULTS.
3211 *
3212 * For mmapped tdb's that do not have a transaction open it points the parsing
3213 * function directly at the mmap area, it avoids the malloc/memcpy in this
3214 * case. If a transaction is open or no mmap is available, it has to do
3215 * malloc/read/parse/free.
3216 *
3217 * This is interesting for all readers of potentially large data structures in
3218 * the tdb records, ldb indexes being one example.
3219 */
3220
3221 int tdb_parse_record(struct tdb_context *tdb, TDB_DATA key,
3222 int (*parser)(TDB_DATA key, TDB_DATA data,
3223 void *private_data),
3224 void *private_data)
3225 {
3226 tdb_off_t rec_ptr;
3227 struct list_struct rec;
3228 int ret;
3229 u32 hash;
3230
3231 /* find which hash bucket it is in */
3232 hash = tdb->hash_fn(&key);
3233
3234 if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) {
3235 return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
3236 }
3237
3238 ret = tdb_parse_data(tdb, key, rec_ptr + sizeof(rec) + rec.key_len,
3239 rec.data_len, parser, private_data);
3240
3241 tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
3242
3243 return ret;
3244 }
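
#ifdef TDB_EXAMPLES
/* Illustrative usage sketch, not part of the original library: guard macro
   and names are hypothetical. The parser callback runs under the chain
   read lock, so as the comment above warns it must be quick and must not
   call back into tdb; here it just copies out the stored data size. */
static int example_size_parser(TDB_DATA key, TDB_DATA data, void *private_data)
{
	size_t *size = (size_t *)private_data;

	*size = data.dsize;
	return 0;
}

static int example_record_size(struct tdb_context *tdb, TDB_DATA key, size_t *size)
{
	return tdb_parse_record(tdb, key, example_size_parser, size);
}
#endif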
3245
3246 /* check if an entry in the database exists
3247
3248 note that 1 is returned if the key is found and 0 is returned if not found
3249 this doesn't match the conventions in the rest of this module, but is
3250 compatible with gdbm
3251 */
3252 static int tdb_exists_hash(struct tdb_context *tdb, TDB_DATA key, u32 hash)
3253 {
3254 struct list_struct rec;
3255
3256 if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
3257 return 0;
3258 tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
3259 return 1;
3260 }
3261
3262 int tdb_exists(struct tdb_context *tdb, TDB_DATA key)
3263 {
3264 u32 hash = tdb->hash_fn(&key);
3265 return tdb_exists_hash(tdb, key, hash);
3266 }
3267
3268 /* actually delete an entry in the database given the offset */
3269 int tdb_do_delete(struct tdb_context *tdb, tdb_off_t rec_ptr, struct list_struct *rec)
3270 {
3271 tdb_off_t last_ptr, i;
3272 struct list_struct lastrec;
3273
3274 if (tdb->read_only || tdb->traverse_read) return -1;
3275
3276 if (tdb_write_lock_record(tdb, rec_ptr) == -1) {
3277 /* Someone traversing here: mark it as dead */
3278 rec->magic = TDB_DEAD_MAGIC;
3279 return tdb_rec_write(tdb, rec_ptr, rec);
3280 }
3281 if (tdb_write_unlock_record(tdb, rec_ptr) != 0)
3282 return -1;
3283
3284 /* find previous record in hash chain */
3285 if (tdb_ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
3286 return -1;
3287 for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
3288 if (tdb_rec_read(tdb, i, &lastrec) == -1)
3289 return -1;
3290
3291 /* unlink it: next ptr is at start of record. */
3292 if (last_ptr == 0)
3293 last_ptr = TDB_HASH_TOP(rec->full_hash);
3294 if (tdb_ofs_write(tdb, last_ptr, &rec->next) == -1)
3295 return -1;
3296
3297 /* recover the space */
3298 if (tdb_free(tdb, rec_ptr, rec) == -1)
3299 return -1;
3300 return 0;
3301 }
3302
3303 static int tdb_count_dead(struct tdb_context *tdb, u32 hash)
3304 {
3305 int res = 0;
3306 tdb_off_t rec_ptr;
3307 struct list_struct rec;
3308
3309 /* read in the hash top */
3310 if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
3311 return 0;
3312
3313 while (rec_ptr) {
3314 if (tdb_rec_read(tdb, rec_ptr, &rec) == -1)
3315 return 0;
3316
3317 if (rec.magic == TDB_DEAD_MAGIC) {
3318 res += 1;
3319 }
3320 rec_ptr = rec.next;
3321 }
3322 return res;
3323 }
3324
3325 /*
3326 * Purge all DEAD records from a hash chain
3327 */
3328 static int tdb_purge_dead(struct tdb_context *tdb, u32 hash)
3329 {
3330 int res = -1;
3331 struct list_struct rec;
3332 tdb_off_t rec_ptr;
3333
3334 if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
3335 return -1;
3336 }
3337
3338 /* read in the hash top */
3339 if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
3340 goto fail;
3341
3342 while (rec_ptr) {
3343 tdb_off_t next;
3344
3345 if (tdb_rec_read(tdb, rec_ptr, &rec) == -1) {
3346 goto fail;
3347 }
3348
3349 next = rec.next;
3350
3351 if (rec.magic == TDB_DEAD_MAGIC
3352 && tdb_do_delete(tdb, rec_ptr, &rec) == -1) {
3353 goto fail;
3354 }
3355 rec_ptr = next;
3356 }
3357 res = 0;
3358 fail:
3359 tdb_unlock(tdb, -1, F_WRLCK);
3360 return res;
3361 }
3362
3363 /* delete an entry in the database given a key */
3364 static int tdb_delete_hash(struct tdb_context *tdb, TDB_DATA key, u32 hash)
3365 {
3366 tdb_off_t rec_ptr;
3367 struct list_struct rec;
3368 int ret;
3369
3370 if (tdb->max_dead_records != 0) {
3371
3372 /*
3373 * Allow for some dead records per hash chain, mainly for
3374 * tdb's with a very high create/delete rate like locking.tdb.
3375 */
3376
3377 if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
3378 return -1;
3379
3380 if (tdb_count_dead(tdb, hash) >= tdb->max_dead_records) {
3381 /*
3382 * Don't let the per-chain freelist grow too large,
3383 * delete all existing dead records
3384 */
3385 tdb_purge_dead(tdb, hash);
3386 }
3387
3388 if (!(rec_ptr = tdb_find(tdb, key, hash, &rec))) {
3389 tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
3390 return -1;
3391 }
3392
3393 /*
3394 * Just mark the record as dead.
3395 */
3396 rec.magic = TDB_DEAD_MAGIC;
3397 ret = tdb_rec_write(tdb, rec_ptr, &rec);
3398 }
3399 else {
3400 if (!(rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK,
3401 &rec)))
3402 return -1;
3403
3404 ret = tdb_do_delete(tdb, rec_ptr, &rec);
3405 }
3406
3407 if (ret == 0) {
3408 tdb_increment_seqnum(tdb);
3409 }
3410
3411 if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
3412 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_delete: WARNING tdb_unlock failed!\n"));
3413 return ret;
3414 }
3415
3416 int tdb_delete(struct tdb_context *tdb, TDB_DATA key)
3417 {
3418 u32 hash = tdb->hash_fn(&key);
3419 return tdb_delete_hash(tdb, key, hash);
3420 }
3421
3422 /*
3423 * See if we have a dead record around with enough space
3424 */
3425 static tdb_off_t tdb_find_dead(struct tdb_context *tdb, u32 hash,
3426 struct list_struct *r, tdb_len_t length)
3427 {
3428 tdb_off_t rec_ptr;
3429
3430 /* read in the hash top */
3431 if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
3432 return 0;
3433
3434 /* keep looking until we find the right record */
3435 while (rec_ptr) {
3436 if (tdb_rec_read(tdb, rec_ptr, r) == -1)
3437 return 0;
3438
3439 if (TDB_DEAD(r) && r->rec_len >= length) {
3440 /*
3441 * First fit for simple coding, TODO: change to best
3442 * fit
3443 */
3444 return rec_ptr;
3445 }
3446 rec_ptr = r->next;
3447 }
3448 return 0;
3449 }
3450
3451 /* store an element in the database, replacing any existing element
3452 with the same key
3453
3454 return 0 on success, -1 on failure
3455 */
3456 int tdb_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
3457 {
3458 struct list_struct rec;
3459 u32 hash;
3460 tdb_off_t rec_ptr;
3461 char *p = NULL;
3462 int ret = -1;
3463
3464 if (tdb->read_only || tdb->traverse_read) {
3465 tdb->ecode = TDB_ERR_RDONLY;
3466 return -1;
3467 }
3468
3469 /* find which hash bucket it is in */
3470 hash = tdb->hash_fn(&key);
3471 if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
3472 return -1;
3473
3474 /* check for it existing, on insert. */
3475 if (flag == TDB_INSERT) {
3476 if (tdb_exists_hash(tdb, key, hash)) {
3477 tdb->ecode = TDB_ERR_EXISTS;
3478 goto fail;
3479 }
3480 } else {
3481 /* first try in-place update, on modify or replace. */
3482 if (tdb_update_hash(tdb, key, hash, dbuf) == 0) {
3483 goto done;
3484 }
3485 if (tdb->ecode == TDB_ERR_NOEXIST &&
3486 flag == TDB_MODIFY) {
3487 /* if the record doesn't exist and we are in TDB_MODIFY mode then
3488 we should fail the store */
3489 goto fail;
3490 }
3491 }
3492 /* reset the error code potentially set by the tdb_update() */
3493 tdb->ecode = TDB_SUCCESS;
3494
3495 /* delete any existing record - if it doesn't exist we don't
3496 care. Doing this first reduces fragmentation, and avoids
3497 coalescing with `allocated' block before it's updated. */
3498 if (flag != TDB_INSERT)
3499 tdb_delete_hash(tdb, key, hash);
3500
3501 /* Copy key+value *before* allocating free space in case malloc
3502 fails and we are left with a dead spot in the tdb. */
3503
3504 if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
3505 tdb->ecode = TDB_ERR_OOM;
3506 goto fail;
3507 }
3508
3509 memcpy(p, key.dptr, key.dsize);
3510 if (dbuf.dsize)
3511 memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
3512
3513 if (tdb->max_dead_records != 0) {
3514 /*
3515 * Allow for some dead records per hash chain, look if we can
3516 * find one that can hold the new record. We need enough space
3517 * for key, data and tailer. If we find one, we don't have to
3518 * consult the central freelist.
3519 */
3520 rec_ptr = tdb_find_dead(
3521 tdb, hash, &rec,
3522 key.dsize + dbuf.dsize + sizeof(tdb_off_t));
3523
3524 if (rec_ptr != 0) {
3525 rec.key_len = key.dsize;
3526 rec.data_len = dbuf.dsize;
3527 rec.full_hash = hash;
3528 rec.magic = TDB_MAGIC;
3529 if (tdb_rec_write(tdb, rec_ptr, &rec) == -1
3530 || tdb->methods->tdb_write(
3531 tdb, rec_ptr + sizeof(rec),
3532 p, key.dsize + dbuf.dsize) == -1) {
3533 goto fail;
3534 }
3535 goto done;
3536 }
3537 }
3538
3539 /*
3540 * We have to allocate some space from the freelist, so this means we
3541 * have to lock it. Use the chance to purge all the DEAD records from
3542 * the hash chain under the freelist lock.
3543 */
3544
3545 if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
3546 goto fail;
3547 }
3548
3549 if ((tdb->max_dead_records != 0)
3550 && (tdb_purge_dead(tdb, hash) == -1)) {
3551 tdb_unlock(tdb, -1, F_WRLCK);
3552 goto fail;
3553 }
3554
3555 /* we have to allocate some space */
3556 rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec);
3557
3558 tdb_unlock(tdb, -1, F_WRLCK);
3559
3560 if (rec_ptr == 0) {
3561 goto fail;
3562 }
3563
3564 /* Read hash top into next ptr */
3565 if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
3566 goto fail;
3567
3568 rec.key_len = key.dsize;
3569 rec.data_len = dbuf.dsize;
3570 rec.full_hash = hash;
3571 rec.magic = TDB_MAGIC;
3572
3573 /* write out and point the top of the hash chain at it */
3574 if (tdb_rec_write(tdb, rec_ptr, &rec) == -1
3575 || tdb->methods->tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
3576 || tdb_ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
3577 /* Need to tdb_unallocate() here */
3578 goto fail;
3579 }
3580
3581 done:
3582 ret = 0;
3583 fail:
3584 if (ret == 0) {
3585 tdb_increment_seqnum(tdb);
3586 }
3587
3588 SAFE_FREE(p);
3589 tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
3590 return ret;
3591 }
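
/* Illustrative sketch (not part of the library build): how the three store
   flags behave from a caller's point of view, assuming the usual TDB_DATA and
   flag declarations in tdb.h. TDB_INSERT fails with TDB_ERR_EXISTS when the
   key is already present, TDB_MODIFY fails when it is absent, and TDB_REPLACE
   works either way. The function and variable names below are hypothetical. */
#if 0
static int store_example(struct tdb_context *tdb)
{
	TDB_DATA key, val;

	key.dptr = (unsigned char *)"example-key";
	key.dsize = strlen("example-key");
	val.dptr = (unsigned char *)"value";
	val.dsize = strlen("value");

	/* Fails if "example-key" already exists in the database. */
	if (tdb_store(tdb, key, val, TDB_INSERT) == -1)
		return -1;

	/* Succeeds whether or not the key exists. */
	if (tdb_store(tdb, key, val, TDB_REPLACE) == -1)
		return -1;

	/* Fails if the key does not exist. */
	return tdb_store(tdb, key, val, TDB_MODIFY);
}
#endif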
3592
3593
3594 /* Append to an entry. Create the entry if it does not exist. */
tdb_append(struct tdb_context * tdb,TDB_DATA key,TDB_DATA new_dbuf)3595 int tdb_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf)
3596 {
3597 u32 hash;
3598 TDB_DATA dbuf;
3599 int ret = -1;
3600
3601 /* find which hash bucket it is in */
3602 hash = tdb->hash_fn(&key);
3603 if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
3604 return -1;
3605
3606 dbuf = tdb_fetch(tdb, key);
3607
3608 if (dbuf.dptr == NULL) {
3609 dbuf.dptr = (unsigned char *)malloc(new_dbuf.dsize);
3610 } else {
3611 unsigned char *new_dptr = (unsigned char *)realloc(dbuf.dptr,
3612 dbuf.dsize + new_dbuf.dsize);
3613 if (new_dptr == NULL) {
3614 free(dbuf.dptr);
3615 }
3616 dbuf.dptr = new_dptr;
3617 }
3618
3619 if (dbuf.dptr == NULL) {
3620 tdb->ecode = TDB_ERR_OOM;
3621 goto failed;
3622 }
3623
3624 memcpy(dbuf.dptr + dbuf.dsize, new_dbuf.dptr, new_dbuf.dsize);
3625 dbuf.dsize += new_dbuf.dsize;
3626
3627 ret = tdb_store(tdb, key, dbuf, 0);
3628
3629 failed:
3630 tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
3631 SAFE_FREE(dbuf.dptr);
3632 return ret;
3633 }
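
/* Illustrative sketch (not compiled): appending to a record with tdb_append().
   If the key does not exist yet, the call behaves like a plain store of
   new_dbuf, as the code above shows. The names used here are hypothetical. */
#if 0
static int append_log_line(struct tdb_context *tdb, const char *line)
{
	TDB_DATA key, chunk;

	key.dptr = (unsigned char *)"log";
	key.dsize = 3;
	chunk.dptr = (unsigned char *)line;
	chunk.dsize = strlen(line);

	/* Creates the "log" record on first use, extends it afterwards. */
	return tdb_append(tdb, key, chunk);
}
#endif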
3634
3635
3636 /*
3637 return the name of the current tdb file
3638 useful for external logging functions
3639 */
3640 const char *tdb_name(struct tdb_context *tdb)
3641 {
3642 return tdb->name;
3643 }
3644
3645 /*
3646 return the underlying file descriptor being used by tdb, or -1
3647 useful for external routines that want to check the device/inode
3648 of the fd
3649 */
3650 int tdb_fd(struct tdb_context *tdb)
3651 {
3652 return tdb->fd;
3653 }
3654
3655 /*
3656 return the current logging function
3657 useful for external tdb routines that wish to log tdb errors
3658 */
3659 tdb_log_func tdb_log_fn(struct tdb_context *tdb)
3660 {
3661 return tdb->log.log_fn;
3662 }
3663
3664
3665 /*
3666 get the tdb sequence number. Only makes sense if the writers opened
3667 with TDB_SEQNUM set. Note that this sequence number will wrap quite
3668 quickly, so it should only be used for a 'has something changed'
3669   test, not for code that relies on an exact count of the changes
3670 made. If you want a counter then use a tdb record.
3671
3672 The aim of this sequence number is to allow for a very lightweight
3673 test of a possible tdb change.
3674 */
3675 int tdb_get_seqnum(struct tdb_context *tdb)
3676 {
3677 tdb_off_t seqnum=0;
3678
3679 tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
3680 return seqnum;
3681 }
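
/* Illustrative sketch (not compiled): the lightweight "has anything changed?"
   test described above. The writers must have opened the database with
   TDB_SEQNUM, or called tdb_enable_seqnum() (defined below), for the counter
   to advance. The cache_is_stale() helper and cached_seqnum are hypothetical. */
#if 0
static int cache_is_stale(struct tdb_context *tdb, int *cached_seqnum)
{
	int now = tdb_get_seqnum(tdb);

	if (now == *cached_seqnum)
		return 0;	/* nothing changed since we last looked */

	*cached_seqnum = now;
	return 1;		/* something changed; reload the cache */
}
#endif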
3682
3683 int tdb_hash_size(struct tdb_context *tdb)
3684 {
3685 return tdb->header.hash_size;
3686 }
3687
3688 size_t tdb_map_size(struct tdb_context *tdb)
3689 {
3690 return tdb->map_size;
3691 }
3692
3693 int tdb_get_flags(struct tdb_context *tdb)
3694 {
3695 return tdb->flags;
3696 }
3697
3698
3699 /*
3700 enable sequence number handling on an open tdb
3701 */
3702 void tdb_enable_seqnum(struct tdb_context *tdb)
3703 {
3704 tdb->flags |= TDB_SEQNUM;
3705 }
3706
3707 /* file: open.c */
3708
3709 /* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
3710 static struct tdb_context *tdbs = NULL;
3711
3712
3713 /* This is based on the hash algorithm from gdbm */
3714 static unsigned int default_tdb_hash(TDB_DATA *key)
3715 {
3716 u32 value; /* Used to compute the hash value. */
3717 u32 i; /* Used to cycle through random values. */
3718
3719 /* Set the initial value from the key size. */
3720 for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
3721 value = (value + (key->dptr[i] << (i*5 % 24)));
3722
3723 return (1103515243 * value + 12345);
3724 }
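
/* Illustrative sketch (not compiled): a caller can substitute its own hash by
   passing a function with this signature to tdb_open_ex() (defined further
   down); NULL selects default_tdb_hash above. This toy FNV-1a-style hash is a
   hypothetical example, not part of tdb; all readers and writers of a given
   database must agree on the hash function used. */
#if 0
static unsigned int example_fnv1a_hash(TDB_DATA *key)
{
	u32 h = 2166136261u;	/* FNV offset basis */
	u32 i;

	for (i = 0; i < key->dsize; i++) {
		h ^= key->dptr[i];
		h *= 16777619u;	/* FNV prime */
	}
	return h;
}
#endif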
3725
3726
3727 /* initialise a new database with a specified hash size */
3728 static int tdb_new_database(struct tdb_context *tdb, int hash_size)
3729 {
3730 struct tdb_header *newdb;
3731 int size, ret = -1;
3732
3733 /* We make it up in memory, then write it out if not internal */
3734 size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off_t);
3735 if (!(newdb = (struct tdb_header *)calloc(size, 1)))
3736 return TDB_ERRCODE(TDB_ERR_OOM, -1);
3737
3738 /* Fill in the header */
3739 newdb->version = TDB_VERSION;
3740 newdb->hash_size = hash_size;
3741 if (tdb->flags & TDB_INTERNAL) {
3742 tdb->map_size = size;
3743 tdb->map_ptr = (char *)newdb;
3744 memcpy(&tdb->header, newdb, sizeof(tdb->header));
3745 /* Convert the `ondisk' version if asked. */
3746 CONVERT(*newdb);
3747 return 0;
3748 }
3749 if (lseek(tdb->fd, 0, SEEK_SET) == -1)
3750 goto fail;
3751
3752 if (ftruncate(tdb->fd, 0) == -1)
3753 goto fail;
3754
3755 /* This creates an endian-converted header, as if read from disk */
3756 CONVERT(*newdb);
3757 memcpy(&tdb->header, newdb, sizeof(tdb->header));
3758 /* Don't endian-convert the magic food! */
3759 memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
3760 if (write(tdb->fd, newdb, size) != size) {
3761 ret = -1;
3762 } else {
3763 ret = 0;
3764 }
3765
3766 fail:
3767 SAFE_FREE(newdb);
3768 return ret;
3769 }
3770
3771
3772
3773 static int tdb_already_open(dev_t device,
3774 ino_t ino)
3775 {
3776 struct tdb_context *i;
3777
3778 for (i = tdbs; i; i = i->next) {
3779 if (i->device == device && i->inode == ino) {
3780 return 1;
3781 }
3782 }
3783
3784 return 0;
3785 }
3786
3787 /* open the database, creating it if necessary
3788
3789 The open_flags and mode are passed straight to the open call on the
3790 database file. A flags value of O_WRONLY is invalid. The hash size
3791    is advisory; use zero for a default value.
3792
3793    The return value is NULL on error, in which case errno is also set. Don't
3794    try to call tdb_error or tdb_errname; just use strerror(errno).
3795
3796 @param name may be NULL for internal databases. */
3797 struct tdb_context *tdb_open(const char *name, int hash_size, int tdb_flags,
3798 int open_flags, mode_t mode)
3799 {
3800 return tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode, NULL, NULL);
3801 }
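
/* Illustrative sketch (not compiled): a typical open/close cycle using the
   semantics documented above. A hash_size of 0 selects DEFAULT_HASH_SIZE, and
   on failure errno is all the caller gets. The path and function name below
   are hypothetical. */
#if 0
static struct tdb_context *open_example(void)
{
	struct tdb_context *tdb;

	tdb = tdb_open("/tmp/example.tdb", 0 /* default hash size */,
		       TDB_CLEAR_IF_FIRST, O_RDWR | O_CREAT, 0600);
	if (tdb == NULL) {
		fprintf(stderr, "tdb_open failed: %s\n", strerror(errno));
		return NULL;
	}
	/* ... use tdb_store()/tdb_fetch() ... */
	return tdb;	/* the caller eventually calls tdb_close(tdb) */
}
#endif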
3802
3803 /* a default logging function */
3804 static void null_log_fn(struct tdb_context *tdb, enum tdb_debug_level level, const char *fmt, ...) PRINTF_ATTRIBUTE(3, 4);
3805 static void null_log_fn(struct tdb_context *tdb, enum tdb_debug_level level, const char *fmt, ...)
3806 {
3807 }
3808
3809
3810 struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
3811 int open_flags, mode_t mode,
3812 const struct tdb_logging_context *log_ctx,
3813 tdb_hash_func hash_fn)
3814 {
3815 struct tdb_context *tdb;
3816 struct stat st;
3817 int rev = 0, locked = 0;
3818 unsigned char *vp;
3819 u32 vertest;
3820
3821 if (!(tdb = (struct tdb_context *)calloc(1, sizeof *tdb))) {
3822 /* Can't log this */
3823 errno = ENOMEM;
3824 goto fail;
3825 }
3826 tdb_io_init(tdb);
3827 tdb->fd = -1;
3828 tdb->name = NULL;
3829 tdb->map_ptr = NULL;
3830 tdb->flags = tdb_flags;
3831 tdb->open_flags = open_flags;
3832 if (log_ctx) {
3833 tdb->log = *log_ctx;
3834 } else {
3835 tdb->log.log_fn = null_log_fn;
3836 tdb->log.log_private = NULL;
3837 }
3838 tdb->hash_fn = hash_fn ? hash_fn : default_tdb_hash;
3839
3840 /* cache the page size */
3841 tdb->page_size = getpagesize();
3842 if (tdb->page_size <= 0) {
3843 tdb->page_size = 0x2000;
3844 }
3845
3846 if ((open_flags & O_ACCMODE) == O_WRONLY) {
3847 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: can't open tdb %s write-only\n",
3848 name));
3849 errno = EINVAL;
3850 goto fail;
3851 }
3852
3853 if (hash_size == 0)
3854 hash_size = DEFAULT_HASH_SIZE;
3855 if ((open_flags & O_ACCMODE) == O_RDONLY) {
3856 tdb->read_only = 1;
3857 /* read only databases don't do locking or clear if first */
3858 tdb->flags |= TDB_NOLOCK;
3859 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
3860 }
3861
3862 /* internal databases don't mmap or lock, and start off cleared */
3863 if (tdb->flags & TDB_INTERNAL) {
3864 tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
3865 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
3866 if (tdb_new_database(tdb, hash_size) != 0) {
3867 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: tdb_new_database failed!"));
3868 goto fail;
3869 }
3870 goto internal;
3871 }
3872
3873 if ((tdb->fd = open(name, open_flags, mode)) == -1) {
3874 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_open_ex: could not open file %s: %s\n",
3875 name, strerror(errno)));
3876 goto fail; /* errno set by open(2) */
3877 }
3878
3879 /* ensure there is only one process initialising at once */
3880 if (tdb->methods->tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0, 1) == -1) {
3881 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: failed to get global lock on %s: %s\n",
3882 name, strerror(errno)));
3883 goto fail; /* errno set by tdb_brlock */
3884 }
3885
3886 /* we need to zero database if we are the only one with it open */
3887 if ((tdb_flags & TDB_CLEAR_IF_FIRST) &&
3888 (locked = (tdb->methods->tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0, 1) == 0))) {
3889 open_flags |= O_CREAT;
3890 if (ftruncate(tdb->fd, 0) == -1) {
3891 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: "
3892 "failed to truncate %s: %s\n",
3893 name, strerror(errno)));
3894 goto fail; /* errno set by ftruncate */
3895 }
3896 }
3897
3898 if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
3899 || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
3900 || (tdb->header.version != TDB_VERSION
3901 && !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
3902 		/* it's not a valid database - possibly initialise it */
3903 if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
3904 errno = EIO; /* ie bad format or something */
3905 goto fail;
3906 }
3907 rev = (tdb->flags & TDB_CONVERT);
3908 }
3909 vp = (unsigned char *)&tdb->header.version;
3910 vertest = (((u32)vp[0]) << 24) | (((u32)vp[1]) << 16) |
3911 (((u32)vp[2]) << 8) | (u32)vp[3];
3912 tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
3913 if (!rev)
3914 tdb->flags &= ~TDB_CONVERT;
3915 else {
3916 tdb->flags |= TDB_CONVERT;
3917 tdb_convert(&tdb->header, sizeof(tdb->header));
3918 }
3919 if (fstat(tdb->fd, &st) == -1)
3920 goto fail;
3921
3922 if (tdb->header.rwlocks != 0) {
3923 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: spinlocks no longer supported\n"));
3924 goto fail;
3925 }
3926
3927 /* Is it already in the open list? If so, fail. */
3928 if (tdb_already_open(st.st_dev, st.st_ino)) {
3929 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: "
3930 "%s (%d,%d) is already open in this process\n",
3931 name, (int)st.st_dev, (int)st.st_ino));
3932 errno = EBUSY;
3933 goto fail;
3934 }
3935
3936 if (!(tdb->name = (char *)strdup(name))) {
3937 errno = ENOMEM;
3938 goto fail;
3939 }
3940
3941 tdb->map_size = st.st_size;
3942 tdb->device = st.st_dev;
3943 tdb->inode = st.st_ino;
3944 tdb->max_dead_records = 0;
3945 tdb_mmap(tdb);
3946 if (locked) {
3947 if (tdb->methods->tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0, 1) == -1) {
3948 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: "
3949 				 "failed to release ACTIVE_LOCK on %s: %s\n",
3950 name, strerror(errno)));
3951 goto fail;
3952 }
3953
3954 }
3955
3956 /* We always need to do this if the CLEAR_IF_FIRST flag is set, even if
3957 	   we didn't get the initial exclusive lock, as we need to let all other
3958 users know we're using it. */
3959
3960 if (tdb_flags & TDB_CLEAR_IF_FIRST) {
3961 /* leave this lock in place to indicate it's in use */
3962 if (tdb->methods->tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0, 1) == -1)
3963 goto fail;
3964 }
3965
3966 /* if needed, run recovery */
3967 if (tdb_transaction_recover(tdb) == -1) {
3968 goto fail;
3969 }
3970
3971 internal:
3972 /* Internal (memory-only) databases skip all the code above to
3973 * do with disk files, and resume here by releasing their
3974 * global lock and hooking into the active list. */
3975 if (tdb->methods->tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1) == -1)
3976 goto fail;
3977 tdb->next = tdbs;
3978 tdbs = tdb;
3979 return tdb;
3980
3981 fail:
3982 { int save_errno = errno;
3983
3984 if (!tdb)
3985 return NULL;
3986
3987 if (tdb->map_ptr) {
3988 if (tdb->flags & TDB_INTERNAL)
3989 SAFE_FREE(tdb->map_ptr);
3990 else
3991 tdb_munmap(tdb);
3992 }
3993 SAFE_FREE(tdb->name);
3994 if (tdb->fd != -1)
3995 if (close(tdb->fd) != 0)
3996 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: failed to close tdb->fd on error!\n"));
3997 SAFE_FREE(tdb);
3998 errno = save_errno;
3999 return NULL;
4000 }
4001 }
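
/* Illustrative sketch (not compiled): opening with an external logging hook
   via tdb_open_ex(). The logging function and wrapper shown here are
   hypothetical; passing NULL for both log_ctx and hash_fn gives the same
   behaviour as plain tdb_open(). */
#if 0
static void example_log(struct tdb_context *tdb, enum tdb_debug_level level,
			const char *fmt, ...) PRINTF_ATTRIBUTE(3, 4);
static void example_log(struct tdb_context *tdb, enum tdb_debug_level level,
			const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	va_end(ap);
}

static struct tdb_context *open_with_logging(const char *path)
{
	struct tdb_logging_context log_ctx;

	log_ctx.log_fn = example_log;
	log_ctx.log_private = NULL;

	return tdb_open_ex(path, 0, TDB_DEFAULT, O_RDWR | O_CREAT, 0600,
			   &log_ctx, NULL /* default hash */);
}
#endif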
4002
4003 /*
4004 * Set the maximum number of dead records per hash chain
4005 */
4006
4007 void tdb_set_max_dead(struct tdb_context *tdb, int max_dead)
4008 {
4009 tdb->max_dead_records = max_dead;
4010 }
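
/* Illustrative sketch (not compiled): allowing a few dead records per hash
   chain, so frequent delete/store cycles can reuse space within the chain
   (see the dead-record path in tdb_store() above) instead of taking the
   central freelist lock every time. The value 5 is an arbitrary example. */
#if 0
static void tune_for_churn(struct tdb_context *tdb)
{
	tdb_set_max_dead(tdb, 5);
}
#endif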
4011
4012 /**
4013 * Close a database.
4014 *
4015 * @returns -1 for error; 0 for success.
4016 **/
4017 int tdb_close(struct tdb_context *tdb)
4018 {
4019 struct tdb_context **i;
4020 int ret = 0;
4021
4022 if (tdb->transaction) {
4023 tdb_transaction_cancel(tdb);
4024 }
4025
4026 if (tdb->map_ptr) {
4027 if (tdb->flags & TDB_INTERNAL)
4028 SAFE_FREE(tdb->map_ptr);
4029 else
4030 tdb_munmap(tdb);
4031 }
4032 SAFE_FREE(tdb->name);
4033 if (tdb->fd != -1)
4034 ret = close(tdb->fd);
4035 SAFE_FREE(tdb->lockrecs);
4036
4037 /* Remove from contexts list */
4038 for (i = &tdbs; *i; i = &(*i)->next) {
4039 if (*i == tdb) {
4040 *i = tdb->next;
4041 break;
4042 }
4043 }
4044
4045 memset(tdb, 0, sizeof(*tdb));
4046 SAFE_FREE(tdb);
4047
4048 return ret;
4049 }
4050
4051 /* register a logging function */
4052 void tdb_set_logging_function(struct tdb_context *tdb,
4053 const struct tdb_logging_context *log_ctx)
4054 {
4055 tdb->log = *log_ctx;
4056 }
4057
4058 void *tdb_get_logging_private(struct tdb_context *tdb)
4059 {
4060 return tdb->log.log_private;
4061 }
4062
4063 /* reopen a tdb - this can be used after a fork to ensure that we have an independent
4064 seek pointer from our parent and to re-establish locks */
4065 int tdb_reopen(struct tdb_context *tdb)
4066 {
4067 struct stat st;
4068
4069 if (tdb->flags & TDB_INTERNAL) {
4070 return 0; /* Nothing to do. */
4071 }
4072
4073 if (tdb->num_locks != 0 || tdb->global_lock.count) {
4074 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_reopen: reopen not allowed with locks held\n"));
4075 goto fail;
4076 }
4077
4078 if (tdb->transaction != 0) {
4079 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_reopen: reopen not allowed inside a transaction\n"));
4080 goto fail;
4081 }
4082
4083 if (tdb_munmap(tdb) != 0) {
4084 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: munmap failed (%s)\n", strerror(errno)));
4085 goto fail;
4086 }
4087 if (close(tdb->fd) != 0)
4088 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: WARNING closing tdb->fd failed!\n"));
4089 tdb->fd = open(tdb->name, tdb->open_flags & ~(O_CREAT|O_TRUNC), 0);
4090 if (tdb->fd == -1) {
4091 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: open failed (%s)\n", strerror(errno)));
4092 goto fail;
4093 }
4094 if ((tdb->flags & TDB_CLEAR_IF_FIRST) &&
4095 (tdb->methods->tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0, 1) == -1)) {
4096 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: failed to obtain active lock\n"));
4097 goto fail;
4098 }
4099 if (fstat(tdb->fd, &st) != 0) {
4100 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: fstat failed (%s)\n", strerror(errno)));
4101 goto fail;
4102 }
4103 if (st.st_ino != tdb->inode || st.st_dev != tdb->device) {
4104 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: file dev/inode has changed!\n"));
4105 goto fail;
4106 }
4107 tdb_mmap(tdb);
4108
4109 return 0;
4110
4111 fail:
4112 tdb_close(tdb);
4113 return -1;
4114 }
4115
4116 /* reopen all tdbs */
4117 int tdb_reopen_all(int parent_longlived)
4118 {
4119 struct tdb_context *tdb;
4120
4121 for (tdb=tdbs; tdb; tdb = tdb->next) {
4122 /*
4123 		 * If the parent is long-lived (i.e. a
4124 * parent daemon architecture), we know
4125 		 * it will keep its active lock on a
4126 * tdb opened with CLEAR_IF_FIRST. Thus
4127 * for child processes we don't have to
4128 * add an active lock. This is essential
4129 * to improve performance on systems that
4130 * keep POSIX locks as a non-scalable data
4131 * structure in the kernel.
4132 */
4133 if (parent_longlived) {
4134 /* Ensure no clear-if-first. */
4135 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
4136 }
4137
4138 if (tdb_reopen(tdb) != 0)
4139 return -1;
4140 }
4141
4142 return 0;
4143 }
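
/* Illustrative sketch (not compiled): the intended use of tdb_reopen_all()
   right after fork(), so the child gets its own file descriptors and locks.
   Passing 1 for parent_longlived assumes the parent keeps running and keeps
   holding the CLEAR_IF_FIRST active locks, as described above. The
   spawn_worker() helper is hypothetical. */
#if 0
static pid_t spawn_worker(void)
{
	pid_t pid = fork();

	if (pid == 0) {
		/* child: must not share the parent's fds and fcntl locks */
		if (tdb_reopen_all(1 /* parent stays alive */) == -1) {
			exit(1);
		}
		/* ... child work using the reopened tdbs ... */
	}
	return pid;
}
#endif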
4144