1 #pragma once
2 #ifndef IWKV_INTERNAL_H
3 #define IWKV_INTERNAL_H
4
5 #include "iwkv.h"
6 #include "iwlog.h"
7 #include "iwarr.h"
8 #include "iwutils.h"
9 #include "iwfsmfile.h"
10 #include "iwdlsnr.h"
11 #include "iwal.h"
12 #include "iwhmap.h"
13 #include "ksort.h"
14
15 #include <pthread.h>
16 #include <stdatomic.h>
17 #include <unistd.h>
18
19 #include "iwcfg.h"
20
21 #if defined(__APPLE__) || defined(__ANDROID__)
22 #include "pthread_spin_lock_shim.h"
23 #endif
24
25 // IWKV magic number
26 #define IWKV_MAGIC 0x69776b76U
27
28 // IWKV backup magic number
29 #define IWKV_BACKUP_MAGIC 0xBACBAC69U
30
31 // IWKV file format version
32 #define IWKV_FORMAT 2
33
34 // IWDB magic number
35 #define IWDB_MAGIC 0x69776462U
36
37 #ifdef IW_32
38 // Max database file size on 32 bit systems: 2Gb
39 #define IWKV_MAX_DBSZ 0x7fffffff
40 #else
41 // Max database file size: ~512Gb
42 #define IWKV_MAX_DBSZ 0x7fffffff80ULL
43 #endif
44
45 // Size of KV fsm block as power of 2
46 #define IWKV_FSM_BPOW 7U
47
48 #define IWKV_FSM_ALLOC_FLAGS (IWFSM_ALLOC_NO_OVERALLOCATE | IWFSM_SOLID_ALLOCATED_SPACE | IWFSM_ALLOC_NO_STATS)
49
50 // Length of KV fsm header in bytes
51 #define KVHDRSZ 255U
52
53 // [u1:flags,lvl:u1,lkl:u1,pnum:u1,p0:u4,kblk:u4,[pi0:u1,... pi32],n0-n23:u4,lk:u116]:u256 // SBLK
54
55 // Maximum length of prefix key to compare for v2 formst
56 #define PREFIX_KEY_LEN_V1 116U
57
58 #define PREFIX_KEY_LEN_V2 115U
59
60 // Number of skip list levels
61 #define SLEVELS 24U
62
63 #define AANUM (2U * SLEVELS + 2 /* levels + (new block created) + (db block may be updated) */)
64
65 // Lower key length in SBLK
66 #define SBLK_LKLEN PREFIX_KEY_LEN_V2
67
68 // Size of database start block in bytes
69 #define DB_SZ (2U * (1U << IWKV_FSM_BPOW))
70
71 // Size of `SBLK` in bytes
72 #define SBLK_SZ (2U * (1U << IWKV_FSM_BPOW))
73
74 // Number of SBLK blocks in one page
75 #define SBLK_PAGE_SBLK_NUM_V2 16U
76
77 // Size of page with adjacent SBLK blocks. 4096
78 // Data format version: v2
79 #define SBLK_PAGE_SZ_V2 (SBLK_PAGE_SBLK_NUM_V2 * SBLK_SZ)
80
81 // Number of `KV` blocks in KVBLK
82 #define KVBLK_IDXNUM 32U
83
84 // Initial `KVBLK` size power of 2
85 #define KVBLK_INISZPOW 9U
86
87 // KVBLK header size: blen:u1,idxsz:u2
88 #define KVBLK_HDRSZ 3U
89
90 // Max kvp offset bytes
91 #define KVP_MAX_OFF_VLEN 8U
92
93 // Max kvp len 0xfffffffULL bytes
94 #define KVP_MAX_LEN_VLEN 5U
95
96 #define KVBLK_MAX_IDX_SZ ((KVP_MAX_OFF_VLEN + KVP_MAX_LEN_VLEN) * KVBLK_IDXNUM)
97
98 // Max non KV size [blen:u1,idxsz:u2,[ps1:vn,pl1:vn,...,ps63,pl63]
99 #define KVBLK_MAX_NKV_SZ (KVBLK_HDRSZ + KVBLK_MAX_IDX_SZ)
100
101 #define ADDR2BLK(addr_) ((blkn_t) (((uint64_t) (addr_)) >> IWKV_FSM_BPOW))
102
103 #define BLK2ADDR(blk_) (((uint64_t) (blk_)) << IWKV_FSM_BPOW)
104
105 struct _IWKV;
106 struct _IWDB;
107
108 typedef uint32_t blkn_t;
109 typedef uint32_t dbid_t;
110
111 /* Key/Value pair stored in `KVBLK` */
112 typedef struct KV {
113 size_t keysz;
114 size_t valsz;
115 uint8_t *key;
116 uint8_t *val;
117 } KV;
118
119 /* Ket/Value (KV) index: Offset and length. */
120 typedef struct KVP {
121 off_t off; /**< KV block offset relative to `end` of KVBLK */
122 uint32_t len; /**< Length of kv pair block */
123 uint8_t ridx; /**< Position of the actually persisted slot in `KVBLK` */
124 } KVP;
125
126 typedef uint8_t kvblk_flags_t;
127 #define KVBLK_DEFAULT ((kvblk_flags_t) 0x00U)
128 /** KVBLK data is durty and should be flushed to mm */
129 #define KVBLK_DURTY ((kvblk_flags_t) 0x01U)
130
131 typedef uint8_t kvblk_rmkv_opts_t;
132 #define RMKV_SYNC ((kvblk_rmkv_opts_t) 0x01U)
133 #define RMKV_NO_RESIZE ((kvblk_rmkv_opts_t) 0x02U)
134
135 typedef uint8_t sblk_flags_t;
136 /** The lowest `SBLK` key is fully contained in `SBLK`. Persistent flag. */
137 #define SBLK_FULL_LKEY ((sblk_flags_t) 0x01U)
138 /** This block is the start database block. */
139 #define SBLK_DB ((sblk_flags_t) 0x08U)
140 /** Block data changed, block marked as durty and needs to be persisted */
141 #define SBLK_DURTY ((sblk_flags_t) 0x10U)
142
143 typedef uint8_t iwlctx_op_t;
144 /** Put key value operation */
145 #define IWLCTX_PUT ((iwlctx_op_t) 0x01U)
146 /** Delete key operation */
147 #define IWLCTX_DEL ((iwlctx_op_t) 0x01U)
148
149 /* KVBLK: [szpow:u1,idxsz:u2,[ps0:vn,pl0:vn,..., ps32,pl32]____[[KV],...]] */
150 typedef struct KVBLK {
151 IWDB db;
152 off_t addr; /**< Block address */
153 off_t maxoff; /**< Max pair offset */
154 uint16_t idxsz; /**< Size of KV pairs index in bytes */
155 int8_t zidx; /**< Index of first empty pair slot (zero index), or -1 */
156 uint8_t szpow; /**< Block size as power of 2 */
157 kvblk_flags_t flags; /**< Flags */
158 KVP pidx[KVBLK_IDXNUM]; /**< KV pairs index */
159 } KVBLK;
160
161 #define SBLK_PERSISTENT_FLAGS (SBLK_FULL_LKEY)
162 #define SBLK_CACHE_FLAGS (SBLK_CACHE_UPDATE | SBLK_CACHE_PUT | SBLK_CACHE_REMOVE)
163
164 struct _IWKV_cursor;
165
166 /* Database: [magic:u4,dbflg:u1,dbid:u4,next_db_blk:u4,p0:u4,n[24]:u4,c[24]:u4]:209 */
167 struct _IWDB {
168 // SBH
169 IWDB db; /**< Database ref */
170 off_t addr; /**< Database block address */
171 sblk_flags_t flags; /**< Flags */
172 // !SBH
173 IWKV iwkv;
174 pthread_rwlock_t rwl; /**< Database API RW lock */
175 pthread_spinlock_t cursors_slk; /**< Cursors set guard lock */
176 off_t next_db_addr; /**< Next IWDB addr */
177 struct _IWKV_cursor *cursors; /**< Active (currently in-use) database cursors */
178 struct _IWDB *next; /**< Next IWDB meta */
179 struct _IWDB *prev; /**< Prev IWDB meta */
180 dbid_t id; /**< Database ID */
181 volatile int32_t wk_count; /**< Number of active database workers */
182 blkn_t meta_blk; /**< Database meta block number */
183 blkn_t meta_blkn; /**< Database meta length (number of blocks) */
184 iwdb_flags_t dbflg; /**< Database specific flags */
185 atomic_bool open; /**< True if DB is in OPEN state */
186 volatile bool wk_pending_exclusive; /**< If true someone wants to acquire exclusive lock on IWDB */
187 uint32_t lcnt[SLEVELS]; /**< SBLK count per level */
188 };
189
190 /* Skiplist block: [u1:flags,lvl:u1,lkl:u1,pnum:u1,p0:u4,kblk:u4,[pi0:u1,... pi32],n0-n23:u4,lk:u116]:u256 // SBLK */
191 typedef struct SBLK {
192 // SBH
193 IWDB db; /**< Database ref */
194 off_t addr; /**< Block address */
195 sblk_flags_t flags; /**< Flags */
196 uint8_t lvl; /**< Skip list node level */
197 uint8_t bpos; /**< Position of SBLK in a page block starting with 1 (zero means SBLK deleted) */
198 blkn_t p0; /**< Prev node, if IWDB it is the last node */
199 blkn_t n[SLEVELS]; /**< Next nodes */
200 // !SBH
201 KVBLK *kvblk; /**< Associated KVBLK */
202 blkn_t kvblkn; /**< Associated KVBLK block number */
203 int8_t pnum; /**< Number of active kv indexes in `SBLK::pi` */
204 uint8_t lkl; /**< Lower key length within a buffer */
205 uint8_t pi[KVBLK_IDXNUM]; /**< Sorted KV slots, value is an index of kv slot in `KVBLK` */
206 uint8_t lk[PREFIX_KEY_LEN_V1]; /**< Lower key buffer */
207 } SBLK;
208
209 /** IWKV instance */
210 struct _IWKV {
211 IWFS_FSM fsm; /**< FSM pool */
212 pthread_rwlock_t rwl; /**< API RW lock */
213 iwrc fatalrc; /**< Fatal error occuried, no farther operations can be performed */
214 IWDB first_db; /**< First database in chain */
215 IWDB last_db; /**< Last database in chain */
216 IWDLSNR *dlsnr; /**< WAL data events listener */
217 IWHMAP *dbs; /**< Database id -> IWDB mapping */
218 iwkv_openflags oflags; /**< Open flags */
219 pthread_cond_t wk_cond; /**< Workers cond variable */
220 pthread_mutex_t wk_mtx; /**< Workers cond mutext */
221 int32_t fmt_version; /**< Database format version */
222 int32_t pklen; /**< Prefix key length in use */
223 volatile int32_t wk_count; /**< Number of active workers */
224 volatile bool wk_pending_exclusive; /**< If true someone wants to acquire exclusive lock on IWKV */
225 atomic_bool open; /**< True if kvstore is in OPEN state */
226 };
227
228 /** Database lookup context */
229 typedef struct IWLCTX {
230 IWDB db;
231 const IWKV_val *key; /**< Search key */
232 IWKV_val *val; /**< Update value */
233 SBLK *lower; /**< Next to upper bound block */
234 SBLK *upper; /**< Upper bound block */
235 SBLK *nb; /**< New block */
236 off_t destroy_addr; /**< Block to destroy address */
237 off_t upper_addr; /**< Upper block address used in `_lx_del_lr()` */
238 #ifndef NDEBUG
239 uint32_t num_cmps;
240 #endif
241 iwkv_opflags opflags; /**< Operation flags */
242 sblk_flags_t sbflags; /**< `SBLK` flags applied to all new/looked blocks in this context */
243 iwlctx_op_t op; /**< Context operation */
244 uint8_t saan; /**< Position of next free `SBLK` element in the `saa` area */
245 uint8_t kaan; /**< Position of next free `KVBLK` element in the `kaa` area */
246 int8_t nlvl; /**< Level of new inserted/deleted `SBLK` node. -1 if no new node inserted/deleted */
247 IWKV_PUT_HANDLER ph; /**< Optional put handler */
248 void *phop; /**< Put handler opaque data */
249 SBLK *plower[SLEVELS]; /**< Pinned lower nodes per level */
250 SBLK *pupper[SLEVELS]; /**< Pinned upper nodes per level */
251 IWKV_val ekey;
252 SBLK dblk; /**< First database block */
253 SBLK saa[AANUM]; /**< `SBLK` allocation area */
254 KVBLK kaa[AANUM]; /**< `KVBLK` allocation area */
255 uint8_t nbuf[IW_VNUMBUFSZ];
256 uint8_t incbuf[8]; /**< Buffer used to store incremented/decremented values `IWKV_VAL_INCREMENT` opflag */
257 } IWLCTX;
258
259 /** Cursor context */
260 struct _IWKV_cursor {
261 uint8_t cnpos; /**< Position in the current `SBLK` node */
262 bool closed; /**< Cursor closed */
263 int8_t skip_next; /**< When to skip next IWKV_CURSOR_NEXT|IWKV_CURSOR_PREV cursor move
264 due to the side effect of `iwkv_cursor_del()` call.
265 If `skip_next > 0` `IWKV_CURSOR_NEXT` will be skipped
266 If `skip_next < 0` `IWKV_CURSOR_PREV` will be skipped */
267 SBLK *cn; /**< Current `SBLK` node */
268 struct _IWKV_cursor *next; /**< Next cursor in active db cursors chain */
269 off_t dbaddr; /**< Database address used as `cn` */
270 IWLCTX lx; /**< Lookup context */
271 };
272
273 #define ENSURE_OPEN(iwkv_) \
274 if (!(iwkv_) || !((iwkv_)->open)) return IW_ERROR_INVALID_STATE; \
275 if ((iwkv_)->fatalrc) return (iwkv_)->fatalrc
276
277 #define ENSURE_OPEN_DB(db_) \
278 if (!(db_) || !(db_)->iwkv || !(db_)->open || !((db_)->iwkv->open)) return IW_ERROR_INVALID_STATE
279
280 #define API_RLOCK(iwkv_, rci_) \
281 ENSURE_OPEN(iwkv_); \
282 (rci_) = pthread_rwlock_rdlock(&(iwkv_)->rwl); \
283 if (rci_) return iwrc_set_errno(IW_ERROR_THREADING_ERRNO, rci_)
284
_api_rlock(IWKV iwkv)285 IW_INLINE iwrc _api_rlock(IWKV iwkv) {
286 int rci;
287 API_RLOCK(iwkv, rci);
288 return 0;
289 }
290
291 #define API_WLOCK(iwkv_, rci_) \
292 ENSURE_OPEN(iwkv_); \
293 (rci_) = pthread_rwlock_wrlock(&(iwkv_)->rwl); \
294 if (rci_) return iwrc_set_errno(IW_ERROR_THREADING_ERRNO, rci_)
295
_api_wlock(IWKV iwkv)296 IW_INLINE iwrc _api_wlock(IWKV iwkv) {
297 int rci;
298 API_WLOCK(iwkv, rci);
299 return 0;
300 }
301
302 #define API_UNLOCK(iwkv_, rci_, rc_) \
303 rci_ = pthread_rwlock_unlock(&(iwkv_)->rwl); \
304 if (rci_) IWRC(iwrc_set_errno(IW_ERROR_THREADING_ERRNO, rci_), rc_)
305
306 #define API_DB_RLOCK(db_, rci_) \
307 do { \
308 API_RLOCK((db_)->iwkv, rci_); \
309 (rci_) = pthread_rwlock_rdlock(&(db_)->rwl); \
310 if (rci_) { \
311 pthread_rwlock_unlock(&(db_)->iwkv->rwl); \
312 return iwrc_set_errno(IW_ERROR_THREADING_ERRNO, rci_); \
313 } \
314 } while (0)
315
_api_db_rlock(IWDB db)316 IW_INLINE iwrc _api_db_rlock(IWDB db) {
317 int rci;
318 API_DB_RLOCK(db, rci);
319 return 0;
320 }
321
322 #define API_DB_WLOCK(db_, rci_) \
323 do { \
324 API_RLOCK((db_)->iwkv, rci_); \
325 (rci_) = pthread_rwlock_wrlock(&(db_)->rwl); \
326 if (rci_) { \
327 pthread_rwlock_unlock(&(db_)->iwkv->rwl); \
328 return iwrc_set_errno(IW_ERROR_THREADING_ERRNO, rci_); \
329 } \
330 } while (0)
331
_api_db_wlock(IWDB db)332 IW_INLINE iwrc _api_db_wlock(IWDB db) {
333 int rci;
334 API_DB_WLOCK(db, rci);
335 return 0;
336 }
337
338 #define API_DB_UNLOCK(db_, rci_, rc_) \
339 do { \
340 (rci_) = pthread_rwlock_unlock(&(db_)->rwl); \
341 if (rci_) IWRC(iwrc_set_errno(IW_ERROR_THREADING_ERRNO, rci_), rc_); \
342 API_UNLOCK((db_)->iwkv, rci_, rc_); \
343 } while (0)
344
345 #define AAPOS_INC(aan_) \
346 do { \
347 if ((aan_) < AANUM - 1) { \
348 (aan_) = (aan_) + 1; \
349 } else { \
350 (aan_) = 0; \
351 } \
352 } while (0)
353
354
355 // SBLK
356 // [flags:u1,lvl:u1,lkl:u1,pnum:u1,p0:u4,kblk:u4,pi:u1[32],n:u4[24],bpos:u1,lk:u115]:u256
357
358 #define SOFF_FLAGS_U1 0
359 #define SOFF_LVL_U1 (SOFF_FLAGS_U1 + 1)
360 #define SOFF_LKL_U1 (SOFF_LVL_U1 + 1)
361 #define SOFF_PNUM_U1 (SOFF_LKL_U1 + 1)
362 #define SOFF_P0_U4 (SOFF_PNUM_U1 + 1)
363 #define SOFF_KBLK_U4 (SOFF_P0_U4 + 4)
364 #define SOFF_PI0_U1 (SOFF_KBLK_U4 + 4)
365 #define SOFF_N0_U4 (SOFF_PI0_U1 + 1 * KVBLK_IDXNUM)
366 #define SOFF_BPOS_U1_V2 (SOFF_N0_U4 + 4 * SLEVELS)
367 #define SOFF_LK_V2 (SOFF_BPOS_U1_V2 + 1)
368 #define SOFF_LK_V1 (SOFF_N0_U4 + 4 * SLEVELS)
369 #define SOFF_END (SOFF_LK_V2 + SBLK_LKLEN)
370 static_assert(SOFF_END == 256, "SOFF_END == 256");
371 static_assert(SBLK_SZ >= SOFF_END, "SBLK_SZ >= SOFF_END");
372
373 // DB
374 // [magic:u4,dbflg:u1,dbid:u4,next_db_blk:u4,p0:u4,n[24]:u4,c[24]:u4,meta_blk:u4,meta_blkn:u4]:217
375 #define DOFF_MAGIC_U4 0
376 #define DOFF_DBFLG_U1 (DOFF_MAGIC_U4 + 4)
377 #define DOFF_DBID_U4 (DOFF_DBFLG_U1 + 1)
378 #define DOFF_NEXTDB_U4 (DOFF_DBID_U4 + 4)
379 #define DOFF_P0_U4 (DOFF_NEXTDB_U4 + 4)
380 #define DOFF_N0_U4 (DOFF_P0_U4 + 4)
381 #define DOFF_C0_U4 (DOFF_N0_U4 + 4 * SLEVELS)
382 #define DOFF_METABLK_U4 (DOFF_C0_U4 + 4 * SLEVELS)
383 #define DOFF_METABLKN_U4 (DOFF_METABLK_U4 + 4)
384 #define DOFF_END (DOFF_METABLKN_U4 + 4)
385 static_assert(DOFF_END == 217, "DOFF_END == 217");
386 static_assert(DB_SZ >= DOFF_END, "DB_SZ >= DOFF_END");
387
388 // KVBLK
389 // [szpow:u1,idxsz:u2,[ps1:vn,pl1:vn,...,ps32,pl32]____[[_KV],...]] // KVBLK
390 #define KBLK_SZPOW_OFF 0
391
392
393 iwrc iwkv_exclusive_lock(IWKV iwkv);
394 iwrc iwkv_exclusive_unlock(IWKV iwkv);
395 void iwkvd_trigger_xor(uint64_t val);
396 void iwkvd_kvblk(FILE *f, KVBLK *kb, int maxvlen);
397 iwrc iwkvd_sblk(FILE *f, IWLCTX *lx, SBLK *sb, int flags);
398 void iwkvd_db(FILE *f, IWDB db, int flags, int plvl);
399
400 // IWKVD Trigger commands
401 #ifdef IW_TESTS
402 #define IWKVD_WAL_NO_CHECKPOINT_ON_CLOSE 1UL
403 #endif
404
405 #endif
406