• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #pragma once
2 #ifndef IWKV_INTERNAL_H
3 #define IWKV_INTERNAL_H
4 
5 #include "iwkv.h"
6 #include "iwlog.h"
7 #include "iwarr.h"
8 #include "iwutils.h"
9 #include "iwfsmfile.h"
10 #include "iwdlsnr.h"
11 #include "iwal.h"
12 #include "iwhmap.h"
13 #include "ksort.h"
14 
15 #include <pthread.h>
16 #include <stdatomic.h>
17 #include <unistd.h>
18 
19 #include "iwcfg.h"
20 
21 #if defined(__APPLE__) || defined(__ANDROID__)
22 #include "pthread_spin_lock_shim.h"
23 #endif
24 
25 // IWKV magic number
26 #define IWKV_MAGIC 0x69776b76U
27 
28 // IWKV backup magic number
29 #define IWKV_BACKUP_MAGIC 0xBACBAC69U
30 
31 // IWKV file format version
32 #define IWKV_FORMAT 2
33 
34 // IWDB magic number
35 #define IWDB_MAGIC 0x69776462U
36 
37 #ifdef IW_32
38 // Max database file size on 32 bit systems: 2Gb
39 #define IWKV_MAX_DBSZ 0x7fffffff
40 #else
41 // Max database file size: ~512Gb
42 #define IWKV_MAX_DBSZ 0x7fffffff80ULL
43 #endif
44 
45 // Size of KV fsm block as power of 2
46 #define IWKV_FSM_BPOW 7U
47 
48 #define IWKV_FSM_ALLOC_FLAGS (IWFSM_ALLOC_NO_OVERALLOCATE | IWFSM_SOLID_ALLOCATED_SPACE | IWFSM_ALLOC_NO_STATS)
49 
50 // Length of KV fsm header in bytes
51 #define KVHDRSZ 255U
52 
53 // [u1:flags,lvl:u1,lkl:u1,pnum:u1,p0:u4,kblk:u4,[pi0:u1,... pi32],n0-n23:u4,lk:u116]:u256 // SBLK
54 
55 // Maximum length of prefix key to compare for v2 formst
56 #define PREFIX_KEY_LEN_V1 116U
57 
58 #define PREFIX_KEY_LEN_V2 115U
59 
60 // Number of skip list levels
61 #define SLEVELS 24U
62 
63 #define AANUM (2U * SLEVELS + 2 /* levels + (new block created) + (db block may be updated) */)
64 
65 // Lower key length in SBLK
66 #define SBLK_LKLEN PREFIX_KEY_LEN_V2
67 
68 // Size of database start block in bytes
69 #define DB_SZ (2U * (1U << IWKV_FSM_BPOW))
70 
71 // Size of `SBLK` in bytes
72 #define SBLK_SZ (2U * (1U << IWKV_FSM_BPOW))
73 
74 // Number of SBLK blocks in one page
75 #define SBLK_PAGE_SBLK_NUM_V2 16U
76 
77 // Size of page with adjacent SBLK blocks. 4096
78 // Data format version: v2
79 #define SBLK_PAGE_SZ_V2 (SBLK_PAGE_SBLK_NUM_V2 * SBLK_SZ)
80 
81 // Number of `KV` blocks in KVBLK
82 #define KVBLK_IDXNUM 32U
83 
84 // Initial `KVBLK` size power of 2
85 #define KVBLK_INISZPOW 9U
86 
87 // KVBLK header size: blen:u1,idxsz:u2
88 #define KVBLK_HDRSZ 3U
89 
90 // Max kvp offset bytes
91 #define KVP_MAX_OFF_VLEN 8U
92 
93 // Max kvp len 0xfffffffULL bytes
94 #define KVP_MAX_LEN_VLEN 5U
95 
96 #define KVBLK_MAX_IDX_SZ ((KVP_MAX_OFF_VLEN + KVP_MAX_LEN_VLEN) * KVBLK_IDXNUM)
97 
98 // Max non KV size [blen:u1,idxsz:u2,[ps1:vn,pl1:vn,...,ps63,pl63]
99 #define KVBLK_MAX_NKV_SZ (KVBLK_HDRSZ + KVBLK_MAX_IDX_SZ)
100 
101 #define ADDR2BLK(addr_) ((blkn_t) (((uint64_t) (addr_)) >> IWKV_FSM_BPOW))
102 
103 #define BLK2ADDR(blk_) (((uint64_t) (blk_)) << IWKV_FSM_BPOW)
104 
105 struct _IWKV;
106 struct _IWDB;
107 
108 typedef uint32_t blkn_t;
109 typedef uint32_t dbid_t;
110 
111 /* Key/Value pair stored in `KVBLK` */
112 typedef struct KV {
113   size_t   keysz;
114   size_t   valsz;
115   uint8_t *key;
116   uint8_t *val;
117 } KV;
118 
119 /* Ket/Value (KV) index: Offset and length. */
120 typedef struct KVP {
121   off_t    off;   /**< KV block offset relative to `end` of KVBLK */
122   uint32_t len;   /**< Length of kv pair block */
123   uint8_t  ridx;  /**< Position of the actually persisted slot in `KVBLK` */
124 } KVP;
125 
126 typedef uint8_t kvblk_flags_t;
127 #define KVBLK_DEFAULT ((kvblk_flags_t) 0x00U)
128 /** KVBLK data is durty and should be flushed to mm */
129 #define KVBLK_DURTY ((kvblk_flags_t) 0x01U)
130 
131 typedef uint8_t kvblk_rmkv_opts_t;
132 #define RMKV_SYNC      ((kvblk_rmkv_opts_t) 0x01U)
133 #define RMKV_NO_RESIZE ((kvblk_rmkv_opts_t) 0x02U)
134 
135 typedef uint8_t sblk_flags_t;
136 /** The lowest `SBLK` key is fully contained in `SBLK`. Persistent flag. */
137 #define SBLK_FULL_LKEY ((sblk_flags_t) 0x01U)
138 /** This block is the start database block. */
139 #define SBLK_DB ((sblk_flags_t) 0x08U)
140 /** Block data changed, block marked as durty and needs to be persisted */
141 #define SBLK_DURTY ((sblk_flags_t) 0x10U)
142 
143 typedef uint8_t iwlctx_op_t;
144 /** Put key value operation */
145 #define IWLCTX_PUT ((iwlctx_op_t) 0x01U)
146 /** Delete key operation */
147 #define IWLCTX_DEL ((iwlctx_op_t) 0x01U)
148 
149 /* KVBLK: [szpow:u1,idxsz:u2,[ps0:vn,pl0:vn,..., ps32,pl32]____[[KV],...]] */
150 typedef struct KVBLK {
151   IWDB     db;
152   off_t    addr;              /**< Block address */
153   off_t    maxoff;            /**< Max pair offset */
154   uint16_t idxsz;             /**< Size of KV pairs index in bytes */
155   int8_t   zidx;              /**< Index of first empty pair slot (zero index), or -1 */
156   uint8_t  szpow;             /**< Block size as power of 2 */
157   kvblk_flags_t flags;        /**< Flags */
158   KVP pidx[KVBLK_IDXNUM];     /**< KV pairs index */
159 } KVBLK;
160 
161 #define SBLK_PERSISTENT_FLAGS (SBLK_FULL_LKEY)
162 #define SBLK_CACHE_FLAGS      (SBLK_CACHE_UPDATE | SBLK_CACHE_PUT | SBLK_CACHE_REMOVE)
163 
164 struct _IWKV_cursor;
165 
166 /* Database: [magic:u4,dbflg:u1,dbid:u4,next_db_blk:u4,p0:u4,n[24]:u4,c[24]:u4]:209 */
167 struct _IWDB {
168   // SBH
169   IWDB  db;                       /**< Database ref */
170   off_t addr;                     /**< Database block address */
171   sblk_flags_t flags;             /**< Flags */
172   // !SBH
173   IWKV    iwkv;
174   pthread_rwlock_t   rwl;             /**< Database API RW lock */
175   pthread_spinlock_t cursors_slk;     /**< Cursors set guard lock */
176   off_t next_db_addr;                 /**< Next IWDB addr */
177   struct _IWKV_cursor *cursors;       /**< Active (currently in-use) database cursors */
178   struct _IWDB *next;                 /**< Next IWDB meta */
179   struct _IWDB *prev;                 /**< Prev IWDB meta */
180   dbid_t id;                          /**< Database ID */
181   volatile int32_t wk_count;          /**< Number of active database workers */
182   blkn_t       meta_blk;              /**< Database meta block number */
183   blkn_t       meta_blkn;             /**< Database meta length (number of blocks) */
184   iwdb_flags_t dbflg;                 /**< Database specific flags */
185   atomic_bool  open;                  /**< True if DB is in OPEN state */
186   volatile bool wk_pending_exclusive; /**< If true someone wants to acquire exclusive lock on IWDB */
187   uint32_t      lcnt[SLEVELS];        /**< SBLK count per level */
188 };
189 
190 /* Skiplist block: [u1:flags,lvl:u1,lkl:u1,pnum:u1,p0:u4,kblk:u4,[pi0:u1,... pi32],n0-n23:u4,lk:u116]:u256 // SBLK */
191 typedef struct SBLK {
192   // SBH
193   IWDB  db;                   /**< Database ref */
194   off_t addr;                 /**< Block address */
195   sblk_flags_t flags;         /**< Flags */
196   uint8_t      lvl;           /**< Skip list node level */
197   uint8_t      bpos;          /**< Position of SBLK in a page block starting with 1 (zero means SBLK deleted) */
198   blkn_t       p0;            /**< Prev node, if IWDB it is the last node */
199   blkn_t       n[SLEVELS];    /**< Next nodes */
200   // !SBH
201   KVBLK  *kvblk;                 /**< Associated KVBLK */
202   blkn_t  kvblkn;                /**< Associated KVBLK block number */
203   int8_t  pnum;                  /**< Number of active kv indexes in `SBLK::pi` */
204   uint8_t lkl;                   /**< Lower key length within a buffer */
205   uint8_t pi[KVBLK_IDXNUM];      /**< Sorted KV slots, value is an index of kv slot in `KVBLK` */
206   uint8_t lk[PREFIX_KEY_LEN_V1]; /**< Lower key buffer */
207 } SBLK;
208 
209 /** IWKV instance */
210 struct _IWKV {
211   IWFS_FSM fsm;                          /**< FSM pool */
212   pthread_rwlock_t rwl;                  /**< API RW lock */
213   iwrc     fatalrc;                      /**< Fatal error occuried, no farther operations can be performed */
214   IWDB     first_db;                     /**< First database in chain */
215   IWDB     last_db;                      /**< Last database in chain */
216   IWDLSNR *dlsnr;                        /**< WAL data events listener */
217   IWHMAP  *dbs;                          /**< Database id -> IWDB mapping */
218   iwkv_openflags  oflags;                /**< Open flags */
219   pthread_cond_t  wk_cond;               /**< Workers cond variable */
220   pthread_mutex_t wk_mtx;                /**< Workers cond mutext */
221   int32_t fmt_version;                   /**< Database format version */
222   int32_t pklen;                         /**< Prefix key length in use */
223   volatile int32_t wk_count;             /**< Number of active workers */
224   volatile bool    wk_pending_exclusive; /**< If true someone wants to acquire exclusive lock on IWKV */
225   atomic_bool      open;                 /**< True if kvstore is in OPEN state */
226 };
227 
228 /** Database lookup context */
229 typedef struct IWLCTX {
230   IWDB db;
231   const IWKV_val *key;        /**< Search key */
232   IWKV_val       *val;        /**< Update value */
233   SBLK *lower;                /**< Next to upper bound block */
234   SBLK *upper;                /**< Upper bound block */
235   SBLK *nb;                   /**< New block */
236   off_t destroy_addr;         /**< Block to destroy address */
237   off_t upper_addr;           /**< Upper block address used in `_lx_del_lr()` */
238 #ifndef NDEBUG
239   uint32_t num_cmps;
240 #endif
241   iwkv_opflags opflags;       /**< Operation flags */
242   sblk_flags_t sbflags;       /**< `SBLK` flags applied to all new/looked blocks in this context */
243   iwlctx_op_t  op;            /**< Context operation */
244   uint8_t      saan;          /**< Position of next free `SBLK` element in the `saa` area */
245   uint8_t      kaan;          /**< Position of next free `KVBLK` element in the `kaa` area */
246   int8_t       nlvl;          /**< Level of new inserted/deleted `SBLK` node. -1 if no new node inserted/deleted */
247   IWKV_PUT_HANDLER ph;        /**< Optional put handler */
248   void    *phop;              /**< Put handler opaque data */
249   SBLK    *plower[SLEVELS];   /**< Pinned lower nodes per level */
250   SBLK    *pupper[SLEVELS];   /**< Pinned upper nodes per level */
251   IWKV_val ekey;
252   SBLK     dblk;              /**< First database block */
253   SBLK     saa[AANUM];        /**< `SBLK` allocation area */
254   KVBLK    kaa[AANUM];        /**< `KVBLK` allocation area */
255   uint8_t  nbuf[IW_VNUMBUFSZ];
256   uint8_t  incbuf[8];         /**< Buffer used to store incremented/decremented values `IWKV_VAL_INCREMENT` opflag */
257 } IWLCTX;
258 
259 /** Cursor context */
260 struct _IWKV_cursor {
261   uint8_t cnpos;              /**< Position in the current `SBLK` node */
262   bool    closed;             /**< Cursor closed */
263   int8_t  skip_next;          /**< When to skip next IWKV_CURSOR_NEXT|IWKV_CURSOR_PREV cursor move
264                                    due to the side effect of `iwkv_cursor_del()` call.
265                                    If `skip_next > 0` `IWKV_CURSOR_NEXT` will be skipped
266                                    If `skip_next < 0` `IWKV_CURSOR_PREV` will be skipped */
267   SBLK *cn;                   /**< Current `SBLK` node */
268   struct _IWKV_cursor *next;  /**< Next cursor in active db cursors chain */
269   off_t  dbaddr;              /**< Database address used as `cn` */
270   IWLCTX lx;                  /**< Lookup context */
271 };
272 
273 #define ENSURE_OPEN(iwkv_) \
274   if (!(iwkv_) || !((iwkv_)->open)) return IW_ERROR_INVALID_STATE; \
275   if ((iwkv_)->fatalrc) return (iwkv_)->fatalrc
276 
277 #define ENSURE_OPEN_DB(db_) \
278   if (!(db_) || !(db_)->iwkv || !(db_)->open || !((db_)->iwkv->open)) return IW_ERROR_INVALID_STATE
279 
280 #define API_RLOCK(iwkv_, rci_) \
281   ENSURE_OPEN(iwkv_);  \
282   (rci_) = pthread_rwlock_rdlock(&(iwkv_)->rwl); \
283   if (rci_) return iwrc_set_errno(IW_ERROR_THREADING_ERRNO, rci_)
284 
_api_rlock(IWKV iwkv)285 IW_INLINE iwrc _api_rlock(IWKV iwkv) {
286   int rci;
287   API_RLOCK(iwkv, rci);
288   return 0;
289 }
290 
291 #define API_WLOCK(iwkv_, rci_) \
292   ENSURE_OPEN(iwkv_);  \
293   (rci_) = pthread_rwlock_wrlock(&(iwkv_)->rwl); \
294   if (rci_) return iwrc_set_errno(IW_ERROR_THREADING_ERRNO, rci_)
295 
_api_wlock(IWKV iwkv)296 IW_INLINE iwrc _api_wlock(IWKV iwkv) {
297   int rci;
298   API_WLOCK(iwkv, rci);
299   return 0;
300 }
301 
302 #define API_UNLOCK(iwkv_, rci_, rc_)  \
303   rci_ = pthread_rwlock_unlock(&(iwkv_)->rwl); \
304   if (rci_) IWRC(iwrc_set_errno(IW_ERROR_THREADING_ERRNO, rci_), rc_)
305 
306 #define API_DB_RLOCK(db_, rci_)                               \
307   do {                                                        \
308     API_RLOCK((db_)->iwkv, rci_);                             \
309     (rci_) = pthread_rwlock_rdlock(&(db_)->rwl);                \
310     if (rci_) {                                               \
311       pthread_rwlock_unlock(&(db_)->iwkv->rwl);               \
312       return iwrc_set_errno(IW_ERROR_THREADING_ERRNO, rci_);  \
313     }                                                         \
314   } while (0)
315 
_api_db_rlock(IWDB db)316 IW_INLINE iwrc _api_db_rlock(IWDB db) {
317   int rci;
318   API_DB_RLOCK(db, rci);
319   return 0;
320 }
321 
322 #define API_DB_WLOCK(db_, rci_)                               \
323   do {                                                        \
324     API_RLOCK((db_)->iwkv, rci_);                             \
325     (rci_) = pthread_rwlock_wrlock(&(db_)->rwl);                \
326     if (rci_) {                                               \
327       pthread_rwlock_unlock(&(db_)->iwkv->rwl);               \
328       return iwrc_set_errno(IW_ERROR_THREADING_ERRNO, rci_);  \
329     }                                                         \
330   } while (0)
331 
_api_db_wlock(IWDB db)332 IW_INLINE iwrc _api_db_wlock(IWDB db) {
333   int rci;
334   API_DB_WLOCK(db, rci);
335   return 0;
336 }
337 
338 #define API_DB_UNLOCK(db_, rci_, rc_)                                     \
339   do {                                                                    \
340     (rci_) = pthread_rwlock_unlock(&(db_)->rwl);                            \
341     if (rci_) IWRC(iwrc_set_errno(IW_ERROR_THREADING_ERRNO, rci_), rc_);  \
342     API_UNLOCK((db_)->iwkv, rci_, rc_);                                   \
343   } while (0)
344 
345 #define AAPOS_INC(aan_)         \
346   do {                          \
347     if ((aan_) < AANUM - 1) {   \
348       (aan_) = (aan_) + 1;      \
349     } else {                    \
350       (aan_) = 0;               \
351     }                           \
352   } while (0)
353 
354 
355 // SBLK
356 // [flags:u1,lvl:u1,lkl:u1,pnum:u1,p0:u4,kblk:u4,pi:u1[32],n:u4[24],bpos:u1,lk:u115]:u256
357 
358 #define SOFF_FLAGS_U1   0
359 #define SOFF_LVL_U1     (SOFF_FLAGS_U1 + 1)
360 #define SOFF_LKL_U1     (SOFF_LVL_U1 + 1)
361 #define SOFF_PNUM_U1    (SOFF_LKL_U1 + 1)
362 #define SOFF_P0_U4      (SOFF_PNUM_U1 + 1)
363 #define SOFF_KBLK_U4    (SOFF_P0_U4 + 4)
364 #define SOFF_PI0_U1     (SOFF_KBLK_U4 + 4)
365 #define SOFF_N0_U4      (SOFF_PI0_U1 + 1 * KVBLK_IDXNUM)
366 #define SOFF_BPOS_U1_V2 (SOFF_N0_U4 + 4 * SLEVELS)
367 #define SOFF_LK_V2      (SOFF_BPOS_U1_V2 + 1)
368 #define SOFF_LK_V1      (SOFF_N0_U4 + 4 * SLEVELS)
369 #define SOFF_END        (SOFF_LK_V2 + SBLK_LKLEN)
370 static_assert(SOFF_END == 256, "SOFF_END == 256");
371 static_assert(SBLK_SZ >= SOFF_END, "SBLK_SZ >= SOFF_END");
372 
373 // DB
374 // [magic:u4,dbflg:u1,dbid:u4,next_db_blk:u4,p0:u4,n[24]:u4,c[24]:u4,meta_blk:u4,meta_blkn:u4]:217
375 #define DOFF_MAGIC_U4    0
376 #define DOFF_DBFLG_U1    (DOFF_MAGIC_U4 + 4)
377 #define DOFF_DBID_U4     (DOFF_DBFLG_U1 + 1)
378 #define DOFF_NEXTDB_U4   (DOFF_DBID_U4 + 4)
379 #define DOFF_P0_U4       (DOFF_NEXTDB_U4 + 4)
380 #define DOFF_N0_U4       (DOFF_P0_U4 + 4)
381 #define DOFF_C0_U4       (DOFF_N0_U4 + 4 * SLEVELS)
382 #define DOFF_METABLK_U4  (DOFF_C0_U4 + 4 * SLEVELS)
383 #define DOFF_METABLKN_U4 (DOFF_METABLK_U4 + 4)
384 #define DOFF_END         (DOFF_METABLKN_U4 + 4)
385 static_assert(DOFF_END == 217, "DOFF_END == 217");
386 static_assert(DB_SZ >= DOFF_END, "DB_SZ >= DOFF_END");
387 
388 // KVBLK
389 // [szpow:u1,idxsz:u2,[ps1:vn,pl1:vn,...,ps32,pl32]____[[_KV],...]] // KVBLK
390 #define KBLK_SZPOW_OFF 0
391 
392 
393 iwrc iwkv_exclusive_lock(IWKV iwkv);
394 iwrc iwkv_exclusive_unlock(IWKV iwkv);
395 void iwkvd_trigger_xor(uint64_t val);
396 void iwkvd_kvblk(FILE *f, KVBLK *kb, int maxvlen);
397 iwrc iwkvd_sblk(FILE *f, IWLCTX *lx, SBLK *sb, int flags);
398 void iwkvd_db(FILE *f, IWDB db, int flags, int plvl);
399 
400 // IWKVD Trigger commands
401 #ifdef IW_TESTS
402 #define IWKVD_WAL_NO_CHECKPOINT_ON_CLOSE 1UL
403 #endif
404 
405 #endif
406