/*
 * In-kernel transcendent memory (generic implementation)
 *
 * Copyright (c) 2009-2012, Dan Magenheimer, Oracle Corp.
 *
 * The primary purpose of Transcendent Memory ("tmem") is to map object-oriented
 * "handles" (triples containing a pool id, an object id, and an index) to
 * pages in a page-accessible memory (PAM).  Tmem references the PAM pages via
 * an abstract "pampd" (PAM page-descriptor), which can be operated on by a
 * set of functions (pamops).  Each pampd contains some representation of
 * PAGE_SIZE bytes worth of data. For those familiar with key-value stores,
 * the tmem handle is a three-level hierarchical key, and the value is always
 * reconstituted (but not necessarily stored) as PAGE_SIZE bytes and is
 * referenced in the datastore by the pampd.  The hierarchy is required
 * to ensure that certain invalidation functions can be performed efficiently
 * (i.e. flush all indexes associated with this object_id, or
 * flush all objects associated with this pool).
 *
 * Tmem must support potentially millions of pages and must be able to insert,
 * find, and delete these pages at a potential frequency of thousands per
 * second concurrently across many CPUs (and, if used with KVM, across many
 * vcpus across many guests).  Tmem is tracked with a hierarchy of data
 * structures, organized by the elements in the handle-tuple: pool_id,
 * object_id, and page index.  One or more "clients" (e.g. guests) each
 * provide one or more tmem_pools.  Each pool contains a hash table of
 * rb_trees of tmem_objs.  Each tmem_obj contains a radix-tree-like tree
 * of pointers, with intermediate nodes called tmem_objnodes.  Each leaf
 * pointer in this tree points to a pampd, which is accessible only through
 * a small set of callbacks registered by the PAM implementation (see
 * tmem_register_pamops).  Tmem only needs memory allocation for objs
 * and objnodes, and this is done via a set of callbacks that must be
 * registered by the tmem host implementation (e.g. see tmem_register_hostops).
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/export.h>
#if defined(CONFIG_RAMSTER) || defined(CONFIG_RAMSTER_MODULE)
#include <linux/delay.h>
#endif

#include "tmem.h"

/* data structure sentinels used for debugging... see tmem.h */
#define POOL_SENTINEL 0x87658765
#define OBJ_SENTINEL 0x12345678
#define OBJNODE_SENTINEL 0xfedcba09

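/*
 * Illustrative sketch only (not compiled): the three-level key described in
 * the header comment above, and what each level selects.  The struct name
 * and field names here are hypothetical, chosen only to mirror how the
 * handle elements are used throughout this file; the real definitions live
 * in tmem.h.
 */
#if 0
struct example_tmem_handle {
	uint32_t pool_id;	/* selects a struct tmem_pool owned by a client */
	struct tmem_oid oid;	/* selects a struct tmem_obj via hashbucket + rb_tree */
	uint32_t index;		/* selects a pampd leaf in the obj's objnode tree */
};
/* The value a handle refers to is PAGE_SIZE bytes of data, held via a pampd. */
#endif
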
/*
 * A tmem host implementation must use this function to register callbacks
 * for memory allocation.
 */
static struct tmem_hostops tmem_hostops;

static void tmem_objnode_tree_init(void);

void tmem_register_hostops(struct tmem_hostops *m)
{
	tmem_objnode_tree_init();
	tmem_hostops = *m;
}

/*
 * A tmem host implementation must use this function to register
 * callbacks for a page-accessible memory (PAM) implementation.
 */
static struct tmem_pamops tmem_pamops;

void tmem_register_pamops(struct tmem_pamops *m)
{
	tmem_pamops = *m;
}

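/*
 * Illustrative sketch only (not compiled): how a hypothetical tmem host
 * might provide the allocation callbacks.  The callback names match the way
 * tmem_hostops is invoked elsewhere in this file; the kmem_cache-backed
 * bodies, the "example_" identifiers, and the exact struct tmem_hostops
 * layout (see tmem.h) are assumptions made for this sketch.  GFP_ATOMIC is
 * used because obj_alloc can be called with a hashbucket spinlock held.
 */
#if 0
static struct kmem_cache *example_obj_cache;		/* hypothetical caches */
static struct kmem_cache *example_objnode_cache;	/* (needs <linux/slab.h>) */

static struct tmem_obj *example_obj_alloc(struct tmem_pool *pool)
{
	return kmem_cache_alloc(example_obj_cache, GFP_ATOMIC);
}

static void example_obj_free(struct tmem_obj *obj, struct tmem_pool *pool)
{
	kmem_cache_free(example_obj_cache, obj);
}

static struct tmem_objnode *example_objnode_alloc(struct tmem_pool *pool)
{
	return kmem_cache_alloc(example_objnode_cache, GFP_ATOMIC);
}

static void example_objnode_free(struct tmem_objnode *objnode,
				 struct tmem_pool *pool)
{
	kmem_cache_free(example_objnode_cache, objnode);
}

static struct tmem_hostops example_hostops = {
	.obj_alloc = example_obj_alloc,
	.obj_free = example_obj_free,
	.objnode_alloc = example_objnode_alloc,
	.objnode_free = example_objnode_free,
};

/* At init time the host would call tmem_register_hostops(&example_hostops)
 * and register its pamops the same way via tmem_register_pamops().
 */
#endif
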
/*
 * Oids are potentially very sparse and tmem_objs may have an indeterminately
 * short life, being added and deleted at a relatively high frequency.
 * So an rb_tree is an ideal data structure to manage tmem_objs.  But because
 * of the potentially huge number of tmem_objs, each pool manages a hashtable
 * of rb_trees to reduce search, insert, delete, and rebalancing time.
 * Each hashbucket also has a lock to manage concurrent access and no
 * searches, inserts, or deletions can be performed unless the lock is held.
 * As a result, care must be taken to ensure tmem routines are not called
 * recursively; a recursive call may appear to work most of the time, but
 * it will occasionally deadlock on the hashbucket lock.
 *
 * The following routines manage tmem_objs.  In all of these routines,
 * the hashbucket lock is already held.
 */

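/*
 * Illustrative sketch only (not compiled): the locking pattern that the
 * tmem_obj routines below assume.  Callers pick the hashbucket by hashing
 * the oid, take that bucket's lock, and only then search or modify the
 * bucket's rb_tree; the "example_" wrapper itself is hypothetical.
 */
#if 0
static void example_with_locked_obj(struct tmem_pool *pool,
				    struct tmem_oid *oidp)
{
	struct tmem_hashbucket *hb;
	struct tmem_obj *obj;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];	/* pick the bucket */
	spin_lock(&hb->lock);		/* do not re-enter tmem while held */
	obj = tmem_obj_find(hb, oidp);
	if (obj != NULL) {
		/* ... operate on obj and its objnode tree here ... */
	}
	spin_unlock(&hb->lock);
}
#endif
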
/* Search for object==oid in pool, returns object if found. */
static struct tmem_obj *__tmem_obj_find(struct tmem_hashbucket *hb,
					struct tmem_oid *oidp,
					struct rb_node **parent,
					struct rb_node ***link)
{
	struct rb_node *_parent = NULL, **rbnode;
	struct tmem_obj *obj = NULL;

	rbnode = &hb->obj_rb_root.rb_node;
	while (*rbnode) {
		BUG_ON(RB_EMPTY_NODE(*rbnode));
		_parent = *rbnode;
		obj = rb_entry(*rbnode, struct tmem_obj,
			       rb_tree_node);
		switch (tmem_oid_compare(oidp, &obj->oid)) {
		case 0: /* equal */
			goto out;
		case -1:
			rbnode = &(*rbnode)->rb_left;
			break;
		case 1:
			rbnode = &(*rbnode)->rb_right;
			break;
		}
	}

	if (parent)
		*parent = _parent;
	if (link)
		*link = rbnode;
	obj = NULL;
out:
	return obj;
}

static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
					struct tmem_oid *oidp)
{
	return __tmem_obj_find(hb, oidp, NULL, NULL);
}

static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *, bool);

/* Free an object that has no more pampds in it. */
static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
{
	struct tmem_pool *pool;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pampd_count > 0);
	pool = obj->pool;
	BUG_ON(pool == NULL);
	if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */
		tmem_pampd_destroy_all_in_obj(obj, false);
	BUG_ON(obj->objnode_tree_root != NULL);
	BUG_ON((long)obj->objnode_count != 0);
	atomic_dec(&pool->obj_count);
	BUG_ON(atomic_read(&pool->obj_count) < 0);
	INVERT_SENTINEL(obj, OBJ);
	obj->pool = NULL;
	tmem_oid_set_invalid(&obj->oid);
	rb_erase(&obj->rb_tree_node, &hb->obj_rb_root);
}

/*
 * Initialize and insert a tmem_object_root (called only if find failed).
 */
static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
					struct tmem_pool *pool,
					struct tmem_oid *oidp)
{
	struct rb_root *root = &hb->obj_rb_root;
	struct rb_node **new = NULL, *parent = NULL;

	BUG_ON(pool == NULL);
	atomic_inc(&pool->obj_count);
	obj->objnode_tree_height = 0;
	obj->objnode_tree_root = NULL;
	obj->pool = pool;
	obj->oid = *oidp;
	obj->objnode_count = 0;
	obj->pampd_count = 0;
#ifdef CONFIG_RAMSTER
	if (tmem_pamops.new_obj != NULL)
		(*tmem_pamops.new_obj)(obj);
#endif
	SET_SENTINEL(obj, OBJ);

	if (__tmem_obj_find(hb, oidp, &parent, &new))
		BUG();

	rb_link_node(&obj->rb_tree_node, parent, new);
	rb_insert_color(&obj->rb_tree_node, root);
}

/*
 * Tmem is managed as a set of tmem_pools with certain attributes, such as
 * "ephemeral" vs "persistent".  These attributes apply to all tmem_objs
 * and all pampds that belong to a tmem_pool.  A tmem_pool is created
 * or deleted relatively rarely (for example, when a filesystem is
 * mounted or unmounted).
 */

/* flush all data from a pool and, optionally, free it */
static void tmem_pool_flush(struct tmem_pool *pool, bool destroy)
{
	struct rb_node *rbnode;
	struct tmem_obj *obj;
	struct tmem_hashbucket *hb = &pool->hashbucket[0];
	int i;

	BUG_ON(pool == NULL);
	for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
		spin_lock(&hb->lock);
		rbnode = rb_first(&hb->obj_rb_root);
		while (rbnode != NULL) {
			obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
			rbnode = rb_next(rbnode);
			tmem_pampd_destroy_all_in_obj(obj, true);
			tmem_obj_free(obj, hb);
			(*tmem_hostops.obj_free)(obj, pool);
		}
		spin_unlock(&hb->lock);
	}
	if (destroy)
		list_del(&pool->pool_list);
}

/*
 * A tmem_obj contains a radix-tree-like tree in which the intermediate
 * nodes are called tmem_objnodes.  (The kernel lib/radix-tree.c implementation
 * is very specialized and tuned for specific uses and is not particularly
 * suited for use from this code, though some code from the core algorithms has
 * been reused, thus the copyright notices below).  Each tmem_objnode contains
 * a set of pointers which point to either a set of intermediate tmem_objnodes
 * or a set of pampds.
 *
 * Portions Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Portions Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
 */

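/*
 * Illustrative sketch only (not compiled): how a page index is decomposed
 * into per-level slot offsets while walking the objnode tree, mirroring the
 * descent loops in __tmem_pampd_lookup_in_obj() and tmem_pampd_add_to_obj()
 * below.  The height == 0 case (the root pointer is itself the pampd for
 * index 0) and the bounds check against tmem_objnode_tree_h2max[] are
 * omitted for brevity; the "example_" name is hypothetical.
 */
#if 0
static void *example_walk(struct tmem_obj *obj, uint32_t index)
{
	unsigned int height = obj->objnode_tree_height;
	unsigned int shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
	struct tmem_objnode *node = obj->objnode_tree_root;

	while (node != NULL && height > 1) {
		/* each level consumes OBJNODE_TREE_MAP_SHIFT bits of index */
		node = node->slots[(index >> shift) & OBJNODE_TREE_MAP_MASK];
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	}
	/* at the last level the slot holds a pampd rather than an objnode */
	return node ? node->slots[index & OBJNODE_TREE_MAP_MASK] : NULL;
}
#endif
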
struct tmem_objnode_tree_path {
	struct tmem_objnode *objnode;
	int offset;
};

/* objnode height_to_maxindex translation */
static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1];

static void tmem_objnode_tree_init(void)
{
	unsigned int ht, tmp;

	for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) {
		tmp = ht * OBJNODE_TREE_MAP_SHIFT;
		if (tmp >= OBJNODE_TREE_INDEX_BITS)
			tmem_objnode_tree_h2max[ht] = ~0UL;
		else
			tmem_objnode_tree_h2max[ht] =
			    (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1;
	}
}

static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj)
{
	struct tmem_objnode *objnode;

	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);
	objnode = (*tmem_hostops.objnode_alloc)(obj->pool);
	if (unlikely(objnode == NULL))
		goto out;
	objnode->obj = obj;
	SET_SENTINEL(objnode, OBJNODE);
	memset(&objnode->slots, 0, sizeof(objnode->slots));
	objnode->slots_in_use = 0;
	obj->objnode_count++;
out:
	return objnode;
}

static void tmem_objnode_free(struct tmem_objnode *objnode)
{
	struct tmem_pool *pool;
	int i;

	BUG_ON(objnode == NULL);
	for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++)
		BUG_ON(objnode->slots[i] != NULL);
	ASSERT_SENTINEL(objnode, OBJNODE);
	INVERT_SENTINEL(objnode, OBJNODE);
	BUG_ON(objnode->obj == NULL);
	ASSERT_SENTINEL(objnode->obj, OBJ);
	pool = objnode->obj->pool;
	BUG_ON(pool == NULL);
	ASSERT_SENTINEL(pool, POOL);
	objnode->obj->objnode_count--;
	objnode->obj = NULL;
	(*tmem_hostops.objnode_free)(objnode, pool);
}

/*
 * Lookup index in object and return associated pampd (or NULL if not found).
 */
static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
	unsigned int height, shift;
	struct tmem_objnode **slot = NULL;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);

	height = obj->objnode_tree_height;
	if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height])
		goto out;
	if (height == 0 && obj->objnode_tree_root) {
		slot = &obj->objnode_tree_root;
		goto out;
	}
	shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
	slot = &obj->objnode_tree_root;
	while (height > 0) {
		if (*slot == NULL)
			goto out;
		slot = (struct tmem_objnode **)
			((*slot)->slots +
			 ((index >> shift) & OBJNODE_TREE_MAP_MASK));
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	}
out:
	return slot != NULL ? (void **)slot : NULL;
}

static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
	struct tmem_objnode **slot;

	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
	return slot != NULL ? *slot : NULL;
}

#ifdef CONFIG_RAMSTER
static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
					void *new_pampd, bool no_free)
{
	struct tmem_objnode **slot;
	void *ret = NULL;

	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
	if ((slot != NULL) && (*slot != NULL)) {
		void *old_pampd = *(void **)slot;
		*(void **)slot = new_pampd;
		if (!no_free)
			(*tmem_pamops.free)(old_pampd, obj->pool,
						NULL, 0, false);
		ret = new_pampd;
	}
	return ret;
}
#endif

static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
					void *pampd)
{
	int ret = 0;
	struct tmem_objnode *objnode = NULL, *newnode, *slot;
	unsigned int height, shift;
	int offset = 0;

	/* if necessary, extend the tree to be higher */
	if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) {
		height = obj->objnode_tree_height + 1;
		if (index > tmem_objnode_tree_h2max[height])
			while (index > tmem_objnode_tree_h2max[height])
				height++;
		if (obj->objnode_tree_root == NULL) {
			obj->objnode_tree_height = height;
			goto insert;
		}
		do {
			newnode = tmem_objnode_alloc(obj);
			if (!newnode) {
				ret = -ENOMEM;
				goto out;
			}
			newnode->slots[0] = obj->objnode_tree_root;
			newnode->slots_in_use = 1;
			obj->objnode_tree_root = newnode;
			obj->objnode_tree_height++;
		} while (height > obj->objnode_tree_height);
	}
insert:
	slot = obj->objnode_tree_root;
	height = obj->objnode_tree_height;
	shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
	while (height > 0) {
		if (slot == NULL) {
			/* add a child objnode */
			slot = tmem_objnode_alloc(obj);
			if (!slot) {
				ret = -ENOMEM;
				goto out;
			}
			if (objnode) {
				objnode->slots[offset] = slot;
				objnode->slots_in_use++;
			} else
				obj->objnode_tree_root = slot;
		}
		/* go down a level */
		offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
		objnode = slot;
		slot = objnode->slots[offset];
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	}
	BUG_ON(slot != NULL);
	if (objnode) {
		objnode->slots_in_use++;
		objnode->slots[offset] = pampd;
	} else
		obj->objnode_tree_root = pampd;
	obj->pampd_count++;
out:
	return ret;
}

static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index)
{
	struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1];
	struct tmem_objnode_tree_path *pathp = path;
	struct tmem_objnode *slot = NULL;
	unsigned int height, shift;
	int offset;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);
	height = obj->objnode_tree_height;
	if (index > tmem_objnode_tree_h2max[height])
		goto out;
	slot = obj->objnode_tree_root;
	if (height == 0 && obj->objnode_tree_root) {
		obj->objnode_tree_root = NULL;
		goto out;
	}
	shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
	pathp->objnode = NULL;
	do {
		if (slot == NULL)
			goto out;
		pathp++;
		offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
		pathp->offset = offset;
		pathp->objnode = slot;
		slot = slot->slots[offset];
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	} while (height > 0);
	if (slot == NULL)
		goto out;
	while (pathp->objnode) {
		pathp->objnode->slots[pathp->offset] = NULL;
		pathp->objnode->slots_in_use--;
		if (pathp->objnode->slots_in_use) {
			if (pathp->objnode == obj->objnode_tree_root) {
				while (obj->objnode_tree_height > 0 &&
				  obj->objnode_tree_root->slots_in_use == 1 &&
				  obj->objnode_tree_root->slots[0]) {
					struct tmem_objnode *to_free =
						obj->objnode_tree_root;

					obj->objnode_tree_root =
							to_free->slots[0];
					obj->objnode_tree_height--;
					to_free->slots[0] = NULL;
					to_free->slots_in_use = 0;
					tmem_objnode_free(to_free);
				}
			}
			goto out;
		}
		tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */
		pathp--;
	}
	obj->objnode_tree_height = 0;
	obj->objnode_tree_root = NULL;

out:
	if (slot != NULL)
		obj->pampd_count--;
	BUG_ON(obj->pampd_count < 0);
	return slot;
}

/* Recursively walk the objnode_tree destroying pampds and objnodes. */
static void tmem_objnode_node_destroy(struct tmem_obj *obj,
					struct tmem_objnode *objnode,
					unsigned int ht)
{
	int i;

	if (ht == 0)
		return;
	for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) {
		if (objnode->slots[i]) {
			if (ht == 1) {
				obj->pampd_count--;
				(*tmem_pamops.free)(objnode->slots[i],
						obj->pool, NULL, 0, true);
				objnode->slots[i] = NULL;
				continue;
			}
			tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1);
			tmem_objnode_free(objnode->slots[i]);
			objnode->slots[i] = NULL;
		}
	}
}

static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj,
						bool pool_destroy)
{
	if (obj->objnode_tree_root == NULL)
		return;
	if (obj->objnode_tree_height == 0) {
		obj->pampd_count--;
		(*tmem_pamops.free)(obj->objnode_tree_root,
					obj->pool, NULL, 0, true);
	} else {
		tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
					obj->objnode_tree_height);
		tmem_objnode_free(obj->objnode_tree_root);
		obj->objnode_tree_height = 0;
	}
	obj->objnode_tree_root = NULL;
#ifdef CONFIG_RAMSTER
	if (tmem_pamops.free_obj != NULL)
		(*tmem_pamops.free_obj)(obj->pool, obj, pool_destroy);
#endif
}

/*
 * Tmem is operated on by a set of well-defined actions:
 * "put", "get", "flush", "flush_object", "new pool" and "destroy pool".
 * (The tmem ABI allows for subpages and exchanges but these operations
 * are not included in this implementation.)
 *
 * These "tmem core" operations are implemented in the following functions.
 */

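/*
 * Illustrative sketch only (not compiled): the shape of a typical call
 * sequence from a tmem host.  The "example_" wrapper is hypothetical, and
 * the pampd argument is assumed to come from the host's PAM implementation;
 * the tmem_* calls themselves use the signatures defined below.
 */
#if 0
static void example_core_ops(struct tmem_pool *pool, struct tmem_oid *oidp,
			     void *pampd, char *data, size_t size)
{
	/* associate PAGE_SIZE worth of data with handle (pool, oid, index 0) */
	(void)tmem_put(pool, oidp, 0, false, pampd);

	/* recreate the data; an ephemeral pool also flushes it on success */
	(void)tmem_get(pool, oidp, 0, data, &size, false, 0);

	/* invalidate one page, then everything under this oid */
	(void)tmem_flush_page(pool, oidp, 0);
	(void)tmem_flush_object(pool, oidp);
}
#endif
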
/*
 * "Put" a page, e.g. associate the passed pampd with the passed handle.
 * Tmem_put is complicated by a corner case: What if a page with matching
 * handle already exists in tmem?  To guarantee coherency, one of two
 * actions is necessary: Either the data for the page must be overwritten,
 * or the page must be "flushed" so that the data is not accessible to a
 * subsequent "get".  Since these "duplicate puts" are relatively rare,
 * this implementation always flushes for simplicity.
 */
int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
		bool raw, void *pampd_to_use)
{
	struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
	void *pampd = NULL, *pampd_del = NULL;
	int ret = -ENOMEM;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = objfound = tmem_obj_find(hb, oidp);
	if (obj != NULL) {
		pampd = tmem_pampd_lookup_in_obj(objfound, index);
		if (pampd != NULL) {
			/* if found, is a dup put, flush the old one */
			pampd_del = tmem_pampd_delete_from_obj(obj, index);
			BUG_ON(pampd_del != pampd);
			(*tmem_pamops.free)(pampd, pool, oidp, index, true);
			if (obj->pampd_count == 0) {
				objnew = obj;
				objfound = NULL;
			}
			pampd = NULL;
		}
	} else {
		obj = objnew = (*tmem_hostops.obj_alloc)(pool);
		if (unlikely(obj == NULL)) {
			ret = -ENOMEM;
			goto out;
		}
		tmem_obj_init(obj, hb, pool, oidp);
	}
	BUG_ON(obj == NULL);
	BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
	pampd = pampd_to_use;
	BUG_ON(pampd_to_use == NULL);
	ret = tmem_pampd_add_to_obj(obj, index, pampd);
	if (unlikely(ret == -ENOMEM))
		/* may have partially built objnode tree ("stump") */
		goto delete_and_free;
	(*tmem_pamops.create_finish)(pampd, is_ephemeral(pool));
	goto out;

delete_and_free:
	(void)tmem_pampd_delete_from_obj(obj, index);
	if (pampd)
		(*tmem_pamops.free)(pampd, pool, NULL, 0, true);
	if (objnew) {
		tmem_obj_free(objnew, hb);
		(*tmem_hostops.obj_free)(objnew, pool);
	}
out:
	spin_unlock(&hb->lock);
	return ret;
}

#ifdef CONFIG_RAMSTER
/*
 * For ramster only:  The following routines provide a two-step sequence
 * to allow the caller to replace a pampd in the tmem data structures with
 * another pampd. Here, we lookup the passed handle and, if found, return the
 * associated pampd and object, leaving the hashbucket locked and returning
 * a reference to it.  The caller is expected to immediately call the
 * matching tmem_localify_finish routine, which handles the replacement
 * and unlocks the hashbucket.
 */
void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp,
				uint32_t index, struct tmem_obj **ret_obj,
				void **saved_hb)
{
	struct tmem_hashbucket *hb;
	struct tmem_obj *obj = NULL;
	void *pampd = NULL;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (likely(obj != NULL))
		pampd = tmem_pampd_lookup_in_obj(obj, index);
	*ret_obj = obj;
	*saved_hb = (void *)hb;
	/* note, hashbucket remains locked */
	return pampd;
}
EXPORT_SYMBOL_GPL(tmem_localify_get_pampd);

void tmem_localify_finish(struct tmem_obj *obj, uint32_t index,
			  void *pampd, void *saved_hb, bool delete)
{
	struct tmem_hashbucket *hb = (struct tmem_hashbucket *)saved_hb;

	BUG_ON(!spin_is_locked(&hb->lock));
	if (pampd != NULL) {
		BUG_ON(obj == NULL);
		(void)tmem_pampd_replace_in_obj(obj, index, pampd, 1);
		(*tmem_pamops.create_finish)(pampd, is_ephemeral(obj->pool));
	} else if (delete) {
		BUG_ON(obj == NULL);
		(void)tmem_pampd_delete_from_obj(obj, index);
	}
	spin_unlock(&hb->lock);
}
EXPORT_SYMBOL_GPL(tmem_localify_finish);
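
/*
 * Illustrative sketch only (not compiled): the two-step localify sequence
 * described above.  The hashbucket remains locked between the two calls, so
 * the caller must not sleep in between; the "example_" wrapper and the way
 * new_pampd is produced are hypothetical.
 */
#if 0
static void example_localify(struct tmem_pool *pool, struct tmem_oid *oidp,
			     uint32_t index, void *new_pampd)
{
	struct tmem_obj *obj;
	void *saved_hb;

	(void)tmem_localify_get_pampd(pool, oidp, index, &obj, &saved_hb);
	/* ... fill in new_pampd from remotified data, without sleeping ... */
	if (new_pampd != NULL)
		tmem_localify_finish(obj, index, new_pampd, saved_hb, false);
	else	/* nothing to install, so drop the stale entry instead */
		tmem_localify_finish(obj, index, NULL, saved_hb, true);
}
#endif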

/*
 * For ramster only.  Helper function to support asynchronous tmem_get.
 */
static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb,
				struct tmem_pool *pool, struct tmem_oid *oidp,
				uint32_t index, bool free, char *data)
{
	void *old_pampd = *ppampd, *new_pampd = NULL;
	bool intransit = false;
	int ret = 0;

	if (!is_ephemeral(pool))
		new_pampd = (*tmem_pamops.repatriate_preload)(
				old_pampd, pool, oidp, index, &intransit);
	if (intransit)
		ret = -EAGAIN;
	else if (new_pampd != NULL)
		*ppampd = new_pampd;
	/* must release the hb->lock else repatriate can't sleep */
	spin_unlock(&hb->lock);
	if (!intransit)
		ret = (*tmem_pamops.repatriate)(old_pampd, new_pampd, pool,
						oidp, index, free, data);
	if (ret == -EAGAIN) {
		/* rare I think, but should cond_resched()??? */
		usleep_range(10, 1000);
	} else if (ret == -ENOTCONN || ret == -EHOSTDOWN) {
		ret = -1;
	} else if (ret != 0 && ret != -ENOENT) {
		ret = -1;
	}
	/* note hb->lock has now been unlocked */
	return ret;
}

/*
 * For ramster only.  If a page in tmem matches the handle, replace the
 * page so that any subsequent "get" gets the new page.  Returns 0 if
 * there was a page to replace, else returns -1.
 */
int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
			uint32_t index, void *new_pampd)
{
	struct tmem_obj *obj;
	int ret = -1;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0);
	/* if we bug here, pamops wasn't properly set up for ramster */
	BUG_ON(tmem_pamops.replace_in_obj == NULL);
	ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
out:
	spin_unlock(&hb->lock);
	return ret;
}
EXPORT_SYMBOL_GPL(tmem_replace);
#endif

/*
 * "Get" a page, e.g. if a pampd can be found matching the passed handle,
 * use a pamops callback to recreate the page from the pampd with the
 * matching handle.  By tmem definition, when a "get" is successful on
 * an ephemeral page, the page is "flushed", and when a "get" is successful
 * on a persistent page, the page is retained in tmem.  Note that to preserve
 * coherency, "get" can never be skipped if tmem contains the data.
 * That is, if a get is done with a certain handle and fails, any
 * subsequent "get" must also fail (unless of course there is a
 * "put" done with the same handle).
 */
int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
		char *data, size_t *sizep, bool raw, int get_and_free)
{
	struct tmem_obj *obj;
	void *pampd = NULL;
	bool ephemeral = is_ephemeral(pool);
	int ret = -1;
	struct tmem_hashbucket *hb;
	bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
	bool lock_held = false;
	void **ppampd;

	do {
		hb = &pool->hashbucket[tmem_oid_hash(oidp)];
		spin_lock(&hb->lock);
		lock_held = true;
		obj = tmem_obj_find(hb, oidp);
		if (obj == NULL)
			goto out;
		ppampd = __tmem_pampd_lookup_in_obj(obj, index);
		if (ppampd == NULL)
			goto out;
#ifdef CONFIG_RAMSTER
		if ((tmem_pamops.is_remote != NULL) &&
		     tmem_pamops.is_remote(*ppampd)) {
			ret = tmem_repatriate(ppampd, hb, pool, oidp,
						index, free, data);
			/* tmem_repatriate releases hb->lock */
			lock_held = false;
			*sizep = PAGE_SIZE;
			if (ret != -EAGAIN)
				goto out;
		}
#endif
	} while (ret == -EAGAIN);
	if (free)
		pampd = tmem_pampd_delete_from_obj(obj, index);
	else
		pampd = tmem_pampd_lookup_in_obj(obj, index);
	if (pampd == NULL)
		goto out;
	if (free) {
		if (obj->pampd_count == 0) {
			tmem_obj_free(obj, hb);
			(*tmem_hostops.obj_free)(obj, pool);
			obj = NULL;
		}
	}
	if (free)
		ret = (*tmem_pamops.get_data_and_free)(
				data, sizep, raw, pampd, pool, oidp, index);
	else
		ret = (*tmem_pamops.get_data)(
				data, sizep, raw, pampd, pool, oidp, index);
	if (ret < 0)
		goto out;
	ret = 0;
out:
	if (lock_held)
		spin_unlock(&hb->lock);
	return ret;
}

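/*
 * Illustrative sketch only (not compiled, shown as a fragment of three
 * alternative calls): how a caller selects the get_and_free behavior of
 * tmem_get().  Per the "free" computation above, 0 means "follow the pool
 * type" (ephemeral pages are freed, persistent pages are retained),
 * 1 always frees, and any other value (e.g. -1) always retains the page.
 */
#if 0
	/* default semantics: free the page only if the pool is ephemeral */
	ret = tmem_get(pool, oidp, index, data, &size, false, 0);
	/* always remove the page from tmem after copying it out */
	ret = tmem_get(pool, oidp, index, data, &size, false, 1);
	/* always leave the page in tmem, even in an ephemeral pool */
	ret = tmem_get(pool, oidp, index, data, &size, false, -1);
#endif
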
/*
 * If a page in tmem matches the handle, "flush" this page from tmem such
 * that any subsequent "get" does not succeed (unless, of course, there
 * was another "put" with the same handle).
 */
int tmem_flush_page(struct tmem_pool *pool,
				struct tmem_oid *oidp, uint32_t index)
{
	struct tmem_obj *obj;
	void *pampd;
	int ret = -1;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	pampd = tmem_pampd_delete_from_obj(obj, index);
	if (pampd == NULL)
		goto out;
	(*tmem_pamops.free)(pampd, pool, oidp, index, true);
	if (obj->pampd_count == 0) {
		tmem_obj_free(obj, hb);
		(*tmem_hostops.obj_free)(obj, pool);
	}
	ret = 0;

out:
	spin_unlock(&hb->lock);
	return ret;
}

/*
 * "Flush" all pages in tmem matching this oid.
 */
int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
{
	struct tmem_obj *obj;
	struct tmem_hashbucket *hb;
	int ret = -1;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	tmem_pampd_destroy_all_in_obj(obj, false);
	tmem_obj_free(obj, hb);
	(*tmem_hostops.obj_free)(obj, pool);
	ret = 0;

out:
	spin_unlock(&hb->lock);
	return ret;
}

/*
 * "Flush" all pages (and tmem_objs) from this tmem_pool and disable
 * all subsequent access to this tmem_pool.
 */
int tmem_destroy_pool(struct tmem_pool *pool)
{
	int ret = -1;

	if (pool == NULL)
		goto out;
	tmem_pool_flush(pool, 1);
	ret = 0;
out:
	return ret;
}

static LIST_HEAD(tmem_global_pool_list);

/*
 * Create a new tmem_pool with the provided flags.  The corresponding pool id
 * is assigned and tracked by the tmem host implementation.
 */
void tmem_new_pool(struct tmem_pool *pool, uint32_t flags)
{
	int persistent = flags & TMEM_POOL_PERSIST;
	int shared = flags & TMEM_POOL_SHARED;
	struct tmem_hashbucket *hb = &pool->hashbucket[0];
	int i;

	for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
		hb->obj_rb_root = RB_ROOT;
		spin_lock_init(&hb->lock);
	}
	INIT_LIST_HEAD(&pool->pool_list);
	atomic_set(&pool->obj_count, 0);
	SET_SENTINEL(pool, POOL);
	list_add_tail(&pool->pool_list, &tmem_global_pool_list);
	pool->persistent = persistent;
	pool->shared = shared;
}
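
/*
 * Illustrative sketch only (not compiled): a hypothetical host creating and
 * later destroying a persistent, non-shared pool.  The pool structure and
 * its pool id come from the host; TMEM_POOL_PERSIST and TMEM_POOL_SHARED
 * are defined in tmem.h.
 */
#if 0
static void example_pool_lifecycle(struct tmem_pool *pool)
{
	tmem_new_pool(pool, TMEM_POOL_PERSIST);
	/* ... the pool can now be used with tmem_put()/tmem_get() ... */
	/* when done (e.g. when the backing filesystem is unmounted): */
	tmem_destroy_pool(pool);
}
#endif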