• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * In-kernel transcendent memory (generic implementation)
3  *
4  * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
5  *
6  * The primary purpose of Transcendent Memory ("tmem") is to map object-oriented
7  * "handles" (triples containing a pool id, an object id, and an index), to
8  * pages in a page-accessible memory (PAM).  Tmem references the PAM pages via
9  * an abstract "pampd" (PAM page-descriptor), which can be operated on by a
10  * set of functions (pamops).  Each pampd contains some representation of
11  * PAGE_SIZE bytes worth of data. Tmem must support potentially millions of
12  * pages and must be able to insert, find, and delete these pages at a
13  * potential frequency of thousands per second concurrently across many CPUs,
14  * (and, if used with KVM, across many vcpus across many guests).
15  * Tmem is tracked with a hierarchy of data structures, organized by
16  * the elements in a handle-tuple: pool_id, object_id, and page index.
17  * One or more "clients" (e.g. guests) each provide one or more tmem_pools.
18  * Each pool contains a hash table of rb_trees of tmem_objs.  Each
19  * tmem_obj contains a radix-tree-like tree of pointers, with intermediate
20  * nodes called tmem_objnodes.  Each leaf pointer in this tree points to
21  * a pampd, which is accessible only through a small set of callbacks
22  * registered by the PAM implementation (see tmem_register_pamops). Tmem
23  * does all memory allocation via a set of callbacks registered by the tmem
24  * host implementation (e.g. see tmem_register_hostops).
25  */
26 
27 #include <linux/list.h>
28 #include <linux/spinlock.h>
29 #include <linux/atomic.h>
30 #include <linux/delay.h>
31 
32 #include "tmem.h"
33 
34 /* data structure sentinels used for debugging... see tmem.h */
35 #define POOL_SENTINEL 0x87658765
36 #define OBJ_SENTINEL 0x12345678
37 #define OBJNODE_SENTINEL 0xfedcba09
38 
39 /*
40  * A tmem host implementation must use this function to register callbacks
41  * for memory allocation.
42  */
43 static struct tmem_hostops tmem_hostops;
44 
45 static void tmem_objnode_tree_init(void);
46 
tmem_register_hostops(struct tmem_hostops * m)47 void tmem_register_hostops(struct tmem_hostops *m)
48 {
49 	tmem_objnode_tree_init();
50 	tmem_hostops = *m;
51 }
52 
53 /*
54  * A tmem host implementation must use this function to register
55  * callbacks for a page-accessible memory (PAM) implementation
56  */
57 static struct tmem_pamops tmem_pamops;
58 
tmem_register_pamops(struct tmem_pamops * m)59 void tmem_register_pamops(struct tmem_pamops *m)
60 {
61 	tmem_pamops = *m;
62 }
63 
64 /*
65  * Oid's are potentially very sparse and tmem_objs may have an indeterminately
66  * short life, being added and deleted at a relatively high frequency.
67  * So an rb_tree is an ideal data structure to manage tmem_objs.  But because
68  * of the potentially huge number of tmem_objs, each pool manages a hashtable
69  * of rb_trees to reduce search, insert, delete, and rebalancing time.
70  * Each hashbucket also has a lock to manage concurrent access.
71  *
72  * The following routines manage tmem_objs.  When any tmem_obj is accessed,
73  * the hashbucket lock must be held.
74  */
75 
76 /* searches for object==oid in pool, returns locked object if found */
tmem_obj_find(struct tmem_hashbucket * hb,struct tmem_oid * oidp)77 static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
78 					struct tmem_oid *oidp)
79 {
80 	struct rb_node *rbnode;
81 	struct tmem_obj *obj;
82 
83 	rbnode = hb->obj_rb_root.rb_node;
84 	while (rbnode) {
85 		BUG_ON(RB_EMPTY_NODE(rbnode));
86 		obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
87 		switch (tmem_oid_compare(oidp, &obj->oid)) {
88 		case 0: /* equal */
89 			goto out;
90 		case -1:
91 			rbnode = rbnode->rb_left;
92 			break;
93 		case 1:
94 			rbnode = rbnode->rb_right;
95 			break;
96 		}
97 	}
98 	obj = NULL;
99 out:
100 	return obj;
101 }
102 
103 static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *);
104 
105 /* free an object that has no more pampds in it */
tmem_obj_free(struct tmem_obj * obj,struct tmem_hashbucket * hb)106 static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
107 {
108 	struct tmem_pool *pool;
109 
110 	BUG_ON(obj == NULL);
111 	ASSERT_SENTINEL(obj, OBJ);
112 	BUG_ON(obj->pampd_count > 0);
113 	pool = obj->pool;
114 	BUG_ON(pool == NULL);
115 	if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */
116 		tmem_pampd_destroy_all_in_obj(obj);
117 	BUG_ON(obj->objnode_tree_root != NULL);
118 	BUG_ON((long)obj->objnode_count != 0);
119 	atomic_dec(&pool->obj_count);
120 	BUG_ON(atomic_read(&pool->obj_count) < 0);
121 	INVERT_SENTINEL(obj, OBJ);
122 	obj->pool = NULL;
123 	tmem_oid_set_invalid(&obj->oid);
124 	rb_erase(&obj->rb_tree_node, &hb->obj_rb_root);
125 }
126 
127 /*
128  * initialize and insert a tmem_obj (called only if find failed)
129  */
tmem_obj_init(struct tmem_obj * obj,struct tmem_hashbucket * hb,struct tmem_pool * pool,struct tmem_oid * oidp)130 static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
131 					struct tmem_pool *pool,
132 					struct tmem_oid *oidp)
133 {
134 	struct rb_root *root = &hb->obj_rb_root;
135 	struct rb_node **new = &(root->rb_node), *parent = NULL;
136 	struct tmem_obj *this;
137 
138 	BUG_ON(pool == NULL);
139 	atomic_inc(&pool->obj_count);
140 	obj->objnode_tree_height = 0;
141 	obj->objnode_tree_root = NULL;
142 	obj->pool = pool;
143 	obj->oid = *oidp;
144 	obj->objnode_count = 0;
145 	obj->pampd_count = 0;
146 	(*tmem_pamops.new_obj)(obj);
147 	SET_SENTINEL(obj, OBJ);
148 	while (*new) {
149 		BUG_ON(RB_EMPTY_NODE(*new));
150 		this = rb_entry(*new, struct tmem_obj, rb_tree_node);
151 		parent = *new;
152 		switch (tmem_oid_compare(oidp, &this->oid)) {
153 		case 0:
154 			BUG(); /* already present; should never happen! */
155 			break;
156 		case -1:
157 			new = &(*new)->rb_left;
158 			break;
159 		case 1:
160 			new = &(*new)->rb_right;
161 			break;
162 		}
163 	}
164 	rb_link_node(&obj->rb_tree_node, parent, new);
165 	rb_insert_color(&obj->rb_tree_node, root);
166 }
167 
168 /*
169  * Tmem is managed as a set of tmem_pools with certain attributes, such as
170  * "ephemeral" vs "persistent".  These attributes apply to all tmem_objs
171  * and all pampds that belong to a tmem_pool.  A tmem_pool is created
172  * or deleted relatively rarely (for example, when a filesystem is
173  * mounted or unmounted).
174  */
175 
176 /* flush all data from a pool and, optionally, free it */
/*
 * Destroy every object (and every pampd inside it) in all hashbuckets of
 * the pool.  Each bucket is processed under its own lock.
 *
 * @pool:    pool to empty; must not be NULL (BUG otherwise)
 * @destroy: when true, the pool is also unlinked from the list it was
 *           placed on via pool->pool_list
 *
 * The hashbucket pointer is derived from pool only after the NULL check,
 * so the check guards every use of pool.
 */
static void tmem_pool_flush(struct tmem_pool *pool, bool destroy)
{
	struct rb_node *rbnode;
	struct tmem_obj *obj;
	struct tmem_hashbucket *hb;
	int i;

	BUG_ON(pool == NULL);
	hb = &pool->hashbucket[0];
	for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
		spin_lock(&hb->lock);
		rbnode = rb_first(&hb->obj_rb_root);
		while (rbnode != NULL) {
			obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
			/* step first: obj is erased from the tree below */
			rbnode = rb_next(rbnode);
			tmem_pampd_destroy_all_in_obj(obj);
			tmem_obj_free(obj, hb);
			(*tmem_hostops.obj_free)(obj, pool);
		}
		spin_unlock(&hb->lock);
	}
	if (destroy)
		list_del(&pool->pool_list);
}
200 
201 /*
202  * A tmem_obj contains a radix-tree-like tree in which the intermediate
203  * nodes are called tmem_objnodes.  (The kernel lib/radix-tree.c implementation
204  * is very specialized and tuned for specific uses and is not particularly
205  * suited for use from this code, though some code from the core algorithms has
206  * been reused, thus the copyright notices below).  Each tmem_objnode contains
207  * a set of pointers which point to either a set of intermediate tmem_objnodes
208  * or a set of pampds.
209  *
210  * Portions Copyright (C) 2001 Momchil Velikov
211  * Portions Copyright (C) 2001 Christoph Hellwig
212  * Portions Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
213  */
214 
/*
 * One step of a root-to-leaf walk through an object's objnode tree, as
 * recorded by tmem_pampd_delete_from_obj() so it can unwind the walk and
 * prune objnodes that become empty.
 */
struct tmem_objnode_tree_path {
	struct tmem_objnode *objnode;	/* node visited at this step */
	int offset;			/* slot index followed out of objnode */
};
219 
220 /* objnode height_to_maxindex translation */
221 static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1];
222 
tmem_objnode_tree_init(void)223 static void tmem_objnode_tree_init(void)
224 {
225 	unsigned int ht, tmp;
226 
227 	for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) {
228 		tmp = ht * OBJNODE_TREE_MAP_SHIFT;
229 		if (tmp >= OBJNODE_TREE_INDEX_BITS)
230 			tmem_objnode_tree_h2max[ht] = ~0UL;
231 		else
232 			tmem_objnode_tree_h2max[ht] =
233 			    (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1;
234 	}
235 }
236 
tmem_objnode_alloc(struct tmem_obj * obj)237 static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj)
238 {
239 	struct tmem_objnode *objnode;
240 
241 	ASSERT_SENTINEL(obj, OBJ);
242 	BUG_ON(obj->pool == NULL);
243 	ASSERT_SENTINEL(obj->pool, POOL);
244 	objnode = (*tmem_hostops.objnode_alloc)(obj->pool);
245 	if (unlikely(objnode == NULL))
246 		goto out;
247 	objnode->obj = obj;
248 	SET_SENTINEL(objnode, OBJNODE);
249 	memset(&objnode->slots, 0, sizeof(objnode->slots));
250 	objnode->slots_in_use = 0;
251 	obj->objnode_count++;
252 out:
253 	return objnode;
254 }
255 
tmem_objnode_free(struct tmem_objnode * objnode)256 static void tmem_objnode_free(struct tmem_objnode *objnode)
257 {
258 	struct tmem_pool *pool;
259 	int i;
260 
261 	BUG_ON(objnode == NULL);
262 	for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++)
263 		BUG_ON(objnode->slots[i] != NULL);
264 	ASSERT_SENTINEL(objnode, OBJNODE);
265 	INVERT_SENTINEL(objnode, OBJNODE);
266 	BUG_ON(objnode->obj == NULL);
267 	ASSERT_SENTINEL(objnode->obj, OBJ);
268 	pool = objnode->obj->pool;
269 	BUG_ON(pool == NULL);
270 	ASSERT_SENTINEL(pool, POOL);
271 	objnode->obj->objnode_count--;
272 	objnode->obj = NULL;
273 	(*tmem_hostops.objnode_free)(objnode, pool);
274 }
275 
276 /*
277  * lookup index in object and return associated pampd (or NULL if not found)
278  */
/*
 * Locate the tree slot that corresponds to index within obj.
 *
 * Returns NULL only when index exceeds what the current tree height can
 * address.  Otherwise returns a pointer to a slot: either the leaf slot
 * for index, or the first empty slot met on the way down.  Callers must
 * therefore check the pointed-to value as well as the returned pointer.
 */
static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
	unsigned int levels, shift;
	struct tmem_objnode **cursor;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);

	levels = obj->objnode_tree_height;
	if (index > tmem_objnode_tree_h2max[levels])
		return NULL;

	cursor = &obj->objnode_tree_root;
	/* at height zero the root pointer itself is the only slot */
	if (levels == 0)
		return (void **)cursor;

	shift = (levels - 1) * OBJNODE_TREE_MAP_SHIFT;
	/* descend one level per pass, stopping early at an empty slot */
	for (; levels > 0 && *cursor != NULL; levels--) {
		cursor = (struct tmem_objnode **)
			((*cursor)->slots +
			 ((index >> shift) & OBJNODE_TREE_MAP_MASK));
		shift -= OBJNODE_TREE_MAP_SHIFT;
	}
	return (void **)cursor;
}
310 
/*
 * Return the value stored in obj's tree slot for index, or NULL when the
 * slot lookup yields no slot or the slot is empty.
 */
static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
	struct tmem_objnode **slotp;

	slotp = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
	if (slotp == NULL)
		return NULL;
	return *slotp;
}
318 
/*
 * Swap the pampd stored at index in obj for new_pampd.
 *
 * Nothing is changed, and NULL is returned, unless a non-empty slot exists
 * for index.  Otherwise the slot is overwritten, the previous pampd is
 * handed to tmem_pamops.free unless no_free is set, and new_pampd is
 * returned.
 */
static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
					void *new_pampd, bool no_free)
{
	void **slotp = __tmem_pampd_lookup_in_obj(obj, index);
	void *displaced;

	if (slotp == NULL || *slotp == NULL)
		return NULL;

	displaced = *slotp;
	*slotp = new_pampd;
	if (!no_free)
		(*tmem_pamops.free)(displaced, obj->pool,
					NULL, 0, false);
	return new_pampd;
}
336 
/*
 * Insert pampd into obj's objnode tree at index.
 *
 * The tree is first grown in height if index is not addressable at the
 * current height, then walked from the root, allocating any missing
 * intermediate objnodes, until the leaf slot is reached.  The leaf slot
 * must be empty (BUG otherwise).
 *
 * Returns 0 on success or -ENOMEM if an objnode could not be allocated;
 * in the latter case objnodes allocated so far remain in the tree.
 */
static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
					void *pampd)
{
	struct tmem_objnode *parent = NULL, *cur;
	unsigned int height, shift;
	int offset = 0;

	/* if necessary, extend the tree to be higher  */
	if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) {
		height = obj->objnode_tree_height + 1;
		while (index > tmem_objnode_tree_h2max[height])
			height++;
		if (obj->objnode_tree_root == NULL) {
			/* empty tree: just record the height needed */
			obj->objnode_tree_height = height;
		} else {
			/* push new roots on top until tall enough */
			do {
				struct tmem_objnode *top;

				top = tmem_objnode_alloc(obj);
				if (!top)
					return -ENOMEM;
				top->slots[0] = obj->objnode_tree_root;
				top->slots_in_use = 1;
				obj->objnode_tree_root = top;
				obj->objnode_tree_height++;
			} while (height > obj->objnode_tree_height);
		}
	}

	cur = obj->objnode_tree_root;
	height = obj->objnode_tree_height;
	shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
	for (; height > 0; height--) {
		if (cur == NULL) {
			/* add a child objnode.  */
			cur = tmem_objnode_alloc(obj);
			if (!cur)
				return -ENOMEM;
			if (parent) {
				parent->slots[offset] = cur;
				parent->slots_in_use++;
			} else {
				obj->objnode_tree_root = cur;
			}
		}
		/* go down a level */
		offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
		parent = cur;
		cur = parent->slots[offset];
		shift -= OBJNODE_TREE_MAP_SHIFT;
	}

	BUG_ON(cur != NULL);
	if (parent) {
		parent->slots_in_use++;
		parent->slots[offset] = pampd;
	} else {
		/* height zero: the root pointer holds the pampd directly */
		obj->objnode_tree_root = pampd;
	}
	obj->pampd_count++;
	return 0;
}
403 
/*
 * Remove and return the entry stored at index in obj's objnode tree.
 *
 * The walk from the root is recorded so that, after the leaf slot is
 * cleared, objnodes left with no slots in use are freed bottom-up.  If
 * the walk stops at the root while it still has entries, the tree is
 * shortened for as long as the root's only entry is in slot 0.
 *
 * Returns NULL, leaving the tree untouched, when index is out of range
 * for the current height or no entry is stored there.  obj->pampd_count
 * is decremented whenever a non-NULL entry is returned.
 */
static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index)
{
	struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1];
	struct tmem_objnode_tree_path *step = path;
	struct tmem_objnode *found = NULL;
	unsigned int height, shift;
	int offset;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);
	height = obj->objnode_tree_height;
	if (index > tmem_objnode_tree_h2max[height])
		goto out;
	found = obj->objnode_tree_root;
	if (height == 0 && found) {
		/* the root pointer itself held the entry */
		obj->objnode_tree_root = NULL;
		goto out;
	}
	shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
	step->objnode = NULL;	/* path[0] terminates the unwind below */
	do {
		if (found == NULL)
			goto out;
		step++;
		offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
		step->offset = offset;
		step->objnode = found;
		found = found->slots[offset];
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	} while (height > 0);
	if (found == NULL)
		goto out;

	/* unwind the recorded path, pruning nodes that become empty */
	for (; step->objnode; step--) {
		struct tmem_objnode *node = step->objnode;

		node->slots[step->offset] = NULL;
		node->slots_in_use--;
		if (node->slots_in_use == 0) {
			tmem_objnode_free(node); /* 0 slots used, free it */
			continue;
		}
		if (node == obj->objnode_tree_root) {
			/* shrink the tree while the root has one child in slot 0 */
			while (obj->objnode_tree_height > 0 &&
			  obj->objnode_tree_root->slots_in_use == 1 &&
			  obj->objnode_tree_root->slots[0]) {
				struct tmem_objnode *old_root =
					obj->objnode_tree_root;

				obj->objnode_tree_root = old_root->slots[0];
				obj->objnode_tree_height--;
				old_root->slots[0] = NULL;
				old_root->slots_in_use = 0;
				tmem_objnode_free(old_root);
			}
		}
		goto out;
	}
	/* every node on the path was freed: the tree is now empty */
	obj->objnode_tree_height = 0;
	obj->objnode_tree_root = NULL;

out:
	if (found != NULL)
		obj->pampd_count--;
	BUG_ON(obj->pampd_count < 0);
	return found;
}
472 
473 /* recursively walk the objnode_tree destroying pampds and objnodes */
tmem_objnode_node_destroy(struct tmem_obj * obj,struct tmem_objnode * objnode,unsigned int ht)474 static void tmem_objnode_node_destroy(struct tmem_obj *obj,
475 					struct tmem_objnode *objnode,
476 					unsigned int ht)
477 {
478 	int i;
479 
480 	if (ht == 0)
481 		return;
482 	for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) {
483 		if (objnode->slots[i]) {
484 			if (ht == 1) {
485 				obj->pampd_count--;
486 				(*tmem_pamops.free)(objnode->slots[i],
487 						obj->pool, NULL, 0, true);
488 				objnode->slots[i] = NULL;
489 				continue;
490 			}
491 			tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1);
492 			tmem_objnode_free(objnode->slots[i]);
493 			objnode->slots[i] = NULL;
494 		}
495 	}
496 }
497 
tmem_pampd_destroy_all_in_obj(struct tmem_obj * obj)498 static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
499 {
500 	if (obj->objnode_tree_root == NULL)
501 		return;
502 	if (obj->objnode_tree_height == 0) {
503 		obj->pampd_count--;
504 		(*tmem_pamops.free)(obj->objnode_tree_root,
505 					obj->pool, NULL, 0, true);
506 	} else {
507 		tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
508 					obj->objnode_tree_height);
509 		tmem_objnode_free(obj->objnode_tree_root);
510 		obj->objnode_tree_height = 0;
511 	}
512 	obj->objnode_tree_root = NULL;
513 	(*tmem_pamops.free_obj)(obj->pool, obj);
514 }
515 
516 /*
517  * Tmem is operated on by a set of well-defined actions:
518  * "put", "get", "flush", "flush_object", "new pool" and "destroy pool".
519  * (The tmem ABI allows for subpages and exchanges but these operations
520  * are not included in this implementation.)
521  *
522  * These "tmem core" operations are implemented in the following functions.
523  */
524 
525 /*
526  * "Put" a page, e.g. copy a page from the kernel into newly allocated
527  * PAM space (if such space is available).  Tmem_put is complicated by
528  * a corner case: What if a page with matching handle already exists in
529  * tmem?  To guarantee coherency, one of two actions is necessary: Either
530  * the data for the page must be overwritten, or the page must be
531  * "flushed" so that the data is not accessible to a subsequent "get".
532  * Since these "duplicate puts" are relatively rare, this implementation
533  * always flushes for simplicity.
534  */
/*
 * Store a page under the handle (pool, *oidp, index).
 *
 * Runs entirely under the hashbucket lock selected by the oid hash.  If
 * the object does not exist it is allocated and initialized; if a pampd
 * already exists at index (a duplicate put) it is removed and freed first.
 * A new pampd is then created through tmem_pamops.create and added to the
 * object's tree.
 *
 * Returns the result of tmem_pampd_add_to_obj() (0 on success), or
 * -ENOMEM when the object, the pampd or a tree node cannot be obtained.
 * On failure the new pampd is freed, and so is the object if it was
 * created here or was left empty by the duplicate flush.
 */
int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
		char *data, size_t size, bool raw, int ephemeral)
{
	struct tmem_hashbucket *hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	struct tmem_obj *obj, *existing = NULL, *created = NULL;
	void *pampd = NULL;
	int ret = -ENOMEM;

	spin_lock(&hb->lock);
	obj = existing = tmem_obj_find(hb, oidp);
	if (obj == NULL) {
		obj = created = (*tmem_hostops.obj_alloc)(pool);
		if (unlikely(obj == NULL)) {
			ret = -ENOMEM;
			goto out;
		}
		tmem_obj_init(obj, hb, pool, oidp);
	} else {
		void *stale = tmem_pampd_lookup_in_obj(existing, index);

		if (stale != NULL) {
			/* if found, is a dup put, flush the old one */
			void *removed = tmem_pampd_delete_from_obj(obj, index);

			BUG_ON(removed != stale);
			(*tmem_pamops.free)(stale, pool, oidp, index, true);
			if (obj->pampd_count == 0) {
				/*
				 * now empty: treat it like a new object so
				 * the error path below frees it
				 */
				created = obj;
				existing = NULL;
			}
		}
	}
	BUG_ON(obj == NULL);
	BUG_ON(((created != obj) && (existing != obj)) ||
		(created == existing));

	pampd = (*tmem_pamops.create)(data, size, raw, ephemeral,
					obj->pool, &obj->oid, index);
	if (unlikely(pampd == NULL))
		goto free;
	ret = tmem_pampd_add_to_obj(obj, index, pampd);
	if (likely(ret != -ENOMEM))
		goto out;

	/* may have partially built objnode tree ("stump") */
	(void)tmem_pampd_delete_from_obj(obj, index);
free:
	if (pampd)
		(*tmem_pamops.free)(pampd, pool, NULL, 0, true);
	if (created) {
		tmem_obj_free(created, hb);
		(*tmem_hostops.obj_free)(created, pool);
	}
out:
	spin_unlock(&hb->lock);
	return ret;
}
592 
/*
 * Look up the pampd for (pool, *oidp, index) and return with the
 * hashbucket lock still HELD.
 *
 * *ret_obj receives the object (NULL if not found) and *saved_hb the
 * locked hashbucket; the caller must pass both to tmem_localify_finish(),
 * which drops the lock.  Returns the pampd, or NULL if the object or the
 * page is absent.
 */
void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp,
				uint32_t index, struct tmem_obj **ret_obj,
				void **saved_hb)
{
	struct tmem_hashbucket *hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	struct tmem_obj *obj;
	void *pampd = NULL;

	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (likely(obj != NULL))
		pampd = tmem_pampd_lookup_in_obj(obj, index);
	*ret_obj = obj;
	*saved_hb = (void *)hb;
	/* note, hashbucket remains locked */
	return pampd;
}
611 
/*
 * Second half of the localify sequence: finish the update started by
 * tmem_localify_get_pampd() and release the hashbucket lock it left held.
 *
 * @obj:      object returned through ret_obj by tmem_localify_get_pampd()
 * @index:    page index within obj
 * @pampd:    if non-NULL, replaces the pampd at index (the old one is
 *            not freed)
 * @saved_hb: hashbucket returned through saved_hb; must be locked
 * @delete:   if pampd is NULL and this is true, the entry at index is
 *            removed from obj
 *
 * The lock check uses assert_spin_locked() rather than
 * BUG_ON(!spin_is_locked()): on uniprocessor builds without spinlock
 * debugging spin_is_locked() always reports "not locked", which would make
 * the BUG_ON fire unconditionally.
 */
void tmem_localify_finish(struct tmem_obj *obj, uint32_t index,
			  void *pampd, void *saved_hb, bool delete)
{
	struct tmem_hashbucket *hb = (struct tmem_hashbucket *)saved_hb;

	assert_spin_locked(&hb->lock);
	if (pampd != NULL) {
		BUG_ON(obj == NULL);
		(void)tmem_pampd_replace_in_obj(obj, index, pampd, 1);
	} else if (delete) {
		BUG_ON(obj == NULL);
		(void)tmem_pampd_delete_from_obj(obj, index);
	}
	spin_unlock(&hb->lock);
}
627 
tmem_repatriate(void ** ppampd,struct tmem_hashbucket * hb,struct tmem_pool * pool,struct tmem_oid * oidp,uint32_t index,bool free,char * data)628 static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb,
629 				struct tmem_pool *pool, struct tmem_oid *oidp,
630 				uint32_t index, bool free, char *data)
631 {
632 	void *old_pampd = *ppampd, *new_pampd = NULL;
633 	bool intransit = false;
634 	int ret = 0;
635 
636 
637 	if (!is_ephemeral(pool))
638 		new_pampd = (*tmem_pamops.repatriate_preload)(
639 				old_pampd, pool, oidp, index, &intransit);
640 	if (intransit)
641 		ret = -EAGAIN;
642 	else if (new_pampd != NULL)
643 		*ppampd = new_pampd;
644 	/* must release the hb->lock else repatriate can't sleep */
645 	spin_unlock(&hb->lock);
646 	if (!intransit)
647 		ret = (*tmem_pamops.repatriate)(old_pampd, new_pampd, pool,
648 						oidp, index, free, data);
649 	return ret;
650 }
651 
652 /*
653  * "Get" a page, e.g. if one can be found, copy the tmem page with the
654  * matching handle from PAM space to the kernel.  By tmem definition,
655  * when a "get" is successful on an ephemeral page, the page is "flushed",
656  * and when a "get" is successful on a persistent page, the page is retained
657  * in tmem.  Note that to preserve
658  * coherency, "get" can never be skipped if tmem contains the data.
659  * That is, if a get is done with a certain handle and fails, any
660  * subsequent "get" must also fail (unless of course there is a
661  * "put" done with the same handle).
662 
663  */
/*
 * Fetch the page stored under the handle (pool, *oidp, index) into data.
 *
 * @get_and_free: 1 forces the page to be removed after the get, 0 removes
 *                it only for ephemeral pools, any other value keeps it
 *
 * A remote page is repatriated via tmem_repatriate(), which drops the
 * hashbucket lock; while that reports -EAGAIN the whole lookup is retried
 * after a short sleep.
 *
 * Returns 0 on success, -1 if the object or page is not present or
 * repatriation fails, or the negative value returned by the PAM
 * get_data / get_data_and_free callback.
 *
 * Two defects of the earlier version are fixed here:
 *  - ret is reset on every retry, so a retry that no longer finds the
 *    page reports -1 instead of a stale -EAGAIN;
 *  - the slot lookup can return a pointer to an EMPTY slot; that case is
 *    now treated as "not found" instead of passing NULL to is_remote.
 */
int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
		char *data, size_t *size, bool raw, int get_and_free)
{
	struct tmem_obj *obj;
	void *pampd;
	bool ephemeral = is_ephemeral(pool);
	int ret;
	struct tmem_hashbucket *hb;
	bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
	bool lock_held = false;
	void **ppampd;

again:
	ret = -1;	/* do not leak -EAGAIN from a previous attempt */
	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	lock_held = true;
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	ppampd = __tmem_pampd_lookup_in_obj(obj, index);
	/* NULL: index out of range; *ppampd NULL: slot exists but is empty */
	if (ppampd == NULL || *ppampd == NULL)
		goto out;
	if (tmem_pamops.is_remote(*ppampd)) {
		ret = tmem_repatriate(ppampd, hb, pool, oidp,
					index, free, data);
		lock_held = false; /* note hb->lock has been unlocked */
		if (ret == -EAGAIN) {
			/* rare I think, but should cond_resched()??? */
			usleep_range(10, 1000);
			goto again;
		} else if (ret != 0) {
			if (ret != -ENOENT)
				pr_err("UNTESTED case in tmem_get, ret=%d\n",
						ret);
			ret = -1;
			goto out;
		}
		goto out;
	}
	if (free)
		pampd = tmem_pampd_delete_from_obj(obj, index);
	else
		pampd = tmem_pampd_lookup_in_obj(obj, index);
	if (pampd == NULL)
		goto out;
	if (free) {
		/* the removed page may have been the object's last one */
		if (obj->pampd_count == 0) {
			tmem_obj_free(obj, hb);
			(*tmem_hostops.obj_free)(obj, pool);
			obj = NULL;
		}
	}
	if (free)
		ret = (*tmem_pamops.get_data_and_free)(
				data, size, raw, pampd, pool, oidp, index);
	else
		ret = (*tmem_pamops.get_data)(
				data, size, raw, pampd, pool, oidp, index);
	if (ret < 0)
		goto out;
	ret = 0;
out:
	if (lock_held)
		spin_unlock(&hb->lock);
	return ret;
}
730 
731 /*
732  * If a page in tmem matches the handle, "flush" this page from tmem such
733  * that any subsequent "get" does not succeed (unless, of course, there
734  * was another "put" with the same handle).
735  */
/*
 * Remove the page stored under (pool, *oidp, index), if any, and free its
 * pampd; the object is freed as well once it holds no more pampds.
 *
 * Returns 0 if a page was removed, -1 if the object or page was not
 * found.  Runs under the hashbucket lock.
 */
int tmem_flush_page(struct tmem_pool *pool,
				struct tmem_oid *oidp, uint32_t index)
{
	struct tmem_hashbucket *hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	struct tmem_obj *obj;
	void *victim;
	int ret = -1;

	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj != NULL) {
		victim = tmem_pampd_delete_from_obj(obj, index);
		if (victim != NULL) {
			(*tmem_pamops.free)(victim, pool, oidp, index, true);
			if (obj->pampd_count == 0) {
				tmem_obj_free(obj, hb);
				(*tmem_hostops.obj_free)(obj, pool);
			}
			ret = 0;
		}
	}
	spin_unlock(&hb->lock);
	return ret;
}
763 
764 /*
765  * If a page in tmem matches the handle, replace the page so that any
766  * subsequent "get" gets the new page.  Returns -1 if no object matches
767  * the oid, else returns the result of the pamops replace_in_obj callback.
768  */
/*
 * Replace the pampd stored under (pool, *oidp, index) with new_pampd; the
 * displaced pampd is freed by tmem_pampd_replace_in_obj().
 *
 * Returns -1 if no object matches the oid.  Otherwise the outcome of the
 * replacement (new_pampd, or NULL if there was no page to replace) is
 * passed to tmem_pamops.replace_in_obj and that callback's result is
 * returned.  Runs under the hashbucket lock.
 */
int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
			uint32_t index, void *new_pampd)
{
	struct tmem_hashbucket *hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	struct tmem_obj *obj;
	int ret = -1;

	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj != NULL) {
		void *installed;

		installed = tmem_pampd_replace_in_obj(obj, index,
							new_pampd, 0);
		ret = (*tmem_pamops.replace_in_obj)(installed, obj);
	}
	spin_unlock(&hb->lock);
	return ret;
}
787 
788 /*
789  * "Flush" all pages in tmem matching this oid.
790  */
tmem_flush_object(struct tmem_pool * pool,struct tmem_oid * oidp)791 int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
792 {
793 	struct tmem_obj *obj;
794 	struct tmem_hashbucket *hb;
795 	int ret = -1;
796 
797 	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
798 	spin_lock(&hb->lock);
799 	obj = tmem_obj_find(hb, oidp);
800 	if (obj == NULL)
801 		goto out;
802 	tmem_pampd_destroy_all_in_obj(obj);
803 	tmem_obj_free(obj, hb);
804 	(*tmem_hostops.obj_free)(obj, pool);
805 	ret = 0;
806 
807 out:
808 	spin_unlock(&hb->lock);
809 	return ret;
810 }
811 
812 /*
813  * "Flush" all pages (and tmem_objs) from this tmem_pool and disable
814  * all subsequent access to this tmem_pool.
815  */
tmem_destroy_pool(struct tmem_pool * pool)816 int tmem_destroy_pool(struct tmem_pool *pool)
817 {
818 	int ret = -1;
819 
820 	if (pool == NULL)
821 		goto out;
822 	tmem_pool_flush(pool, 1);
823 	ret = 0;
824 out:
825 	return ret;
826 }
827 
828 static LIST_HEAD(tmem_global_pool_list);
829 
830 /*
831  * Initialize a caller-provided tmem_pool according to the provided flags
832  * and add it to the global list of pools.
833  */
/*
 * Initialize a caller-provided pool: give every hashbucket an empty
 * rb_tree and an initialized lock, zero the object count, set the pool
 * sentinel, append the pool to tmem_global_pool_list and record the
 * persistent/shared attributes taken from flags.
 */
void tmem_new_pool(struct tmem_pool *pool, uint32_t flags)
{
	int persistent = flags & TMEM_POOL_PERSIST;
	int shared = flags & TMEM_POOL_SHARED;
	int i;

	for (i = 0; i < TMEM_HASH_BUCKETS; i++) {
		struct tmem_hashbucket *hb = &pool->hashbucket[i];

		hb->obj_rb_root = RB_ROOT;
		spin_lock_init(&hb->lock);
	}
	INIT_LIST_HEAD(&pool->pool_list);
	atomic_set(&pool->obj_count, 0);
	SET_SENTINEL(pool, POOL);
	list_add_tail(&pool->pool_list, &tmem_global_pool_list);
	pool->persistent = persistent;
	pool->shared = shared;
}
852