1 /*
2  * zbud.c - Compression buddies allocator
3  *
4  * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.
5  *
6  * Compression buddies ("zbud") provides for efficiently packing two
7  * (or, possibly in the future, more) compressed pages ("zpages") into
8  * a single "raw" pageframe and for tracking both zpages and pageframes
9  * so that whole pageframes can be easily reclaimed in LRU-like order.
10  * It is designed to be used in conjunction with transcendent memory
11  * ("tmem"); for example, separate LRU lists are maintained for persistent
12  * vs. ephemeral pages.
13  *
14  * A zbudpage is an overlay for a struct page and thus each zbudpage
15  * refers to a physical pageframe of RAM.  When the caller passes a
16  * struct page from the kernel's page allocator, zbud "transforms" it
17  * to a zbudpage which sets/uses a different set of fields than the
18  * struct-page and thus must "untransform" it back by reinitializing
19  * certain fields before the struct-page can be freed.  The fields
20  * of a zbudpage include a page lock for controlling access to the
21  * corresponding pageframe, and there is a size field for each zpage.
22  * Each zbudpage also lives on two linked lists: a "budlist" which is
23  * used to support efficient buddying of zpages; and an "lru" which
24  * is used for reclaiming pageframes in approximately least-recently-used
25  * order.
26  *
27  * A zbudpageframe is a pageframe divided up into aligned 64-byte "chunks"
28  * which contain the compressed data for zero, one, or two zbuds.  Contained
29  * within the compressed data is a tmem_handle which is a key to allow
30  * the same data to be found via the tmem interface so the zpage can
31  * be invalidated (for ephemeral pages) or repatriated to the swap cache
32  * (for persistent pages).  The contents of a zbudpageframe must never
33  * be accessed without holding the page lock for the corresponding
34  * zbudpage and, to accommodate highmem machines, the contents may
35  * only be examined or changed when kmapped.  Thus, when in use, a
36  * kmapped zbudpageframe is referred to in the zbud code as "void *zbpg".
37  *
38  * Note that the term "zbud" refers to the combination of a zpage and
39  * a tmem_handle that is stored as one of possibly two "buddied" zpages;
40  * it also generically refers to this allocator... sorry for any confusion.
41  *
42  * A zbudref is a pointer to a struct zbudpage (which can be cast to a
43  * struct page), with the LSB either cleared or set to indicate, respectively,
44  * the first or second zpage in the zbudpageframe. Since a zbudref can be
45  * cast to a pointer, it is used as the tmem "pampd" pointer and uniquely
46  * references a stored tmem page and so is the only zbud data structure
47  * externally visible to zbud.c/zbud.h.
48  *
49  * Since we wish to reclaim entire pageframes but zpages may be randomly
50  * added to and deleted from any given pageframe, we approximate LRU by
51  * promoting a pageframe to MRU when a zpage is added to it, but
52  * leaving it at the current place in the list when a zpage is deleted
53  * from it.  As a side effect, zpages that are difficult to buddy (e.g.
54  * very large pages) will be reclaimed faster than average, which seems
55  * reasonable.
56  *
57  * In the current implementation, no more than two zpages may be stored in
58  * any pageframe and no zpage ever crosses a pageframe boundary.  While
59  * other zpage allocation mechanisms may allow greater density, this two
60  * zpage-per-pageframe limit both ensures simple reclaim of pageframes
61  * (including garbage collection of references to the contents of those
62  * pageframes from tmem data structures) AND avoids the need for compaction.
63  * With additional complexity, zbud could be modified to support storing
64  * up to three zpages per pageframe or, to handle larger average zpages,
65  * up to three zpages per pair of pageframes, but it is not clear if the
66  * additional complexity would be worth it.  So consider it an exercise
67  * for future developers.
68  *
69  * Note also that zbud does no page allocation or freeing.  This is so
70  * that the caller has complete control over and, for accounting, visibility
71  * into if/when pages are allocated and freed.
72  *
73  * Finally, note that zbud limits the size of zpages it can store; the
74  * caller must check the zpage size with zbud_max_buddy_size before
75  * storing it, else BUGs will result.  User beware.
76  */
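/*
 * Illustrative sketch (not part of this driver) of the caller-side store
 * sequence described above.  The tmem_handle "th", kmapped buffer "cdata"
 * and its length "clen" are hypothetical caller-supplied values; error
 * handling and tmem bookkeeping are elided.
 */
#if 0
static void example_zbud_store(struct tmem_handle *th, void *cdata,
				unsigned clen, bool eph)
{
	struct zbudref *zref;
	struct page *newpage;

	if (clen > zbud_max_buddy_size())	/* caller MUST check the size */
		return;
	/* first try to buddy into an existing unbuddied zbudpage... */
	zref = zbud_match_prep(th, eph, cdata, clen);
	if (zref == NULL) {
		/* ...else the caller allocates the raw pageframe itself */
		newpage = alloc_page(GFP_ATOMIC);
		if (newpage == NULL)
			return;
		zref = zbud_create_prep(th, eph, cdata, clen, newpage);
		if (zref == NULL) {
			__free_page(newpage);
			return;
		}
	}
	/* zref is the "pampd" stored in tmem; once the tmem metadata is
	 * updated, drop the temporary unevictable mark */
	zbud_create_finish(zref, eph);
}
#endif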
77 
78 #include <linux/module.h>
79 #include <linux/highmem.h>
80 #include <linux/list.h>
81 #include <linux/spinlock.h>
82 #include <linux/pagemap.h>
83 #include <linux/atomic.h>
84 #include <linux/bug.h>
85 #include "tmem.h"
86 #include "zcache.h"
87 #include "zbud.h"
88 
89 /*
90  * We need to ensure that a struct zbudpage is never larger than a
91  * struct page.  This is checked with a BUG_ON in zbud_init.
92  *
93  * The unevictable field indicates that a zbud is being added to the
94  * zbudpage.  Since this is a two-phase process (due to tmem locking),
95  * this field locks the zbudpage against eviction when a zbud match
96  * or creation is in process.  Since this addition process may occur
97  * in parallel for two zbuds in one zbudpage, the field is a counter
98  * that must not exceed two.
99  */
100 struct zbudpage {
101 	union {
102 		struct page page;
103 		struct {
104 			unsigned long space_for_flags;
105 			struct {
106 				unsigned zbud0_size:PAGE_SHIFT;
107 				unsigned zbud1_size:PAGE_SHIFT;
108 				unsigned unevictable:2;
109 			};
110 			struct list_head budlist;
111 			struct list_head lru;
112 		};
113 	};
114 };
115 #if (PAGE_SHIFT * 2) + 2 > BITS_PER_LONG
116 #error "zbud won't work for this arch, PAGE_SIZE is too large"
117 #endif
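/*
 * Sketch only (not in the original source): the same size constraint that
 * zbud_init() checks at runtime with BUG_ON could be expressed at compile
 * time, e.g. from within any function:
 */
#if 0
static inline void zbudpage_overlay_size_check(void)
{
	BUILD_BUG_ON(sizeof(struct zbudpage) > sizeof(struct page));
}
#endif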
118 
119 struct zbudref {
120 	union {
121 		struct zbudpage *zbudpage;
122 		unsigned long zbudref;
123 	};
124 };
125 
126 #define CHUNK_SHIFT	6
127 #define CHUNK_SIZE	(1 << CHUNK_SHIFT)
128 #define CHUNK_MASK	(~(CHUNK_SIZE-1))
129 #define NCHUNKS		(PAGE_SIZE >> CHUNK_SHIFT)
130 #define MAX_CHUNK	(NCHUNKS-1)
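/*
 * Worked example, assuming PAGE_SHIFT == 12 (4K pages): CHUNK_SIZE is 64,
 * NCHUNKS is 64 and MAX_CHUNK is 63, so zbud_max_size() below evaluates to
 * 63 * 64 = 4032 bytes and zbud_max_buddy_size() to 4032 minus
 * sizeof(struct tmem_handle).
 */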
131 
132 /*
133  * The following functions deal with the difference between struct
134  * page and struct zbudpage.  Note the hack of using the pageflags
135  * from struct page; this is to avoid duplicating all the complex
136  * pageflag macros.
137  */
138 static inline void zbudpage_spin_lock(struct zbudpage *zbudpage)
139 {
140 	struct page *page = (struct page *)zbudpage;
141 
142 	while (unlikely(test_and_set_bit_lock(PG_locked, &page->flags))) {
143 		do {
144 			cpu_relax();
145 		} while (test_bit(PG_locked, &page->flags));
146 	}
147 }
148 
149 static inline void zbudpage_spin_unlock(struct zbudpage *zbudpage)
150 {
151 	struct page *page = (struct page *)zbudpage;
152 
153 	clear_bit(PG_locked, &page->flags);
154 }
155 
156 static inline int zbudpage_spin_trylock(struct zbudpage *zbudpage)
157 {
158 	return trylock_page((struct page *)zbudpage);
159 }
160 
161 static inline int zbudpage_is_locked(struct zbudpage *zbudpage)
162 {
163 	return PageLocked((struct page *)zbudpage);
164 }
165 
166 static inline void *kmap_zbudpage_atomic(struct zbudpage *zbudpage)
167 {
168 	return kmap_atomic((struct page *)zbudpage);
169 }
170 
171 /*
172  * A dying zbudpage is an ephemeral page in the process of being evicted.
173  * Any data contained in the zbudpage is invalid and we are just waiting for
174  * the tmem pampds to be invalidated before freeing the page.
175  */
176 static inline int zbudpage_is_dying(struct zbudpage *zbudpage)
177 {
178 	struct page *page = (struct page *)zbudpage;
179 
180 	return test_bit(PG_reclaim, &page->flags);
181 }
182 
183 static inline void zbudpage_set_dying(struct zbudpage *zbudpage)
184 {
185 	struct page *page = (struct page *)zbudpage;
186 
187 	set_bit(PG_reclaim, &page->flags);
188 }
189 
190 static inline void zbudpage_clear_dying(struct zbudpage *zbudpage)
191 {
192 	struct page *page = (struct page *)zbudpage;
193 
194 	clear_bit(PG_reclaim, &page->flags);
195 }
196 
197 /*
198  * A zombie zbudpage is a persistent page in the process of being evicted.
199  * The data contained in the zbudpage is valid and we are just waiting for
200  * the tmem pampds to be invalidated before freeing the page.
201  */
202 static inline int zbudpage_is_zombie(struct zbudpage *zbudpage)
203 {
204 	struct page *page = (struct page *)zbudpage;
205 
206 	return test_bit(PG_dirty, &page->flags);
207 }
208 
209 static inline void zbudpage_set_zombie(struct zbudpage *zbudpage)
210 {
211 	struct page *page = (struct page *)zbudpage;
212 
213 	set_bit(PG_dirty, &page->flags);
214 }
215 
216 static inline void zbudpage_clear_zombie(struct zbudpage *zbudpage)
217 {
218 	struct page *page = (struct page *)zbudpage;
219 
220 	clear_bit(PG_dirty, &page->flags);
221 }
222 
223 static inline void kunmap_zbudpage_atomic(void *zbpg)
224 {
225 	kunmap_atomic(zbpg);
226 }
227 
228 /*
229  * zbud "translation" and helper functions
230  */
231 
232 static inline struct zbudpage *zbudref_to_zbudpage(struct zbudref *zref)
233 {
234 	unsigned long zbud = (unsigned long)zref;
235 	zbud &= ~1UL;
236 	return (struct zbudpage *)zbud;
237 }
238 
239 static inline struct zbudref *zbudpage_to_zbudref(struct zbudpage *zbudpage,
240 							unsigned budnum)
241 {
242 	unsigned long zbud = (unsigned long)zbudpage;
243 	BUG_ON(budnum > 1);
244 	zbud |= budnum;
245 	return (struct zbudref *)zbud;
246 }
247 
248 static inline int zbudref_budnum(struct zbudref *zbudref)
249 {
250 	unsigned long zbud = (unsigned long)zbudref;
251 	return zbud & 1UL;
252 }
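/*
 * Sketch (not in the original source) of the zbudref encoding round trip
 * described in the header comment: the LSB selects zbud0 vs zbud1 and the
 * remaining bits are the zbudpage pointer, so both pieces can be recovered.
 */
#if 0
static void example_zbudref_roundtrip(struct zbudpage *zbudpage)
{
	struct zbudref *zref = zbudpage_to_zbudref(zbudpage, 1);

	BUG_ON(zbudref_to_zbudpage(zref) != zbudpage);
	BUG_ON(zbudref_budnum(zref) != 1);
}
#endif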
253 
254 static inline unsigned zbud_max_size(void)
255 {
256 	return MAX_CHUNK << CHUNK_SHIFT;
257 }
258 
259 static inline unsigned zbud_size_to_chunks(unsigned size)
260 {
261 	BUG_ON(size == 0 || size > zbud_max_size());
262 	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
263 }
264 
265 /* can only be used between kmap_zbudpage_atomic/kunmap_zbudpage_atomic! */
266 static inline char *zbud_data(void *zbpg,
267 			unsigned budnum, unsigned size)
268 {
269 	char *p;
270 
271 	BUG_ON(size == 0 || size > zbud_max_size());
272 	p = (char *)zbpg;
273 	if (budnum == 1)
274 		p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
275 	return p;
276 }
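/*
 * Layout example, assuming 4K pages: zbud0 data always starts at offset 0
 * of the kmapped pageframe, while zbud1 data is placed at the end; e.g. a
 * 1000-byte zbud1 is rounded up to 1024 bytes (16 chunks) and so starts at
 * offset PAGE_SIZE - 1024 = 3072.
 */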
277 
278 /*
279  * These are all informative and exposed through debugfs... except for
280  * the arrays... anyone know how to do that?  To avoid confusion for
281  * debugfs viewers, some of these should also be atomic_long_t, but
282  * I don't know how to expose atomics via debugfs either...
283  */
284 static ssize_t zbud_eph_pageframes;
285 static ssize_t zbud_pers_pageframes;
286 static ssize_t zbud_eph_zpages;
287 static ssize_t zbud_pers_zpages;
288 static u64 zbud_eph_zbytes;
289 static u64 zbud_pers_zbytes;
290 static ssize_t zbud_eph_evicted_pageframes;
291 static ssize_t zbud_pers_evicted_pageframes;
292 static ssize_t zbud_eph_cumul_zpages;
293 static ssize_t zbud_pers_cumul_zpages;
294 static u64 zbud_eph_cumul_zbytes;
295 static u64 zbud_pers_cumul_zbytes;
296 static ssize_t zbud_eph_cumul_chunk_counts[NCHUNKS];
297 static ssize_t zbud_pers_cumul_chunk_counts[NCHUNKS];
298 static ssize_t zbud_eph_buddied_count;
299 static ssize_t zbud_pers_buddied_count;
300 static ssize_t zbud_eph_unbuddied_count;
301 static ssize_t zbud_pers_unbuddied_count;
302 static ssize_t zbud_eph_zombie_count;
303 static ssize_t zbud_pers_zombie_count;
304 static atomic_t zbud_eph_zombie_atomic;
305 static atomic_t zbud_pers_zombie_atomic;
306 
307 #ifdef CONFIG_DEBUG_FS
308 #include <linux/debugfs.h>
309 #define	zdfs	debugfs_create_size_t
310 #define	zdfs64	debugfs_create_u64
311 static int zbud_debugfs_init(void)
312 {
313 	struct dentry *root = debugfs_create_dir("zbud", NULL);
314 	if (root == NULL)
315 		return -ENXIO;
316 
317 	/*
318 	 * would be nice to dump the sizes of the unbuddied
319 	 * arrays, like was done with sysfs, but it doesn't
320 	 * look like debugfs is flexible enough to do that
321 	 */
322 	zdfs64("eph_zbytes", S_IRUGO, root, &zbud_eph_zbytes);
323 	zdfs64("eph_cumul_zbytes", S_IRUGO, root, &zbud_eph_cumul_zbytes);
324 	zdfs64("pers_zbytes", S_IRUGO, root, &zbud_pers_zbytes);
325 	zdfs64("pers_cumul_zbytes", S_IRUGO, root, &zbud_pers_cumul_zbytes);
326 	zdfs("eph_cumul_zpages", S_IRUGO, root, &zbud_eph_cumul_zpages);
327 	zdfs("eph_evicted_pageframes", S_IRUGO, root,
328 				&zbud_eph_evicted_pageframes);
329 	zdfs("eph_zpages", S_IRUGO, root, &zbud_eph_zpages);
330 	zdfs("eph_pageframes", S_IRUGO, root, &zbud_eph_pageframes);
331 	zdfs("eph_buddied_count", S_IRUGO, root, &zbud_eph_buddied_count);
332 	zdfs("eph_unbuddied_count", S_IRUGO, root, &zbud_eph_unbuddied_count);
333 	zdfs("pers_cumul_zpages", S_IRUGO, root, &zbud_pers_cumul_zpages);
334 	zdfs("pers_evicted_pageframes", S_IRUGO, root,
335 				&zbud_pers_evicted_pageframes);
336 	zdfs("pers_zpages", S_IRUGO, root, &zbud_pers_zpages);
337 	zdfs("pers_pageframes", S_IRUGO, root, &zbud_pers_pageframes);
338 	zdfs("pers_buddied_count", S_IRUGO, root, &zbud_pers_buddied_count);
339 	zdfs("pers_unbuddied_count", S_IRUGO, root, &zbud_pers_unbuddied_count);
340 	zdfs("pers_zombie_count", S_IRUGO, root, &zbud_pers_zombie_count);
341 	return 0;
342 }
343 #undef	zdfs
344 #undef	zdfs64
345 #else
346 static inline int zbud_debugfs_init(void)
347 {
348 	return 0;
349 }
350 #endif
351 
352 /* protects the buddied list and all unbuddied lists */
353 static DEFINE_SPINLOCK(zbud_eph_lists_lock);
354 static DEFINE_SPINLOCK(zbud_pers_lists_lock);
355 
356 struct zbud_unbuddied {
357 	struct list_head list;
358 	unsigned count;
359 };
360 
361 /* list N contains pages with N chunks USED and NCHUNKS-N unused */
362 /* element 0 is never used but optimizing that isn't worth it */
363 static struct zbud_unbuddied zbud_eph_unbuddied[NCHUNKS];
364 static struct zbud_unbuddied zbud_pers_unbuddied[NCHUNKS];
365 static LIST_HEAD(zbud_eph_lru_list);
366 static LIST_HEAD(zbud_pers_lru_list);
367 static LIST_HEAD(zbud_eph_buddied_list);
368 static LIST_HEAD(zbud_pers_buddied_list);
369 static LIST_HEAD(zbud_eph_zombie_list);
370 static LIST_HEAD(zbud_pers_zombie_list);
371 
372 /*
373  * Given a struct page, transform it to a zbudpage so that it can be
374  * used by zbud and initialize fields as necessary.
375  */
376 static inline struct zbudpage *zbud_init_zbudpage(struct page *page, bool eph)
377 {
378 	struct zbudpage *zbudpage = (struct zbudpage *)page;
379 
380 	BUG_ON(page == NULL);
381 	INIT_LIST_HEAD(&zbudpage->budlist);
382 	INIT_LIST_HEAD(&zbudpage->lru);
383 	zbudpage->zbud0_size = 0;
384 	zbudpage->zbud1_size = 0;
385 	zbudpage->unevictable = 0;
386 	if (eph)
387 		zbud_eph_pageframes++;
388 	else
389 		zbud_pers_pageframes++;
390 	return zbudpage;
391 }
392 
393 /* "Transform" a zbudpage back to a struct page suitable to free. */
394 static inline struct page *zbud_unuse_zbudpage(struct zbudpage *zbudpage,
395 								bool eph)
396 {
397 	struct page *page = (struct page *)zbudpage;
398 
399 	BUG_ON(!list_empty(&zbudpage->budlist));
400 	BUG_ON(!list_empty(&zbudpage->lru));
401 	BUG_ON(zbudpage->zbud0_size != 0);
402 	BUG_ON(zbudpage->zbud1_size != 0);
403 	BUG_ON(!PageLocked(page));
404 	BUG_ON(zbudpage->unevictable != 0);
405 	BUG_ON(zbudpage_is_dying(zbudpage));
406 	BUG_ON(zbudpage_is_zombie(zbudpage));
407 	if (eph)
408 		zbud_eph_pageframes--;
409 	else
410 		zbud_pers_pageframes--;
411 	zbudpage_spin_unlock(zbudpage);
412 	page_mapcount_reset(page);
413 	init_page_count(page);
414 	page->index = 0;
415 	return page;
416 }
417 
418 /* Mark a zbud as unused and do accounting */
419 static inline void zbud_unuse_zbud(struct zbudpage *zbudpage,
420 					int budnum, bool eph)
421 {
422 	unsigned size;
423 
424 	BUG_ON(!zbudpage_is_locked(zbudpage));
425 	if (budnum == 0) {
426 		size = zbudpage->zbud0_size;
427 		zbudpage->zbud0_size = 0;
428 	} else {
429 		size = zbudpage->zbud1_size;
430 		zbudpage->zbud1_size = 0;
431 	}
432 	if (eph) {
433 		zbud_eph_zbytes -= size;
434 		zbud_eph_zpages--;
435 	} else {
436 		zbud_pers_zbytes -= size;
437 		zbud_pers_zpages--;
438 	}
439 }
440 
441 /*
442  * Given a zbudpage/budnum/size, a tmem handle, and a kmapped pointer
443  * to some data, set up the zbud appropriately including data copying
444  * and accounting.  Note that if cdata is NULL, the data copying is
445  * skipped.  (This is useful for lazy writes such as for RAMster.)
446  */
447 static void zbud_init_zbud(struct zbudpage *zbudpage, struct tmem_handle *th,
448 				bool eph, void *cdata,
449 				unsigned budnum, unsigned size)
450 {
451 	char *to;
452 	void *zbpg;
453 	struct tmem_handle *to_th;
454 	unsigned nchunks = zbud_size_to_chunks(size);
455 
456 	BUG_ON(!zbudpage_is_locked(zbudpage));
457 	zbpg = kmap_zbudpage_atomic(zbudpage);
458 	to = zbud_data(zbpg, budnum, size);
459 	to_th = (struct tmem_handle *)to;
460 	to_th->index = th->index;
461 	to_th->oid = th->oid;
462 	to_th->pool_id = th->pool_id;
463 	to_th->client_id = th->client_id;
464 	to += sizeof(struct tmem_handle);
465 	if (cdata != NULL)
466 		memcpy(to, cdata, size - sizeof(struct tmem_handle));
467 	kunmap_zbudpage_atomic(zbpg);
468 	if (budnum == 0)
469 		zbudpage->zbud0_size = size;
470 	else
471 		zbudpage->zbud1_size = size;
472 	if (eph) {
473 		zbud_eph_cumul_chunk_counts[nchunks]++;
474 		zbud_eph_zpages++;
475 		zbud_eph_cumul_zpages++;
476 		zbud_eph_zbytes += size;
477 		zbud_eph_cumul_zbytes += size;
478 	} else {
479 		zbud_pers_cumul_chunk_counts[nchunks]++;
480 		zbud_pers_zpages++;
481 		zbud_pers_cumul_zpages++;
482 		zbud_pers_zbytes += size;
483 		zbud_pers_cumul_zbytes += size;
484 	}
485 }
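/*
 * Resulting in-zbud layout (for reference): the pointer returned by
 * zbud_data() holds the copied struct tmem_handle first, immediately
 * followed by size - sizeof(struct tmem_handle) bytes of compressed data.
 */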
486 
487 /*
488  * Given a locked dying zbudpage, read out the tmem handles from the data,
489  * unlock the page, then use the handles to tell tmem to flush out its
490  * references
491  */
492 static void zbud_evict_tmem(struct zbudpage *zbudpage)
493 {
494 	int i, j;
495 	uint32_t pool_id[2], client_id[2];
496 	uint32_t index[2];
497 	struct tmem_oid oid[2];
498 	struct tmem_pool *pool;
499 	void *zbpg;
500 	struct tmem_handle *th;
501 	unsigned size;
502 
503 	/* read out the tmem handles from the data and set aside */
504 	zbpg = kmap_zbudpage_atomic(zbudpage);
505 	for (i = 0, j = 0; i < 2; i++) {
506 		size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size;
507 		if (size) {
508 			th = (struct tmem_handle *)zbud_data(zbpg, i, size);
509 			client_id[j] = th->client_id;
510 			pool_id[j] = th->pool_id;
511 			oid[j] = th->oid;
512 			index[j] = th->index;
513 			j++;
514 			zbud_unuse_zbud(zbudpage, i, true);
515 		}
516 	}
517 	kunmap_zbudpage_atomic(zbpg);
518 	zbudpage_spin_unlock(zbudpage);
519 	/* zbudpage is now an unlocked dying... tell tmem to flush pointers */
520 	for (i = 0; i < j; i++) {
521 		pool = zcache_get_pool_by_id(client_id[i], pool_id[i]);
522 		if (pool != NULL) {
523 			tmem_flush_page(pool, &oid[i], index[i]);
524 			zcache_put_pool(pool);
525 		}
526 	}
527 }
528 
529 /*
530  * Externally callable zbud handling routines.
531  */
532 
533 /*
534  * Return the maximum size of compressed page that can be stored (secretly
535  * setting aside space for the tmem handle).
536  */
537 unsigned int zbud_max_buddy_size(void)
538 {
539 	return zbud_max_size() - sizeof(struct tmem_handle);
540 }
541 
542 /*
543  * Given a zbud reference, free the corresponding zbud from all lists,
544  * mark it as unused, do accounting, and if the freeing of the zbud
545  * frees up an entire pageframe, return it to the caller (else NULL).
546  */
547 struct page *zbud_free_and_delist(struct zbudref *zref, bool eph,
548 				  unsigned int *zsize, unsigned int *zpages)
549 {
550 	unsigned long budnum = zbudref_budnum(zref);
551 	struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
552 	struct page *page = NULL;
553 	unsigned chunks, bud_size, other_bud_size;
554 	spinlock_t *lists_lock =
555 		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
556 	struct zbud_unbuddied *unbud =
557 		eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;
558 
559 
560 	spin_lock(lists_lock);
561 	zbudpage_spin_lock(zbudpage);
562 	if (zbudpage_is_dying(zbudpage)) {
563 		/* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
564 		zbudpage_spin_unlock(zbudpage);
565 		spin_unlock(lists_lock);
566 		*zpages = 0;
567 		*zsize = 0;
568 		goto out;
569 	}
570 	if (budnum == 0) {
571 		bud_size = zbudpage->zbud0_size;
572 		other_bud_size = zbudpage->zbud1_size;
573 	} else {
574 		bud_size = zbudpage->zbud1_size;
575 		other_bud_size = zbudpage->zbud0_size;
576 	}
577 	*zsize = bud_size - sizeof(struct tmem_handle);
578 	*zpages = 1;
579 	zbud_unuse_zbud(zbudpage, budnum, eph);
580 	if (other_bud_size == 0) { /* was unbuddied: unlist and free */
581 		chunks = zbud_size_to_chunks(bud_size);
582 		if (zbudpage_is_zombie(zbudpage)) {
583 			if (eph)
584 				zbud_eph_zombie_count =
585 				  atomic_dec_return(&zbud_eph_zombie_atomic);
586 			else
587 				zbud_pers_zombie_count =
588 				  atomic_dec_return(&zbud_pers_zombie_atomic);
589 			zbudpage_clear_zombie(zbudpage);
590 		} else {
591 			BUG_ON(list_empty(&unbud[chunks].list));
592 			list_del_init(&zbudpage->budlist);
593 			unbud[chunks].count--;
594 		}
595 		list_del_init(&zbudpage->lru);
596 		spin_unlock(lists_lock);
597 		if (eph)
598 			zbud_eph_unbuddied_count--;
599 		else
600 			zbud_pers_unbuddied_count--;
601 		page = zbud_unuse_zbudpage(zbudpage, eph);
602 	} else { /* was buddied: move remaining buddy to unbuddied list */
603 		chunks = zbud_size_to_chunks(other_bud_size);
604 		if (!zbudpage_is_zombie(zbudpage)) {
605 			list_del_init(&zbudpage->budlist);
606 			list_add_tail(&zbudpage->budlist, &unbud[chunks].list);
607 			unbud[chunks].count++;
608 		}
609 		if (eph) {
610 			zbud_eph_buddied_count--;
611 			zbud_eph_unbuddied_count++;
612 		} else {
613 			zbud_pers_unbuddied_count++;
614 			zbud_pers_buddied_count--;
615 		}
616 		/* don't mess with lru, no need to move it */
617 		zbudpage_spin_unlock(zbudpage);
618 		spin_unlock(lists_lock);
619 	}
620 out:
621 	return page;
622 }
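/*
 * Illustrative caller-side flush path (sketch only, not part of this
 * driver): the tmem pampd is just the zbudref, and because zbud never
 * allocates or frees pages itself, any pageframe handed back must be
 * freed (or recycled) by the caller.
 */
#if 0
static void example_zbud_flush(struct zbudref *zref, bool eph)
{
	unsigned int zsize, zpages;
	struct page *page;

	page = zbud_free_and_delist(zref, eph, &zsize, &zpages);
	if (page != NULL)
		__free_page(page);
}
#endif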
623 
624 /*
625  * Given a tmem handle, and a kmapped pointer to compressed data of
626  * the given size, try to find an unbuddied zbudpage in which to
627  * create a zbud. If found, put it there, mark the zbudpage unevictable,
628  * and return a zbudref to it.  Else return NULL.
629  */
630 struct zbudref *zbud_match_prep(struct tmem_handle *th, bool eph,
631 				void *cdata, unsigned size)
632 {
633 	struct zbudpage *zbudpage = NULL, *zbudpage2;
634 	unsigned long budnum = 0UL;
635 	unsigned nchunks;
636 	int i, found_good_buddy = 0;
637 	spinlock_t *lists_lock =
638 		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
639 	struct zbud_unbuddied *unbud =
640 		eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;
641 
642 	size += sizeof(struct tmem_handle);
643 	nchunks = zbud_size_to_chunks(size);
644 	for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
645 		spin_lock(lists_lock);
646 		if (!list_empty(&unbud[i].list)) {
647 			list_for_each_entry_safe(zbudpage, zbudpage2,
648 				    &unbud[i].list, budlist) {
649 				if (zbudpage_spin_trylock(zbudpage)) {
650 					found_good_buddy = i;
651 					goto found_unbuddied;
652 				}
653 			}
654 		}
655 		spin_unlock(lists_lock);
656 	}
657 	zbudpage = NULL;
658 	goto out;
659 
660 found_unbuddied:
661 	BUG_ON(!zbudpage_is_locked(zbudpage));
662 	BUG_ON(!((zbudpage->zbud0_size == 0) ^ (zbudpage->zbud1_size == 0)));
663 	if (zbudpage->zbud0_size == 0)
664 		budnum = 0UL;
665 	else if (zbudpage->zbud1_size == 0)
666 		budnum = 1UL;
667 	list_del_init(&zbudpage->budlist);
668 	if (eph) {
669 		list_add_tail(&zbudpage->budlist, &zbud_eph_buddied_list);
670 		unbud[found_good_buddy].count--;
671 		zbud_eph_unbuddied_count--;
672 		zbud_eph_buddied_count++;
673 		/* "promote" raw zbudpage to most-recently-used */
674 		list_del_init(&zbudpage->lru);
675 		list_add_tail(&zbudpage->lru, &zbud_eph_lru_list);
676 	} else {
677 		list_add_tail(&zbudpage->budlist, &zbud_pers_buddied_list);
678 		unbud[found_good_buddy].count--;
679 		zbud_pers_unbuddied_count--;
680 		zbud_pers_buddied_count++;
681 		/* "promote" raw zbudpage to most-recently-used */
682 		list_del_init(&zbudpage->lru);
683 		list_add_tail(&zbudpage->lru, &zbud_pers_lru_list);
684 	}
685 	zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size);
686 	zbudpage->unevictable++;
687 	BUG_ON(zbudpage->unevictable == 3);
688 	zbudpage_spin_unlock(zbudpage);
689 	spin_unlock(lists_lock);
690 out:
691 	return zbudpage_to_zbudref(zbudpage, budnum);
692 
693 }
694 
695 /*
696  * Given a tmem handle, and a kmapped pointer to compressed data of
697  * the given size, and a newly allocated struct page, create an unevictable
698  * zbud in that new page and return a zbudref to it.
699  */
700 struct zbudref *zbud_create_prep(struct tmem_handle *th, bool eph,
701 					void *cdata, unsigned size,
702 					struct page *newpage)
703 {
704 	struct zbudpage *zbudpage;
705 	unsigned long budnum = 0;
706 	unsigned nchunks;
707 	spinlock_t *lists_lock =
708 		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
709 	struct zbud_unbuddied *unbud =
710 		eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;
711 
712 #if 0
713 	/* this may be worth it later to support decompress-in-place? */
714 	static unsigned long counter;
715 	budnum = counter++ & 1;	/* alternate using zbud0 and zbud1 */
716 #endif
717 
718 	if (size > zbud_max_buddy_size())
719 		return NULL;
720 	if (newpage == NULL)
721 		return NULL;
722 
723 	size += sizeof(struct tmem_handle);
724 	nchunks = zbud_size_to_chunks(size);
725 	spin_lock(lists_lock);
726 	zbudpage = zbud_init_zbudpage(newpage, eph);
727 	zbudpage_spin_lock(zbudpage);
728 	list_add_tail(&zbudpage->budlist, &unbud[nchunks].list);
729 	if (eph) {
730 		list_add_tail(&zbudpage->lru, &zbud_eph_lru_list);
731 		zbud_eph_unbuddied_count++;
732 	} else {
733 		list_add_tail(&zbudpage->lru, &zbud_pers_lru_list);
734 		zbud_pers_unbuddied_count++;
735 	}
736 	unbud[nchunks].count++;
737 	zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size);
738 	zbudpage->unevictable++;
739 	BUG_ON(zbudpage->unevictable == 3);
740 	zbudpage_spin_unlock(zbudpage);
741 	spin_unlock(lists_lock);
742 	return zbudpage_to_zbudref(zbudpage, budnum);
743 }
744 
745 /*
746  * Finish creation of a zbud by marking it evictable again (assuming
747  * another zbud isn't being created in the same zbudpage in parallel).
748  */
749 void zbud_create_finish(struct zbudref *zref, bool eph)
750 {
751 	struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
752 	spinlock_t *lists_lock =
753 		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
754 
755 	spin_lock(lists_lock);
756 	zbudpage_spin_lock(zbudpage);
757 	BUG_ON(zbudpage_is_dying(zbudpage));
758 	zbudpage->unevictable--;
759 	BUG_ON((int)zbudpage->unevictable < 0);
760 	zbudpage_spin_unlock(zbudpage);
761 	spin_unlock(lists_lock);
762 }
763 
764 /*
765  * Given a zbudref and a struct page, decompress the data from
766  * the zbud into the physical page represented by the struct page
767  * by upcalling to zcache_decompress
768  */
769 int zbud_decompress(struct page *data_page, struct zbudref *zref, bool eph,
770 			void (*decompress)(char *, unsigned int, char *))
771 {
772 	struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
773 	unsigned long budnum = zbudref_budnum(zref);
774 	void *zbpg;
775 	char *to_va, *from_va;
776 	unsigned size;
777 	int ret = -1;
778 	spinlock_t *lists_lock =
779 		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
780 
781 	spin_lock(lists_lock);
782 	zbudpage_spin_lock(zbudpage);
783 	if (zbudpage_is_dying(zbudpage)) {
784 		/* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
785 		goto out;
786 	}
787 	zbpg = kmap_zbudpage_atomic(zbudpage);
788 	to_va = kmap_atomic(data_page);
789 	if (budnum == 0)
790 		size = zbudpage->zbud0_size;
791 	else
792 		size = zbudpage->zbud1_size;
793 	BUG_ON(size == 0 || size > zbud_max_size());
794 	from_va = zbud_data(zbpg, budnum, size);
795 	from_va += sizeof(struct tmem_handle);
796 	size -= sizeof(struct tmem_handle);
797 	decompress(from_va, size, to_va);
798 	kunmap_atomic(to_va);
799 	kunmap_zbudpage_atomic(zbpg);
800 	ret = 0;
801 out:
802 	zbudpage_spin_unlock(zbudpage);
803 	spin_unlock(lists_lock);
804 	return ret;
805 }
806 
807 /*
808  * Given a zbudref and a kernel pointer, copy the data from
809  * the zbud to the kernel pointer.
810  */
811 int zbud_copy_from_zbud(char *to_va, struct zbudref *zref,
812 				size_t *sizep, bool eph)
813 {
814 	struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
815 	unsigned long budnum = zbudref_budnum(zref);
816 	void *zbpg;
817 	char *from_va;
818 	unsigned size;
819 	int ret = -1;
820 	spinlock_t *lists_lock =
821 		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
822 
823 	spin_lock(lists_lock);
824 	zbudpage_spin_lock(zbudpage);
825 	if (zbudpage_is_dying(zbudpage)) {
826 		/* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
827 		goto out;
828 	}
829 	zbpg = kmap_zbudpage_atomic(zbudpage);
830 	if (budnum == 0)
831 		size = zbudpage->zbud0_size;
832 	else
833 		size = zbudpage->zbud1_size;
834 	BUG_ON(size == 0 || size > zbud_max_size());
835 	from_va = zbud_data(zbpg, budnum, size);
836 	from_va += sizeof(struct tmem_handle);
837 	size -= sizeof(struct tmem_handle);
838 	*sizep = size;
839 	memcpy(to_va, from_va, size);
840 
841 	kunmap_zbudpage_atomic(zbpg);
842 	ret = 0;
843 out:
844 	zbudpage_spin_unlock(zbudpage);
845 	spin_unlock(lists_lock);
846 	return ret;
847 }
848 
849 /*
850  * Given a zbudref and a kernel pointer, copy the data from
851  * the kernel pointer to the zbud.
852  */
853 int zbud_copy_to_zbud(struct zbudref *zref, char *from_va, bool eph)
854 {
855 	struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
856 	unsigned long budnum = zbudref_budnum(zref);
857 	void *zbpg;
858 	char *to_va;
859 	unsigned size;
860 	int ret = -1;
861 	spinlock_t *lists_lock =
862 		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
863 
864 	spin_lock(lists_lock);
865 	zbudpage_spin_lock(zbudpage);
866 	if (zbudpage_is_dying(zbudpage)) {
867 		/* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
868 		goto out;
869 	}
870 	zbpg = kmap_zbudpage_atomic(zbudpage);
871 	if (budnum == 0)
872 		size = zbudpage->zbud0_size;
873 	else
874 		size = zbudpage->zbud1_size;
875 	BUG_ON(size == 0 || size > zbud_max_size());
876 	to_va = zbud_data(zbpg, budnum, size);
877 	to_va += sizeof(struct tmem_handle);
878 	size -= sizeof(struct tmem_handle);
879 	memcpy(to_va, from_va, size);
880 
881 	kunmap_zbudpage_atomic(zbpg);
882 	ret = 0;
883 out:
884 	zbudpage_spin_unlock(zbudpage);
885 	spin_unlock(lists_lock);
886 	return ret;
887 }
888 
889 /*
890  * Choose an ephemeral LRU zbudpage that is evictable (not locked), ensure
891  * there are no references to it remaining, and return the now unused
892  * (and re-init'ed) struct page and the total amount of compressed
893  * data that was evicted.
894  */
895 struct page *zbud_evict_pageframe_lru(unsigned int *zsize, unsigned int *zpages)
896 {
897 	struct zbudpage *zbudpage = NULL, *zbudpage2;
898 	struct zbud_unbuddied *unbud = zbud_eph_unbuddied;
899 	struct page *page = NULL;
900 	bool irqs_disabled = irqs_disabled();
901 
902 	/*
903 	 * Since this can be called indirectly from cleancache_put, which
904 	 * has interrupts disabled, as well as frontswap_put, which does not,
905 	 * we need to be able to handle both cases, even though it is ugly.
906 	 */
907 	if (irqs_disabled)
908 		spin_lock(&zbud_eph_lists_lock);
909 	else
910 		spin_lock_bh(&zbud_eph_lists_lock);
911 	*zsize = 0;
912 	if (list_empty(&zbud_eph_lru_list))
913 		goto unlock_out;
914 	list_for_each_entry_safe(zbudpage, zbudpage2, &zbud_eph_lru_list, lru) {
915 		/* skip a locked zbudpage */
916 		if (unlikely(!zbudpage_spin_trylock(zbudpage)))
917 			continue;
918 		/* skip an unevictable zbudpage */
919 		if (unlikely(zbudpage->unevictable != 0)) {
920 			zbudpage_spin_unlock(zbudpage);
921 			continue;
922 		}
923 		/* got a locked evictable page */
924 		goto evict_page;
925 
926 	}
927 unlock_out:
928 	/* no unlocked evictable pages, give up */
929 	if (irqs_disabled)
930 		spin_unlock(&zbud_eph_lists_lock);
931 	else
932 		spin_unlock_bh(&zbud_eph_lists_lock);
933 	goto out;
934 
935 evict_page:
936 	list_del_init(&zbudpage->budlist);
937 	list_del_init(&zbudpage->lru);
938 	zbudpage_set_dying(zbudpage);
939 	/*
940 	 * the zbudpage is now "dying" and attempts to read, write,
941 	 * or delete data from it will be ignored
942 	 */
943 	if (zbudpage->zbud0_size != 0 && zbudpage->zbud1_size != 0) {
944 		*zsize = zbudpage->zbud0_size + zbudpage->zbud1_size -
945 				(2 * sizeof(struct tmem_handle));
946 		*zpages = 2;
947 	} else if (zbudpage->zbud0_size != 0) {
948 		unbud[zbud_size_to_chunks(zbudpage->zbud0_size)].count--;
949 		*zsize = zbudpage->zbud0_size - sizeof(struct tmem_handle);
950 		*zpages = 1;
951 	} else if (zbudpage->zbud1_size != 0) {
952 		unbud[zbud_size_to_chunks(zbudpage->zbud1_size)].count--;
953 		*zsize = zbudpage->zbud1_size - sizeof(struct tmem_handle);
954 		*zpages = 1;
955 	} else {
956 		BUG();
957 	}
958 	spin_unlock(&zbud_eph_lists_lock);
959 	zbud_eph_evicted_pageframes++;
960 	if (*zpages == 1)
961 		zbud_eph_unbuddied_count--;
962 	else
963 		zbud_eph_buddied_count--;
964 	zbud_evict_tmem(zbudpage);
965 	zbudpage_spin_lock(zbudpage);
966 	zbudpage_clear_dying(zbudpage);
967 	page = zbud_unuse_zbudpage(zbudpage, true);
968 	if (!irqs_disabled)
969 		local_bh_enable();
970 out:
971 	return page;
972 }
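/*
 * Sketch (not part of this driver) of how a caller such as zcache might
 * drive the ephemeral eviction path above to reclaim one raw pageframe;
 * here the recovered page is simply returned to the page allocator.
 */
#if 0
static bool example_evict_one_pageframe(void)
{
	unsigned int zsize = 0, zpages = 0;
	struct page *page;

	page = zbud_evict_pageframe_lru(&zsize, &zpages);
	if (page == NULL)
		return false;	/* nothing evictable right now */
	/* zsize/zpages report how much compressed data was discarded */
	__free_page(page);
	return true;
}
#endif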
973 
974 /*
975  * Choose a persistent LRU zbudpage that is evictable (not locked), zombify it,
976  * read the tmem_handle(s) out of it into the passed array, and return the
977  * number of zbuds.  Caller must perform necessary tmem functions and,
978  * indirectly, zbud functions to fetch any valid data and cause the
979  * now-zombified zbudpage to eventually be freed.  We track the zombified
980  * zbudpage count so it is possible to observe if there is a leak.
981  * FIXME: describe (ramster) case where data pointers are passed in for memcpy
982  */
983 unsigned int zbud_make_zombie_lru(struct tmem_handle *th, unsigned char **data,
984 					unsigned int *zsize, bool eph)
985 {
986 	struct zbudpage *zbudpage = NULL, *zbudpage2;
987 	struct tmem_handle *thfrom;
988 	char *from_va;
989 	void *zbpg;
990 	unsigned size;
991 	int ret = 0, i;
992 	spinlock_t *lists_lock =
993 		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
994 	struct list_head *lru_list =
995 		eph ? &zbud_eph_lru_list : &zbud_pers_lru_list;
996 
997 	spin_lock_bh(lists_lock);
998 	if (list_empty(lru_list))
999 		goto out;
1000 	list_for_each_entry_safe(zbudpage, zbudpage2, lru_list, lru) {
1001 		/* skip a locked zbudpage */
1002 		if (unlikely(!zbudpage_spin_trylock(zbudpage)))
1003 			continue;
1004 		/* skip an unevictable zbudpage */
1005 		if (unlikely(zbudpage->unevictable != 0)) {
1006 			zbudpage_spin_unlock(zbudpage);
1007 			continue;
1008 		}
1009 		/* got a locked evictable page */
1010 		goto zombify_page;
1011 	}
1012 	/* no unlocked evictable pages, give up */
1013 	goto out;
1014 
1015 zombify_page:
1016 	/* got an unlocked evictable page, zombify it */
1017 	list_del_init(&zbudpage->budlist);
1018 	zbudpage_set_zombie(zbudpage);
1019 	/* FIXME what accounting do I need to do here? */
1020 	list_del_init(&zbudpage->lru);
1021 	if (eph) {
1022 		list_add_tail(&zbudpage->lru, &zbud_eph_zombie_list);
1023 		zbud_eph_zombie_count =
1024 				atomic_inc_return(&zbud_eph_zombie_atomic);
1025 	} else {
1026 		list_add_tail(&zbudpage->lru, &zbud_pers_zombie_list);
1027 		zbud_pers_zombie_count =
1028 				atomic_inc_return(&zbud_pers_zombie_atomic);
1029 	}
1030 	/* FIXME what accounting do I need to do here? */
1031 	zbpg = kmap_zbudpage_atomic(zbudpage);
1032 	for (i = 0; i < 2; i++) {
1033 		size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size;
1034 		if (size) {
1035 			from_va = zbud_data(zbpg, i, size);
1036 			thfrom = (struct tmem_handle *)from_va;
1037 			from_va += sizeof(struct tmem_handle);
1038 			size -= sizeof(struct tmem_handle);
1039 			if (th != NULL)
1040 				th[ret] = *thfrom;
1041 			if (data != NULL)
1042 				memcpy(data[ret], from_va, size);
1043 			if (zsize != NULL)
1044 				*zsize++ = size;
1045 			ret++;
1046 		}
1047 	}
1048 	kunmap_zbudpage_atomic(zbpg);
1049 	zbudpage_spin_unlock(zbudpage);
1050 out:
1051 	spin_unlock_bh(lists_lock);
1052 	return ret;
1053 }
1054 
1055 void zbud_init(void)
1056 {
1057 	int i;
1058 
1059 	zbud_debugfs_init();
1060 	BUG_ON((sizeof(struct tmem_handle) * 2 > CHUNK_SIZE));
1061 	BUG_ON(sizeof(struct zbudpage) > sizeof(struct page));
1062 	for (i = 0; i < NCHUNKS; i++) {
1063 		INIT_LIST_HEAD(&zbud_eph_unbuddied[i].list);
1064 		INIT_LIST_HEAD(&zbud_pers_unbuddied[i].list);
1065 	}
1066 }
1067