/*
 * zswap.c - zswap driver file
 *
 * zswap is a backend for frontswap that takes pages that are in the process
 * of being swapped out and attempts to compress and store them in a
 * RAM-based memory pool.  This can result in a significant I/O reduction on
 * the swap device and, in the case where decompressing from RAM is faster
 * than reading from the swap device, can also improve workload performance.
 *
 * Copyright (C) 2012  Seth Jennings <sjenning@linux.vnet.ibm.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
*/

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/frontswap.h>
#include <linux/rbtree.h>
#include <linux/swap.h>
#include <linux/crypto.h>
#include <linux/mempool.h>
#include <linux/zpool.h>

#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>

/*********************************
* statistics
**********************************/
/* Total bytes used by the compressed storage */
static u64 zswap_pool_total_size;
/* The number of compressed pages currently stored in zswap */
static atomic_t zswap_stored_pages = ATOMIC_INIT(0);

/*
 * The statistics below are not protected from concurrent access for
 * performance reasons so they may not be 100% accurate.  However,
 * they do provide useful information on roughly how many times a
 * certain event is occurring.
*/

/* Pool limit was hit (see zswap_max_pool_percent) */
static u64 zswap_pool_limit_hit;
/* Pages written back when pool limit was reached */
static u64 zswap_written_back_pages;
/* Store failed due to a reclaim failure after pool limit was reached */
static u64 zswap_reject_reclaim_fail;
/* Compressed page was too big for the allocator to (optimally) store */
static u64 zswap_reject_compress_poor;
/* Store failed because underlying allocator could not get memory */
static u64 zswap_reject_alloc_fail;
/* Store failed because the entry metadata could not be allocated (rare) */
static u64 zswap_reject_kmemcache_fail;
/* Duplicate store was encountered (rare) */
static u64 zswap_duplicate_entry;
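/*
 * Note: the counters above are exported read-only via debugfs (see
 * zswap_debugfs_init() below), typically under /sys/kernel/debug/zswap/.
 * As a rough worked example of how they can be used together, the effective
 * compression ratio can be estimated as
 *
 *   ratio ~= (zswap_stored_pages * PAGE_SIZE) / zswap_pool_total_size
 *
 * e.g. 1000 stored 4 KiB pages occupying 2000000 bytes of pool memory is a
 * ratio of roughly 2:1.  The debugfs mount point is a property of the
 * running system, not of this file.
 */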
/*********************************
* tunables
**********************************/

/* Enable/disable zswap (disabled by default) */
static bool zswap_enabled;
static int zswap_enabled_param_set(const char *,
				   const struct kernel_param *);
static struct kernel_param_ops zswap_enabled_param_ops = {
	.set = zswap_enabled_param_set,
	.get = param_get_bool,
};
module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);

/* Crypto compressor to use */
#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
static int zswap_compressor_param_set(const char *,
				      const struct kernel_param *);
static struct kernel_param_ops zswap_compressor_param_ops = {
	.set = zswap_compressor_param_set,
	.get = param_get_charp,
	.free = param_free_charp,
};
module_param_cb(compressor, &zswap_compressor_param_ops,
		&zswap_compressor, 0644);

/* Compressed storage zpool to use */
#define ZSWAP_ZPOOL_DEFAULT "zbud"
static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
static int zswap_zpool_param_set(const char *, const struct kernel_param *);
static struct kernel_param_ops zswap_zpool_param_ops = {
	.set = zswap_zpool_param_set,
	.get = param_get_charp,
	.free = param_free_charp,
};
module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);

/* The maximum percentage of memory that the compressed pool can occupy */
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
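/*
 * Configuration sketch (not part of the driver itself): because the
 * parameters above are registered with mode 0644, they can be given on the
 * kernel command line, e.g.
 *
 *   zswap.enabled=1 zswap.compressor=lz4 zswap.zpool=zbud zswap.max_pool_percent=20
 *
 * or changed at runtime through sysfs, e.g.
 *
 *   echo 1   > /sys/module/zswap/parameters/enabled
 *   echo lz4 > /sys/module/zswap/parameters/compressor
 *
 * "lz4" here is only an example value; any compressor known to the crypto
 * API (and any allocator known to zpool) can be named, and the *_param_set
 * callbacks below validate the choice before switching pools.
 */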
/*********************************
* data structures
**********************************/

struct zswap_pool {
	struct zpool *zpool;
	struct crypto_comp * __percpu *tfm;
	struct kref kref;
	struct list_head list;
	struct work_struct work;
	struct notifier_block notifier;
	char tfm_name[CRYPTO_MAX_ALG_NAME];
};

/*
 * struct zswap_entry
 *
 * This structure contains the metadata for tracking a single compressed
 * page within zswap.
 *
 * rbnode - links the entry into red-black tree for the appropriate swap type
 * offset - the swap offset for the entry.  Index into the red-black tree.
 * refcount - the number of outstanding references to the entry. This is needed
 *            to protect against premature freeing of the entry by concurrent
 *            calls to load, invalidate, and writeback.  The lock
 *            for the zswap_tree structure that contains the entry must
 *            be held while changing the refcount.  Since the lock must
 *            be held, there is no reason to also make refcount atomic.
 * length - the length in bytes of the compressed page data.  Needed during
 *          decompression
 * pool - the zswap_pool the entry's data is in
 * handle - zpool allocation handle that stores the compressed page data
 */
struct zswap_entry {
	struct rb_node rbnode;
	pgoff_t offset;
	int refcount;
	unsigned int length;
	struct zswap_pool *pool;
	unsigned long handle;
};

struct zswap_header {
	swp_entry_t swpentry;
};

/*
 * The tree lock in the zswap_tree struct protects a few things:
 * - the rbtree
 * - the refcount field of each entry in the tree
 */
struct zswap_tree {
	struct rb_root rbroot;
	spinlock_t lock;
};

static struct zswap_tree *zswap_trees[MAX_SWAPFILES];

/* RCU-protected iteration */
static LIST_HEAD(zswap_pools);
/* protects zswap_pools list modification */
static DEFINE_SPINLOCK(zswap_pools_lock);
/* pool counter to provide unique names to zpool */
static atomic_t zswap_pools_count = ATOMIC_INIT(0);

/* used by param callback function */
static bool zswap_init_started;

/* fatal error during init */
static bool zswap_init_failed;

/*********************************
* helpers and fwd declarations
**********************************/

#define zswap_pool_debug(msg, p)				\
	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
		 zpool_get_type((p)->zpool))

static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
static int zswap_pool_get(struct zswap_pool *pool);
static void zswap_pool_put(struct zswap_pool *pool);

static const struct zpool_ops zswap_zpool_ops = {
	.evict = zswap_writeback_entry
};

static bool zswap_is_full(void)
{
	return totalram_pages * zswap_max_pool_percent / 100 <
		DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}
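/*
 * Illustrative numbers only (not taken from any particular machine): with
 * totalram_pages == 1048576 (4 GiB of 4 KiB pages) and the default
 * zswap_max_pool_percent of 20, zswap_is_full() starts returning true once
 * the compressed pool grows past 1048576 * 20 / 100 = 209715 pages, i.e.
 * roughly 819 MiB of zpool memory.  Note that the comparison is against the
 * memory consumed by the compressed pool itself, not against the number of
 * uncompressed pages it represents.
 */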
static void zswap_update_total_size(void)
{
	struct zswap_pool *pool;
	u64 total = 0;

	rcu_read_lock();

	list_for_each_entry_rcu(pool, &zswap_pools, list)
		total += zpool_get_total_size(pool->zpool);

	rcu_read_unlock();

	zswap_pool_total_size = total;
}

/*********************************
* zswap entry functions
**********************************/
static struct kmem_cache *zswap_entry_cache;

static int __init zswap_entry_cache_create(void)
{
	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
	return zswap_entry_cache == NULL;
}

static void __init zswap_entry_cache_destroy(void)
{
	kmem_cache_destroy(zswap_entry_cache);
}

static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
{
	struct zswap_entry *entry;
	entry = kmem_cache_alloc(zswap_entry_cache, gfp);
	if (!entry)
		return NULL;
	entry->refcount = 1;
	RB_CLEAR_NODE(&entry->rbnode);
	return entry;
}

static void zswap_entry_cache_free(struct zswap_entry *entry)
{
	kmem_cache_free(zswap_entry_cache, entry);
}

/*********************************
* rbtree functions
**********************************/
static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
{
	struct rb_node *node = root->rb_node;
	struct zswap_entry *entry;

	while (node) {
		entry = rb_entry(node, struct zswap_entry, rbnode);
		if (entry->offset > offset)
			node = node->rb_left;
		else if (entry->offset < offset)
			node = node->rb_right;
		else
			return entry;
	}
	return NULL;
}

/*
 * In the case that an entry with the same offset is found, a pointer to
 * the existing entry is stored in dupentry and the function returns -EEXIST
 */
static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
			struct zswap_entry **dupentry)
{
	struct rb_node **link = &root->rb_node, *parent = NULL;
	struct zswap_entry *myentry;

	while (*link) {
		parent = *link;
		myentry = rb_entry(parent, struct zswap_entry, rbnode);
		if (myentry->offset > entry->offset)
			link = &(*link)->rb_left;
		else if (myentry->offset < entry->offset)
			link = &(*link)->rb_right;
		else {
			*dupentry = myentry;
			return -EEXIST;
		}
	}
	rb_link_node(&entry->rbnode, parent, link);
	rb_insert_color(&entry->rbnode, root);
	return 0;
}

static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
{
	if (!RB_EMPTY_NODE(&entry->rbnode)) {
		rb_erase(&entry->rbnode, root);
		RB_CLEAR_NODE(&entry->rbnode);
	}
}

/*
 * Carries out the common pattern of freeing an entry's zpool allocation,
 * freeing the entry itself, and decrementing the number of stored pages.
 */
static void zswap_free_entry(struct zswap_entry *entry)
{
	zpool_free(entry->pool->zpool, entry->handle);
	zswap_pool_put(entry->pool);
	zswap_entry_cache_free(entry);
	atomic_dec(&zswap_stored_pages);
	zswap_update_total_size();
}

/* caller must hold the tree lock */
static void zswap_entry_get(struct zswap_entry *entry)
{
	entry->refcount++;
}

/* caller must hold the tree lock
 * remove from the tree and free it, if nobody references the entry
 */
static void zswap_entry_put(struct zswap_tree *tree,
			struct zswap_entry *entry)
{
	int refcount = --entry->refcount;

	BUG_ON(refcount < 0);
	if (refcount == 0) {
		zswap_rb_erase(&tree->rbroot, entry);
		zswap_free_entry(entry);
	}
}

/* caller must hold the tree lock */
static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
				pgoff_t offset)
{
	struct zswap_entry *entry;

	entry = zswap_rb_search(root, offset);
	if (entry)
		zswap_entry_get(entry);

	return entry;
}
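/*
 * Reference-count lifecycle, summarized from the functions above (a reading
 * aid, not new behaviour): zswap_entry_cache_alloc() returns an entry with
 * refcount == 1; that initial reference belongs to the rbtree and is dropped
 * by invalidate (or when a duplicate store replaces the entry).  Load and
 * writeback take a temporary reference through zswap_entry_find_get() and
 * release it with zswap_entry_put().  When the count reaches zero the entry
 * is erased from the tree and freed, so an entry can outlive its removal
 * from the tree for as long as a loader or writeback still holds it.
 */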
/*********************************
* per-cpu code
**********************************/
static DEFINE_PER_CPU(u8 *, zswap_dstmem);

static int __zswap_cpu_dstmem_notifier(unsigned long action, unsigned long cpu)
{
	u8 *dst;

	switch (action) {
	case CPU_UP_PREPARE:
		dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
		if (!dst) {
			pr_err("can't allocate compressor buffer\n");
			return NOTIFY_BAD;
		}
		per_cpu(zswap_dstmem, cpu) = dst;
		break;
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		dst = per_cpu(zswap_dstmem, cpu);
		kfree(dst);
		per_cpu(zswap_dstmem, cpu) = NULL;
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static int zswap_cpu_dstmem_notifier(struct notifier_block *nb,
				     unsigned long action, void *pcpu)
{
	return __zswap_cpu_dstmem_notifier(action, (unsigned long)pcpu);
}

static struct notifier_block zswap_dstmem_notifier = {
	.notifier_call = zswap_cpu_dstmem_notifier,
};

static int __init zswap_cpu_dstmem_init(void)
{
	unsigned long cpu;

	cpu_notifier_register_begin();
	for_each_online_cpu(cpu)
		if (__zswap_cpu_dstmem_notifier(CPU_UP_PREPARE, cpu) ==
		    NOTIFY_BAD)
			goto cleanup;
	__register_cpu_notifier(&zswap_dstmem_notifier);
	cpu_notifier_register_done();
	return 0;

cleanup:
	for_each_online_cpu(cpu)
		__zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
	cpu_notifier_register_done();
	return -ENOMEM;
}

static void zswap_cpu_dstmem_destroy(void)
{
	unsigned long cpu;

	cpu_notifier_register_begin();
	for_each_online_cpu(cpu)
		__zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
	__unregister_cpu_notifier(&zswap_dstmem_notifier);
	cpu_notifier_register_done();
}

static int __zswap_cpu_comp_notifier(struct zswap_pool *pool,
				     unsigned long action, unsigned long cpu)
{
	struct crypto_comp *tfm;

	switch (action) {
	case CPU_UP_PREPARE:
		if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
			break;
		tfm = crypto_alloc_comp(pool->tfm_name, 0, 0);
		if (IS_ERR_OR_NULL(tfm)) {
			pr_err("could not alloc crypto comp %s : %ld\n",
			       pool->tfm_name, PTR_ERR(tfm));
			return NOTIFY_BAD;
		}
		*per_cpu_ptr(pool->tfm, cpu) = tfm;
		break;
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		tfm = *per_cpu_ptr(pool->tfm, cpu);
		if (!IS_ERR_OR_NULL(tfm))
			crypto_free_comp(tfm);
		*per_cpu_ptr(pool->tfm, cpu) = NULL;
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static int zswap_cpu_comp_notifier(struct notifier_block *nb,
				   unsigned long action, void *pcpu)
{
	unsigned long cpu = (unsigned long)pcpu;
	struct zswap_pool *pool = container_of(nb, typeof(*pool), notifier);

	return __zswap_cpu_comp_notifier(pool, action, cpu);
}

static int zswap_cpu_comp_init(struct zswap_pool *pool)
{
	unsigned long cpu;

	memset(&pool->notifier, 0, sizeof(pool->notifier));
	pool->notifier.notifier_call = zswap_cpu_comp_notifier;

	cpu_notifier_register_begin();
	for_each_online_cpu(cpu)
		if (__zswap_cpu_comp_notifier(pool, CPU_UP_PREPARE, cpu) ==
		    NOTIFY_BAD)
			goto cleanup;
	__register_cpu_notifier(&pool->notifier);
	cpu_notifier_register_done();
	return 0;

cleanup:
	for_each_online_cpu(cpu)
		__zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
	cpu_notifier_register_done();
	return -ENOMEM;
}

static void zswap_cpu_comp_destroy(struct zswap_pool *pool)
{
	unsigned long cpu;

	cpu_notifier_register_begin();
	for_each_online_cpu(cpu)
		__zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
	__unregister_cpu_notifier(&pool->notifier);
	cpu_notifier_register_done();
}
/*********************************
* pool functions
**********************************/

static struct zswap_pool *__zswap_pool_current(void)
{
	struct zswap_pool *pool;

	pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
	WARN_ON(!pool);

	return pool;
}

static struct zswap_pool *zswap_pool_current(void)
{
	assert_spin_locked(&zswap_pools_lock);

	return __zswap_pool_current();
}

static struct zswap_pool *zswap_pool_current_get(void)
{
	struct zswap_pool *pool;

	rcu_read_lock();

	pool = __zswap_pool_current();
	if (!pool || !zswap_pool_get(pool))
		pool = NULL;

	rcu_read_unlock();

	return pool;
}

static struct zswap_pool *zswap_pool_last_get(void)
{
	struct zswap_pool *pool, *last = NULL;

	rcu_read_lock();

	list_for_each_entry_rcu(pool, &zswap_pools, list)
		last = pool;
	if (!WARN_ON(!last) && !zswap_pool_get(last))
		last = NULL;

	rcu_read_unlock();

	return last;
}

/* type and compressor must be null-terminated */
static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
{
	struct zswap_pool *pool;

	assert_spin_locked(&zswap_pools_lock);

	list_for_each_entry_rcu(pool, &zswap_pools, list) {
		if (strcmp(pool->tfm_name, compressor))
			continue;
		if (strcmp(zpool_get_type(pool->zpool), type))
			continue;
		/* if we can't get it, it's about to be destroyed */
		if (!zswap_pool_get(pool))
			continue;
		return pool;
	}

	return NULL;
}

static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
	struct zswap_pool *pool;
	char name[38]; /* 'zswap' + 32 char (max) num + \0 */
	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;

	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool) {
		pr_err("pool alloc failed\n");
		return NULL;
	}

	/* unique name for each pool specifically required by zsmalloc */
	snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));

	pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops);
	if (!pool->zpool) {
		pr_err("%s zpool not available\n", type);
		goto error;
	}
	pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));

	strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
	pool->tfm = alloc_percpu(struct crypto_comp *);
	if (!pool->tfm) {
		pr_err("percpu alloc failed\n");
		goto error;
	}

	if (zswap_cpu_comp_init(pool))
		goto error;
	pr_debug("using %s compressor\n", pool->tfm_name);

	/* being the current pool takes 1 ref; this func expects the
	 * caller to always add the new pool as the current pool
	 */
	kref_init(&pool->kref);
	INIT_LIST_HEAD(&pool->list);

	zswap_pool_debug("created", pool);

	return pool;

error:
	free_percpu(pool->tfm);
	if (pool->zpool)
		zpool_destroy_pool(pool->zpool);
	kfree(pool);
	return NULL;
}

static __init struct zswap_pool *__zswap_pool_create_fallback(void)
{
	if (!crypto_has_comp(zswap_compressor, 0, 0)) {
		if (!strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) {
			pr_err("default compressor %s not available\n",
			       zswap_compressor);
			return NULL;
		}
		pr_err("compressor %s not available, using default %s\n",
		       zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT);
		param_free_charp(&zswap_compressor);
		zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
	}
	if (!zpool_has_pool(zswap_zpool_type)) {
		if (!strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
			pr_err("default zpool %s not available\n",
			       zswap_zpool_type);
			return NULL;
		}
		pr_err("zpool %s not available, using default %s\n",
		       zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT);
		param_free_charp(&zswap_zpool_type);
		zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
	}

	return zswap_pool_create(zswap_zpool_type, zswap_compressor);
}
static void zswap_pool_destroy(struct zswap_pool *pool)
{
	zswap_pool_debug("destroying", pool);

	zswap_cpu_comp_destroy(pool);
	free_percpu(pool->tfm);
	zpool_destroy_pool(pool->zpool);
	kfree(pool);
}

static int __must_check zswap_pool_get(struct zswap_pool *pool)
{
	return kref_get_unless_zero(&pool->kref);
}

static void __zswap_pool_release(struct work_struct *work)
{
	struct zswap_pool *pool = container_of(work, typeof(*pool), work);

	synchronize_rcu();

	/* nobody should have been able to get a kref... */
	WARN_ON(kref_get_unless_zero(&pool->kref));

	/* pool is now off zswap_pools list and has no references. */
	zswap_pool_destroy(pool);
}

static void __zswap_pool_empty(struct kref *kref)
{
	struct zswap_pool *pool;

	pool = container_of(kref, typeof(*pool), kref);

	spin_lock(&zswap_pools_lock);

	WARN_ON(pool == zswap_pool_current());

	list_del_rcu(&pool->list);

	INIT_WORK(&pool->work, __zswap_pool_release);
	schedule_work(&pool->work);

	spin_unlock(&zswap_pools_lock);
}

static void zswap_pool_put(struct zswap_pool *pool)
{
	kref_put(&pool->kref, __zswap_pool_empty);
}
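/*
 * Pool lifetime, summarized from the code above and the param callbacks
 * below (a reading aid, not new behaviour): the pool at the head of
 * zswap_pools is the "current" pool and holds one reference for that role;
 * every stored entry holds a further reference on the pool its data lives
 * in.  When the compressor or zpool parameter changes, a new (or matching
 * existing) pool is placed at the head and the old current pool's reference
 * is dropped.  The old pool keeps serving loads and writeback until its last
 * entry is freed, at which point __zswap_pool_empty() unlinks it and
 * __zswap_pool_release() destroys it after an RCU grace period.
 */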
/*********************************
* param callbacks
**********************************/

/* val must be a null-terminated string */
static int __zswap_param_set(const char *val, const struct kernel_param *kp,
			     char *type, char *compressor)
{
	struct zswap_pool *pool, *put_pool = NULL;
	char *s = strstrip((char *)val);
	int ret;

	if (zswap_init_failed) {
		pr_err("can't set param, initialization failed\n");
		return -ENODEV;
	}

	/* no change required */
	if (!strcmp(s, *(char **)kp->arg))
		return 0;

	/* if this is load-time (pre-init) param setting,
	 * don't create a pool; that's done during init.
	 */
	if (!zswap_init_started)
		return param_set_charp(s, kp);

	if (!type) {
		if (!zpool_has_pool(s)) {
			pr_err("zpool %s not available\n", s);
			return -ENOENT;
		}
		type = s;
	} else if (!compressor) {
		if (!crypto_has_comp(s, 0, 0)) {
			pr_err("compressor %s not available\n", s);
			return -ENOENT;
		}
		compressor = s;
	} else {
		WARN_ON(1);
		return -EINVAL;
	}

	spin_lock(&zswap_pools_lock);

	pool = zswap_pool_find_get(type, compressor);
	if (pool) {
		zswap_pool_debug("using existing", pool);
		WARN_ON(pool == zswap_pool_current());
		list_del_rcu(&pool->list);
	}

	spin_unlock(&zswap_pools_lock);

	if (!pool)
		pool = zswap_pool_create(type, compressor);

	if (pool)
		ret = param_set_charp(s, kp);
	else
		ret = -EINVAL;

	spin_lock(&zswap_pools_lock);

	if (!ret) {
		put_pool = zswap_pool_current();
		list_add_rcu(&pool->list, &zswap_pools);
	} else if (pool) {
		/* add the possibly pre-existing pool to the end of the pools
		 * list; if it's new (and empty) then it'll be removed and
		 * destroyed by the put after we drop the lock
		 */
		list_add_tail_rcu(&pool->list, &zswap_pools);
		put_pool = pool;
	}

	spin_unlock(&zswap_pools_lock);

	/* drop the ref from either the old current pool,
	 * or the new pool we failed to add
	 */
	if (put_pool)
		zswap_pool_put(put_pool);

	return ret;
}

static int zswap_compressor_param_set(const char *val,
				      const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
}

static int zswap_zpool_param_set(const char *val,
				 const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, NULL, zswap_compressor);
}

static int zswap_enabled_param_set(const char *val,
				   const struct kernel_param *kp)
{
	if (zswap_init_failed) {
		pr_err("can't enable, initialization failed\n");
		return -ENODEV;
	}

	return param_set_bool(val, kp);
}

/*********************************
* writeback code
**********************************/
/* return enum for zswap_get_swap_cache_page */
enum zswap_get_swap_ret {
	ZSWAP_SWAPCACHE_NEW,
	ZSWAP_SWAPCACHE_EXIST,
	ZSWAP_SWAPCACHE_FAIL,
};

/*
 * zswap_get_swap_cache_page
 *
 * This is an adaptation of read_swap_cache_async()
 *
 * This function tries to find a page with the given swap entry
 * in the swapper_space address space (the swap cache).  If the page
 * is found, it is returned in retpage.  Otherwise, a page is allocated,
 * added to the swap cache, and returned in retpage.
 *
 * On success, the swap cache page is returned in retpage
 * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
 * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
 *     the new page is added to swapcache and locked
 * Returns ZSWAP_SWAPCACHE_FAIL on error
 */
static int zswap_get_swap_cache_page(swp_entry_t entry,
				struct page **retpage)
{
	bool page_was_allocated;

	*retpage = __read_swap_cache_async(entry, GFP_KERNEL,
			NULL, 0, &page_was_allocated);
	if (page_was_allocated)
		return ZSWAP_SWAPCACHE_NEW;
	if (!*retpage)
		return ZSWAP_SWAPCACHE_FAIL;
	return ZSWAP_SWAPCACHE_EXIST;
}

/*
 * Attempts to free an entry by adding a page to the swap cache,
 * decompressing the entry data into the page, and issuing a
 * bio write to write the page back to the swap device.
 *
 * This can be thought of as a "resumed writeback" of the page
 * to the swap device.  We are basically resuming the same swap
 * writeback path that was intercepted with the frontswap_store()
 * in the first place.  After the page has been decompressed into
 * the swap cache, the compressed version stored by zswap can be
 * freed.
 */
static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
{
	struct zswap_header *zhdr;
	swp_entry_t swpentry;
	struct zswap_tree *tree;
	pgoff_t offset;
	struct zswap_entry *entry;
	struct page *page;
	struct crypto_comp *tfm;
	u8 *src, *dst;
	unsigned int dlen;
	int ret;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
	};

	/* extract swpentry from data */
	zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
	swpentry = zhdr->swpentry; /* here */
	zpool_unmap_handle(pool, handle);
	tree = zswap_trees[swp_type(swpentry)];
	offset = swp_offset(swpentry);

	/* find and ref zswap entry */
	spin_lock(&tree->lock);
	entry = zswap_entry_find_get(&tree->rbroot, offset);
	if (!entry) {
		/* entry was invalidated */
		spin_unlock(&tree->lock);
		return 0;
	}
	spin_unlock(&tree->lock);
	BUG_ON(offset != entry->offset);

	/* try to allocate swap cache page */
	switch (zswap_get_swap_cache_page(swpentry, &page)) {
	case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
		ret = -ENOMEM;
		goto fail;

	case ZSWAP_SWAPCACHE_EXIST:
		/* page is already in the swap cache, ignore for now */
		put_page(page);
		ret = -EEXIST;
		goto fail;

	case ZSWAP_SWAPCACHE_NEW: /* page is locked */
		/* decompress */
		dlen = PAGE_SIZE;
		src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
				ZPOOL_MM_RO) + sizeof(struct zswap_header);
		dst = kmap_atomic(page);
		tfm = *get_cpu_ptr(entry->pool->tfm);
		ret = crypto_comp_decompress(tfm, src, entry->length,
					     dst, &dlen);
		put_cpu_ptr(entry->pool->tfm);
		kunmap_atomic(dst);
		zpool_unmap_handle(entry->pool->zpool, entry->handle);
		BUG_ON(ret);
		BUG_ON(dlen != PAGE_SIZE);

		/* page is up to date */
		SetPageUptodate(page);
	}

	/* move it to the tail of the inactive list after end_writeback */
	SetPageReclaim(page);

	/* start writeback */
	__swap_writepage(page, &wbc, end_swap_bio_write);
	put_page(page);
	zswap_written_back_pages++;

	spin_lock(&tree->lock);
	/* drop local reference */
	zswap_entry_put(tree, entry);

	/*
	 * There are two possible situations for entry here:
	 * (1) refcount is 1 (normal case), entry is valid and on the tree
	 * (2) refcount is 0, entry is freed and not on the tree
	 *     because invalidate happened during writeback
	 * Search the tree and drop the tree's reference only if the entry
	 * is still found there.
	 */
	if (entry == zswap_rb_search(&tree->rbroot, offset))
		zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);

	goto end;

	/*
	 * If we get here due to ZSWAP_SWAPCACHE_EXIST, a load may be
	 * happening concurrently, so it is safe and okay not to free the
	 * entry here.  The following put only drops our local reference,
	 * and even if that ends up freeing the entry, returning nonzero
	 * is still fine.
	 */
fail:
	spin_lock(&tree->lock);
	zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);

end:
	return ret;
}

static int zswap_shrink(void)
{
	struct zswap_pool *pool;
	int ret;

	pool = zswap_pool_last_get();
	if (!pool)
		return -ENOENT;

	ret = zpool_shrink(pool->zpool, 1, NULL);

	zswap_pool_put(pool);

	return ret;
}

/*********************************
* frontswap hooks
**********************************/
/* attempts to compress and store a single page */
static int zswap_frontswap_store(unsigned type, pgoff_t offset,
				struct page *page)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry, *dupentry;
	struct crypto_comp *tfm;
	int ret;
	unsigned int dlen = PAGE_SIZE, len;
	unsigned long handle;
	char *buf;
	u8 *src, *dst;
	struct zswap_header *zhdr;

	if (!zswap_enabled || !tree) {
		ret = -ENODEV;
		goto reject;
	}

	/* reclaim space if needed */
	if (zswap_is_full()) {
		zswap_pool_limit_hit++;
		if (zswap_shrink()) {
			zswap_reject_reclaim_fail++;
			ret = -ENOMEM;
			goto reject;
		}
	}

	/* allocate entry */
	entry = zswap_entry_cache_alloc(GFP_KERNEL);
	if (!entry) {
		zswap_reject_kmemcache_fail++;
		ret = -ENOMEM;
		goto reject;
	}

	/* if entry is successfully added, it keeps the reference */
	entry->pool = zswap_pool_current_get();
	if (!entry->pool) {
		ret = -EINVAL;
		goto freepage;
	}

	/* compress */
	dst = get_cpu_var(zswap_dstmem);
	tfm = *get_cpu_ptr(entry->pool->tfm);
	src = kmap_atomic(page);
	ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen);
	kunmap_atomic(src);
	put_cpu_ptr(entry->pool->tfm);
	if (ret) {
		ret = -EINVAL;
		goto put_dstmem;
	}

	/* store */
	len = dlen + sizeof(struct zswap_header);
	ret = zpool_malloc(entry->pool->zpool, len,
			   __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM,
			   &handle);
	if (ret == -ENOSPC) {
		zswap_reject_compress_poor++;
		goto put_dstmem;
	}
	if (ret) {
		zswap_reject_alloc_fail++;
		goto put_dstmem;
	}
	zhdr = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
	zhdr->swpentry = swp_entry(type, offset);
	buf = (u8 *)(zhdr + 1);
	memcpy(buf, dst, dlen);
	zpool_unmap_handle(entry->pool->zpool, handle);
	put_cpu_var(zswap_dstmem);

	/* populate entry */
	entry->offset = offset;
	entry->handle = handle;
	entry->length = dlen;

	/* map */
	spin_lock(&tree->lock);
	do {
		ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
		if (ret == -EEXIST) {
			zswap_duplicate_entry++;
			/* remove from rbtree */
			zswap_rb_erase(&tree->rbroot, dupentry);
			zswap_entry_put(tree, dupentry);
		}
	} while (ret == -EEXIST);
	spin_unlock(&tree->lock);

	/* update stats */
	atomic_inc(&zswap_stored_pages);
	zswap_update_total_size();

	return 0;

put_dstmem:
	put_cpu_var(zswap_dstmem);
	zswap_pool_put(entry->pool);
freepage:
	zswap_entry_cache_free(entry);
reject:
	return ret;
}

/*
 * returns 0 if the page was successfully decompressed
 * returns -1 on entry not found or error
*/
static int zswap_frontswap_load(unsigned type, pgoff_t offset,
				struct page *page)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry;
	struct crypto_comp *tfm;
	u8 *src, *dst;
	unsigned int dlen;
	int ret;

	/* find */
	spin_lock(&tree->lock);
	entry = zswap_entry_find_get(&tree->rbroot, offset);
	if (!entry) {
		/* entry was written back */
		spin_unlock(&tree->lock);
		return -1;
	}
	spin_unlock(&tree->lock);

	/* decompress */
	dlen = PAGE_SIZE;
	src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
			ZPOOL_MM_RO) + sizeof(struct zswap_header);
	dst = kmap_atomic(page);
	tfm = *get_cpu_ptr(entry->pool->tfm);
	ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
	put_cpu_ptr(entry->pool->tfm);
	kunmap_atomic(dst);
	zpool_unmap_handle(entry->pool->zpool, entry->handle);
	BUG_ON(ret);

	spin_lock(&tree->lock);
	zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);

	return 0;
}

/* frees an entry in zswap */
static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry;

	/* find */
	spin_lock(&tree->lock);
	entry = zswap_rb_search(&tree->rbroot, offset);
	if (!entry) {
		/* entry was written back */
		spin_unlock(&tree->lock);
		return;
	}

	/* remove from rbtree */
	zswap_rb_erase(&tree->rbroot, entry);

	/* drop the initial reference from entry creation */
	zswap_entry_put(tree, entry);

	spin_unlock(&tree->lock);
}

/* frees all zswap entries for the given swap type */
static void zswap_frontswap_invalidate_area(unsigned type)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry, *n;

	if (!tree)
		return;

	/* walk the tree and free everything */
	spin_lock(&tree->lock);
	rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
		zswap_free_entry(entry);
	tree->rbroot = RB_ROOT;
	spin_unlock(&tree->lock);
	kfree(tree);
	zswap_trees[type] = NULL;
}
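/*
 * A note on activation (summarizing behaviour already encoded in the hooks
 * above and in init_zswap() below, not adding any): these callbacks only
 * start seeing pages once init_zswap() has registered them through
 * frontswap_register_ops() and a swap area has been turned on, since
 * enabling a swap area is what invokes zswap_frontswap_init() and allocates
 * the per-type tree.  With no active swap device (or with zswap_enabled
 * cleared), zswap_frontswap_store() rejects every page with -ENODEV and the
 * page follows the normal swap path.
 */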
static void zswap_frontswap_init(unsigned type)
{
	struct zswap_tree *tree;

	tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
	if (!tree) {
		pr_err("alloc failed, zswap disabled for swap type %d\n", type);
		return;
	}

	tree->rbroot = RB_ROOT;
	spin_lock_init(&tree->lock);
	zswap_trees[type] = tree;
}

static struct frontswap_ops zswap_frontswap_ops = {
	.store = zswap_frontswap_store,
	.load = zswap_frontswap_load,
	.invalidate_page = zswap_frontswap_invalidate_page,
	.invalidate_area = zswap_frontswap_invalidate_area,
	.init = zswap_frontswap_init
};

/*********************************
* debugfs functions
**********************************/
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>

static struct dentry *zswap_debugfs_root;

static int __init zswap_debugfs_init(void)
{
	if (!debugfs_initialized())
		return -ENODEV;

	zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
	if (!zswap_debugfs_root)
		return -ENOMEM;

	debugfs_create_u64("pool_limit_hit", S_IRUGO,
			   zswap_debugfs_root, &zswap_pool_limit_hit);
	debugfs_create_u64("reject_reclaim_fail", S_IRUGO,
			   zswap_debugfs_root, &zswap_reject_reclaim_fail);
	debugfs_create_u64("reject_alloc_fail", S_IRUGO,
			   zswap_debugfs_root, &zswap_reject_alloc_fail);
	debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
			   zswap_debugfs_root, &zswap_reject_kmemcache_fail);
	debugfs_create_u64("reject_compress_poor", S_IRUGO,
			   zswap_debugfs_root, &zswap_reject_compress_poor);
	debugfs_create_u64("written_back_pages", S_IRUGO,
			   zswap_debugfs_root, &zswap_written_back_pages);
	debugfs_create_u64("duplicate_entry", S_IRUGO,
			   zswap_debugfs_root, &zswap_duplicate_entry);
	debugfs_create_u64("pool_total_size", S_IRUGO,
			   zswap_debugfs_root, &zswap_pool_total_size);
	debugfs_create_atomic_t("stored_pages", S_IRUGO,
				zswap_debugfs_root, &zswap_stored_pages);

	return 0;
}

static void __exit zswap_debugfs_exit(void)
{
	debugfs_remove_recursive(zswap_debugfs_root);
}
#else
static int __init zswap_debugfs_init(void)
{
	return 0;
}

static void __exit zswap_debugfs_exit(void) { }
#endif

/*********************************
* module init and exit
**********************************/
static int __init init_zswap(void)
{
	struct zswap_pool *pool;

	zswap_init_started = true;

	if (zswap_entry_cache_create()) {
		pr_err("entry cache creation failed\n");
		goto cache_fail;
	}

	if (zswap_cpu_dstmem_init()) {
		pr_err("dstmem alloc failed\n");
		goto dstmem_fail;
	}

	pool = __zswap_pool_create_fallback();
	if (!pool) {
		pr_err("pool creation failed\n");
		goto pool_fail;
	}
	pr_info("loaded using pool %s/%s\n", pool->tfm_name,
		zpool_get_type(pool->zpool));

	list_add(&pool->list, &zswap_pools);

	frontswap_register_ops(&zswap_frontswap_ops);
	if (zswap_debugfs_init())
		pr_warn("debugfs initialization failed\n");
	return 0;

pool_fail:
	zswap_cpu_dstmem_destroy();
dstmem_fail:
	zswap_entry_cache_destroy();
cache_fail:
	/* if built-in, we aren't unloaded on failure; don't allow use */
	zswap_init_failed = true;
	zswap_enabled = false;
	return -ENOMEM;
}
/* must be late so crypto has time to come up */
late_initcall(init_zswap);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
MODULE_DESCRIPTION("Compressed cache for swap pages");