1 /*
2 * Compressed RAM block device
3 *
4 * Copyright (C) 2008, 2009, 2010 Nitin Gupta
5 * 2012, 2013 Minchan Kim
6 *
7 * This code is released using a dual license strategy: BSD/GPL
8 * You can choose the license that better fits your requirements.
9 *
10 * Released under the terms of 3-clause BSD License
11 * Released under the terms of GNU General Public License Version 2.0
12 *
13 */
14
15 #define KMSG_COMPONENT "zram"
16 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
17
18 #include <linux/module.h>
19 #include <linux/kernel.h>
20 #include <linux/bio.h>
21 #include <linux/bitops.h>
22 #include <linux/blkdev.h>
23 #include <linux/buffer_head.h>
24 #include <linux/device.h>
25 #include <linux/highmem.h>
26 #include <linux/slab.h>
27 #include <linux/backing-dev.h>
28 #include <linux/string.h>
29 #include <linux/vmalloc.h>
30 #include <linux/err.h>
31 #include <linux/idr.h>
32 #include <linux/sysfs.h>
33 #include <linux/debugfs.h>
34 #include <linux/cpuhotplug.h>
35 #include <linux/part_stat.h>
36
37 #ifdef CONFIG_ZRAM_GROUP
38 #include <linux/memcontrol.h>
39 #endif
40
41 #include "zram_drv.h"
42
43 static DEFINE_IDR(zram_index_idr);
44 /* idr index must be protected */
45 static DEFINE_MUTEX(zram_index_mutex);
46
47 static int zram_major;
48 static const char *default_compressor = CONFIG_ZRAM_DEF_COMP;
49
50 /* Module params (documentation at end) */
51 static unsigned int num_devices = 1;
52 /*
53 * Pages that compress to a size equal to or greater than this are stored
54 * uncompressed in memory.
55 */
56 static size_t huge_class_size;
57
58 static const struct block_device_operations zram_devops;
59
60 static void zram_free_page(struct zram *zram, size_t index);
61 static int zram_read_page(struct zram *zram, struct page *page, u32 index,
62 struct bio *parent);
63
64 static inline bool init_done(struct zram *zram)
65 {
66 return zram->disksize;
67 }
68
69 static inline struct zram *dev_to_zram(struct device *dev)
70 {
71 return (struct zram *)dev_to_disk(dev)->private_data;
72 }
73
74 static inline void zram_set_element(struct zram *zram, u32 index,
75 unsigned long element)
76 {
77 zram->table[index].element = element;
78 }
79
80 static unsigned long zram_get_element(struct zram *zram, u32 index)
81 {
82 return zram->table[index].element;
83 }
84
85 static inline bool zram_allocated(struct zram *zram, u32 index)
86 {
87 return zram_get_obj_size(zram, index) ||
88 zram_test_flag(zram, index, ZRAM_SAME) ||
89 zram_test_flag(zram, index, ZRAM_WB);
90 }
91
92 #if PAGE_SIZE != 4096
93 static inline bool is_partial_io(struct bio_vec *bvec)
94 {
95 return bvec->bv_len != PAGE_SIZE;
96 }
97 #define ZRAM_PARTIAL_IO 1
98 #else
99 static inline bool is_partial_io(struct bio_vec *bvec)
100 {
101 return false;
102 }
103 #endif
104
105 static inline void zram_set_priority(struct zram *zram, u32 index, u32 prio)
106 {
107 prio &= ZRAM_COMP_PRIORITY_MASK;
108 /*
109 * Clear the previous priority value first, in case we are
110 * recompressing an already recompressed page.
111 */
112 zram->table[index].flags &= ~(ZRAM_COMP_PRIORITY_MASK <<
113 ZRAM_COMP_PRIORITY_BIT1);
114 zram->table[index].flags |= (prio << ZRAM_COMP_PRIORITY_BIT1);
115 }
116
117 static inline u32 zram_get_priority(struct zram *zram, u32 index)
118 {
119 u32 prio = zram->table[index].flags >> ZRAM_COMP_PRIORITY_BIT1;
120
121 return prio & ZRAM_COMP_PRIORITY_MASK;
122 }
123
124 static void zram_accessed(struct zram *zram, u32 index)
125 {
126 zram_clear_flag(zram, index, ZRAM_IDLE);
127 #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME
128 zram->table[index].ac_time = ktime_get_boottime();
129 #endif
130 }
131
132 static inline void update_used_max(struct zram *zram,
133 const unsigned long pages)
134 {
135 unsigned long cur_max = atomic_long_read(&zram->stats.max_used_pages);
136
137 do {
138 if (cur_max >= pages)
139 return;
140 } while (!atomic_long_try_cmpxchg(&zram->stats.max_used_pages,
141 &cur_max, pages));
142 }
143
144 static inline void zram_fill_page(void *ptr, unsigned long len,
145 unsigned long value)
146 {
147 WARN_ON_ONCE(!IS_ALIGNED(len, sizeof(unsigned long)));
148 memset_l(ptr, value, len / sizeof(unsigned long));
149 }
150
151 static bool page_same_filled(void *ptr, unsigned long *element)
152 {
153 unsigned long *page;
154 unsigned long val;
155 unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;
156
157 page = (unsigned long *)ptr;
158 val = page[0];
159
160 if (val != page[last_pos])
161 return false;
162
163 for (pos = 1; pos < last_pos; pos++) {
164 if (val != page[pos])
165 return false;
166 }
167
168 *element = val;
169
170 return true;
171 }
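
/*
 * Illustrative note (not from the original source): for a page that is
 * entirely zero-filled, every unsigned long equals page[0] == 0, so
 * page_same_filled() returns true with *element == 0. zram_write_page()
 * then records the slot as ZRAM_SAME and stores only the element value,
 * so no zsmalloc memory is consumed for such pages.
 */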
172
173 static ssize_t initstate_show(struct device *dev,
174 struct device_attribute *attr, char *buf)
175 {
176 u32 val;
177 struct zram *zram = dev_to_zram(dev);
178
179 down_read(&zram->init_lock);
180 val = init_done(zram);
181 up_read(&zram->init_lock);
182
183 return scnprintf(buf, PAGE_SIZE, "%u\n", val);
184 }
185
186 static ssize_t disksize_show(struct device *dev,
187 struct device_attribute *attr, char *buf)
188 {
189 struct zram *zram = dev_to_zram(dev);
190
191 return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
192 }
193
194 static ssize_t mem_limit_store(struct device *dev,
195 struct device_attribute *attr, const char *buf, size_t len)
196 {
197 u64 limit;
198 char *tmp;
199 struct zram *zram = dev_to_zram(dev);
200
201 limit = memparse(buf, &tmp);
202 if (buf == tmp) /* no chars parsed, invalid input */
203 return -EINVAL;
204
205 down_write(&zram->init_lock);
206 zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT;
207 up_write(&zram->init_lock);
208
209 return len;
210 }
211
212 static ssize_t mem_used_max_store(struct device *dev,
213 struct device_attribute *attr, const char *buf, size_t len)
214 {
215 int err;
216 unsigned long val;
217 struct zram *zram = dev_to_zram(dev);
218
219 err = kstrtoul(buf, 10, &val);
220 if (err || val != 0)
221 return -EINVAL;
222
223 down_read(&zram->init_lock);
224 if (init_done(zram)) {
225 atomic_long_set(&zram->stats.max_used_pages,
226 zs_get_total_pages(zram->mem_pool));
227 }
228 up_read(&zram->init_lock);
229
230 return len;
231 }
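
/*
 * Usage sketch (assumed sysfs path, following the standard zram layout):
 * writing "0" resets the peak-usage counter to the current pool size, e.g.
 *
 *   echo 0 > /sys/block/zram0/mem_used_max
 *
 * Any value other than 0 is rejected with -EINVAL, as implemented above.
 */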
232
233 /*
234 * Mark all pages which are older than or equal to cutoff as IDLE.
235 * Callers should hold the zram init lock in read mode
236 */
237 static void mark_idle(struct zram *zram, ktime_t cutoff)
238 {
239 int is_idle = 1;
240 unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
241 int index;
242
243 for (index = 0; index < nr_pages; index++) {
244 /*
245 * Do not mark ZRAM_UNDER_WB slot as ZRAM_IDLE to close race.
246 * See the comment in writeback_store.
247 *
248 * Also do not mark ZRAM_SAME slots as ZRAM_IDLE, because no
249 * post-processing (recompress, writeback) happens to the
250 * ZRAM_SAME slot.
251 *
252 * And ZRAM_WB slots simply cannot be ZRAM_IDLE.
253 */
254 zram_slot_lock(zram, index);
255 if (!zram_allocated(zram, index) ||
256 zram_test_flag(zram, index, ZRAM_WB) ||
257 zram_test_flag(zram, index, ZRAM_UNDER_WB) ||
258 zram_test_flag(zram, index, ZRAM_SAME)) {
259 zram_slot_unlock(zram, index);
260 continue;
261 }
262
263 #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME
264 is_idle = !cutoff ||
265 ktime_after(cutoff, zram->table[index].ac_time);
266 #endif
267 if (is_idle)
268 zram_set_flag(zram, index, ZRAM_IDLE);
269 else
270 zram_clear_flag(zram, index, ZRAM_IDLE);
271 zram_slot_unlock(zram, index);
272 }
273 }
274
275 static ssize_t idle_store(struct device *dev,
276 struct device_attribute *attr, const char *buf, size_t len)
277 {
278 struct zram *zram = dev_to_zram(dev);
279 ktime_t cutoff_time = 0;
280 ssize_t rv = -EINVAL;
281
282 if (!sysfs_streq(buf, "all")) {
283 /*
284 * If it did not parse as 'all', try to treat it as an integer
285 * age (in seconds) when memory tracking is enabled.
286 */
287 u64 age_sec;
288
289 if (IS_ENABLED(CONFIG_ZRAM_TRACK_ENTRY_ACTIME) && !kstrtoull(buf, 0, &age_sec))
290 cutoff_time = ktime_sub(ktime_get_boottime(),
291 ns_to_ktime(age_sec * NSEC_PER_SEC));
292 else
293 goto out;
294 }
295
296 down_read(&zram->init_lock);
297 if (!init_done(zram))
298 goto out_unlock;
299
300 /*
301 * A cutoff_time of 0 marks everything as idle; this is the
302 * "all" behavior.
303 */
304 mark_idle(zram, cutoff_time);
305 rv = len;
306
307 out_unlock:
308 up_read(&zram->init_lock);
309 out:
310 return rv;
311 }
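
/*
 * Usage sketch (assumed sysfs path): mark every eligible slot idle, or,
 * when CONFIG_ZRAM_TRACK_ENTRY_ACTIME is enabled, only slots not accessed
 * within the last N seconds:
 *
 *   echo all > /sys/block/zram0/idle
 *   echo 3600 > /sys/block/zram0/idle
 */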
312
313 #ifdef CONFIG_ZRAM_WRITEBACK
314 static ssize_t writeback_limit_enable_store(struct device *dev,
315 struct device_attribute *attr, const char *buf, size_t len)
316 {
317 struct zram *zram = dev_to_zram(dev);
318 u64 val;
319 ssize_t ret = -EINVAL;
320
321 if (kstrtoull(buf, 10, &val))
322 return ret;
323
324 down_read(&zram->init_lock);
325 spin_lock(&zram->wb_limit_lock);
326 zram->wb_limit_enable = val;
327 spin_unlock(&zram->wb_limit_lock);
328 up_read(&zram->init_lock);
329 ret = len;
330
331 return ret;
332 }
333
334 static ssize_t writeback_limit_enable_show(struct device *dev,
335 struct device_attribute *attr, char *buf)
336 {
337 bool val;
338 struct zram *zram = dev_to_zram(dev);
339
340 down_read(&zram->init_lock);
341 spin_lock(&zram->wb_limit_lock);
342 val = zram->wb_limit_enable;
343 spin_unlock(&zram->wb_limit_lock);
344 up_read(&zram->init_lock);
345
346 return scnprintf(buf, PAGE_SIZE, "%d\n", val);
347 }
348
349 static ssize_t writeback_limit_store(struct device *dev,
350 struct device_attribute *attr, const char *buf, size_t len)
351 {
352 struct zram *zram = dev_to_zram(dev);
353 u64 val;
354 ssize_t ret = -EINVAL;
355
356 if (kstrtoull(buf, 10, &val))
357 return ret;
358
359 down_read(&zram->init_lock);
360 spin_lock(&zram->wb_limit_lock);
361 zram->bd_wb_limit = val;
362 spin_unlock(&zram->wb_limit_lock);
363 up_read(&zram->init_lock);
364 ret = len;
365
366 return ret;
367 }
368
369 static ssize_t writeback_limit_show(struct device *dev,
370 struct device_attribute *attr, char *buf)
371 {
372 u64 val;
373 struct zram *zram = dev_to_zram(dev);
374
375 down_read(&zram->init_lock);
376 spin_lock(&zram->wb_limit_lock);
377 val = zram->bd_wb_limit;
378 spin_unlock(&zram->wb_limit_lock);
379 up_read(&zram->init_lock);
380
381 return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
382 }
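
/*
 * Illustrative note: bd_wb_limit is accounted in 4KB units (see the
 * "1UL << (PAGE_SHIFT - 12)" decrement in writeback_store()), so capping
 * writeback at roughly 100MB could look like:
 *
 *   echo 1 > /sys/block/zram0/writeback_limit_enable
 *   echo $((100 * 1024 * 1024 / 4096)) > /sys/block/zram0/writeback_limit
 *
 * Sysfs paths are assumed, following the standard zram layout.
 */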
383
384 static void reset_bdev(struct zram *zram)
385 {
386 struct block_device *bdev;
387
388 if (!zram->backing_dev)
389 return;
390
391 bdev = zram->bdev;
392 blkdev_put(bdev, zram);
393 /* hope filp_close flushes all of the IO */
394 filp_close(zram->backing_dev, NULL);
395 zram->backing_dev = NULL;
396 zram->bdev = NULL;
397 zram->disk->fops = &zram_devops;
398 kvfree(zram->bitmap);
399 zram->bitmap = NULL;
400 }
401
402 static ssize_t backing_dev_show(struct device *dev,
403 struct device_attribute *attr, char *buf)
404 {
405 struct file *file;
406 struct zram *zram = dev_to_zram(dev);
407 char *p;
408 ssize_t ret;
409
410 down_read(&zram->init_lock);
411 file = zram->backing_dev;
412 if (!file) {
413 memcpy(buf, "none\n", 5);
414 up_read(&zram->init_lock);
415 return 5;
416 }
417
418 p = file_path(file, buf, PAGE_SIZE - 1);
419 if (IS_ERR(p)) {
420 ret = PTR_ERR(p);
421 goto out;
422 }
423
424 ret = strlen(p);
425 memmove(buf, p, ret);
426 buf[ret++] = '\n';
427 out:
428 up_read(&zram->init_lock);
429 return ret;
430 }
431
432 static ssize_t backing_dev_store(struct device *dev,
433 struct device_attribute *attr, const char *buf, size_t len)
434 {
435 char *file_name;
436 size_t sz;
437 struct file *backing_dev = NULL;
438 struct inode *inode;
439 struct address_space *mapping;
440 unsigned int bitmap_sz;
441 unsigned long nr_pages, *bitmap = NULL;
442 struct block_device *bdev = NULL;
443 int err;
444 struct zram *zram = dev_to_zram(dev);
445
446 file_name = kmalloc(PATH_MAX, GFP_KERNEL);
447 if (!file_name)
448 return -ENOMEM;
449
450 down_write(&zram->init_lock);
451 if (init_done(zram)) {
452 pr_info("Can't setup backing device for initialized device\n");
453 err = -EBUSY;
454 goto out;
455 }
456
457 strscpy(file_name, buf, PATH_MAX);
458 /* ignore trailing newline */
459 sz = strlen(file_name);
460 if (sz > 0 && file_name[sz - 1] == '\n')
461 file_name[sz - 1] = 0x00;
462
463 backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0);
464 if (IS_ERR(backing_dev)) {
465 err = PTR_ERR(backing_dev);
466 backing_dev = NULL;
467 goto out;
468 }
469
470 mapping = backing_dev->f_mapping;
471 inode = mapping->host;
472
473 /* Only block devices are supported at the moment */
474 if (!S_ISBLK(inode->i_mode)) {
475 err = -ENOTBLK;
476 goto out;
477 }
478
479 bdev = blkdev_get_by_dev(inode->i_rdev, BLK_OPEN_READ | BLK_OPEN_WRITE,
480 zram, NULL);
481 if (IS_ERR(bdev)) {
482 err = PTR_ERR(bdev);
483 bdev = NULL;
484 goto out;
485 }
486
487 nr_pages = i_size_read(inode) >> PAGE_SHIFT;
488 /* Refuse to use a zero-sized device (also prevents self-reference) */
489 if (!nr_pages) {
490 err = -EINVAL;
491 goto out;
492 }
493
494 bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long);
495 bitmap = kvzalloc(bitmap_sz, GFP_KERNEL);
496 if (!bitmap) {
497 err = -ENOMEM;
498 goto out;
499 }
500
501 reset_bdev(zram);
502
503 zram->bdev = bdev;
504 zram->backing_dev = backing_dev;
505 zram->bitmap = bitmap;
506 zram->nr_pages = nr_pages;
507 up_write(&zram->init_lock);
508
509 pr_info("setup backing device %s\n", file_name);
510 kfree(file_name);
511
512 return len;
513 out:
514 kvfree(bitmap);
515
516 if (bdev)
517 blkdev_put(bdev, zram);
518
519 if (backing_dev)
520 filp_close(backing_dev, NULL);
521
522 up_write(&zram->init_lock);
523
524 kfree(file_name);
525
526 return err;
527 }
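
/*
 * Usage sketch (assumed device names): the backing device must be a block
 * device and must be configured before the disksize is set, e.g.
 *
 *   echo /dev/sdb1 > /sys/block/zram0/backing_dev
 *   echo 4G > /sys/block/zram0/disksize
 *
 * Once init_done() is true, the store above fails with -EBUSY.
 */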
528
529 static unsigned long alloc_block_bdev(struct zram *zram)
530 {
531 unsigned long blk_idx = 1;
532 retry:
533 /* skip bit 0 so a block index can't be confused with an unset handle (0) */
534 blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, blk_idx);
535 if (blk_idx == zram->nr_pages)
536 return 0;
537
538 if (test_and_set_bit(blk_idx, zram->bitmap))
539 goto retry;
540
541 atomic64_inc(&zram->stats.bd_count);
542 return blk_idx;
543 }
544
545 static void free_block_bdev(struct zram *zram, unsigned long blk_idx)
546 {
547 int was_set;
548
549 was_set = test_and_clear_bit(blk_idx, zram->bitmap);
550 WARN_ON_ONCE(!was_set);
551 atomic64_dec(&zram->stats.bd_count);
552 }
553
554 static void read_from_bdev_async(struct zram *zram, struct page *page,
555 unsigned long entry, struct bio *parent)
556 {
557 struct bio *bio;
558
559 bio = bio_alloc(zram->bdev, 1, parent->bi_opf, GFP_NOIO);
560 bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
561 __bio_add_page(bio, page, PAGE_SIZE, 0);
562 bio_chain(bio, parent);
563 submit_bio(bio);
564 }
565
566 #define HUGE_WRITEBACK (1<<0)
567 #define IDLE_WRITEBACK (1<<1)
568 #define INCOMPRESSIBLE_WRITEBACK (1<<2)
569
570 static ssize_t writeback_store(struct device *dev,
571 struct device_attribute *attr, const char *buf, size_t len)
572 {
573 struct zram *zram = dev_to_zram(dev);
574 unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
575 unsigned long index = 0;
576 struct bio bio;
577 struct bio_vec bio_vec;
578 struct page *page;
579 ssize_t ret = len;
580 int mode, err;
581 unsigned long blk_idx = 0;
582
583 if (sysfs_streq(buf, "idle"))
584 mode = IDLE_WRITEBACK;
585 else if (sysfs_streq(buf, "huge"))
586 mode = HUGE_WRITEBACK;
587 else if (sysfs_streq(buf, "huge_idle"))
588 mode = IDLE_WRITEBACK | HUGE_WRITEBACK;
589 else if (sysfs_streq(buf, "incompressible"))
590 mode = INCOMPRESSIBLE_WRITEBACK;
591 else
592 return -EINVAL;
593
594 down_read(&zram->init_lock);
595 if (!init_done(zram)) {
596 ret = -EINVAL;
597 goto release_init_lock;
598 }
599
600 if (!zram->backing_dev) {
601 ret = -ENODEV;
602 goto release_init_lock;
603 }
604
605 page = alloc_page(GFP_KERNEL);
606 if (!page) {
607 ret = -ENOMEM;
608 goto release_init_lock;
609 }
610
611 for (index = 0; index < nr_pages; index++) {
612 spin_lock(&zram->wb_limit_lock);
613 if (zram->wb_limit_enable && !zram->bd_wb_limit) {
614 spin_unlock(&zram->wb_limit_lock);
615 ret = -EIO;
616 break;
617 }
618 spin_unlock(&zram->wb_limit_lock);
619
620 if (!blk_idx) {
621 blk_idx = alloc_block_bdev(zram);
622 if (!blk_idx) {
623 ret = -ENOSPC;
624 break;
625 }
626 }
627
628 zram_slot_lock(zram, index);
629 if (!zram_allocated(zram, index))
630 goto next;
631
632 if (zram_test_flag(zram, index, ZRAM_WB) ||
633 zram_test_flag(zram, index, ZRAM_SAME) ||
634 zram_test_flag(zram, index, ZRAM_UNDER_WB))
635 goto next;
636
637 if (mode & IDLE_WRITEBACK &&
638 !zram_test_flag(zram, index, ZRAM_IDLE))
639 goto next;
640 if (mode & HUGE_WRITEBACK &&
641 !zram_test_flag(zram, index, ZRAM_HUGE))
642 goto next;
643 if (mode & INCOMPRESSIBLE_WRITEBACK &&
644 !zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE))
645 goto next;
646
647 /*
648 * Clearing ZRAM_UNDER_WB is the caller's duty.
649 * IOW, zram_free_page() never clears it.
650 */
651 zram_set_flag(zram, index, ZRAM_UNDER_WB);
652 /* Needed for hugepage writeback racing */
653 zram_set_flag(zram, index, ZRAM_IDLE);
654 zram_slot_unlock(zram, index);
655 if (zram_read_page(zram, page, index, NULL)) {
656 zram_slot_lock(zram, index);
657 zram_clear_flag(zram, index, ZRAM_UNDER_WB);
658 zram_clear_flag(zram, index, ZRAM_IDLE);
659 zram_slot_unlock(zram, index);
660 continue;
661 }
662
663 bio_init(&bio, zram->bdev, &bio_vec, 1,
664 REQ_OP_WRITE | REQ_SYNC);
665 bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
666 __bio_add_page(&bio, page, PAGE_SIZE, 0);
667
668 /*
669 * XXX: Single-page IO is inefficient for writes,
670 * but it is not a bad starting point.
671 */
672 err = submit_bio_wait(&bio);
673 if (err) {
674 zram_slot_lock(zram, index);
675 zram_clear_flag(zram, index, ZRAM_UNDER_WB);
676 zram_clear_flag(zram, index, ZRAM_IDLE);
677 zram_slot_unlock(zram, index);
678 /*
679 * BIO errors are not fatal, we continue and simply
680 * attempt to writeback the remaining objects (pages).
681 * At the same time we need to signal user-space that
682 * some writes (at least one, but also could be all of
683 * them) were not successful and we do so by returning
684 * the most recent BIO error.
685 */
686 ret = err;
687 continue;
688 }
689
690 atomic64_inc(&zram->stats.bd_writes);
691 /*
692 * We released zram_slot_lock, so we need to check whether the
693 * slot was changed. If the slot was freed, we catch that easily
694 * via zram_allocated().
695 * A subtle case is when the slot is freed, reallocated and marked
696 * ZRAM_IDLE again. To close that race, idle_store() does not mark
697 * a slot ZRAM_IDLE once it finds the slot is ZRAM_UNDER_WB.
698 * Thus, we can close the race by checking the ZRAM_IDLE bit.
699 */
700 zram_slot_lock(zram, index);
701 if (!zram_allocated(zram, index) ||
702 !zram_test_flag(zram, index, ZRAM_IDLE)) {
703 zram_clear_flag(zram, index, ZRAM_UNDER_WB);
704 zram_clear_flag(zram, index, ZRAM_IDLE);
705 goto next;
706 }
707
708 zram_free_page(zram, index);
709 zram_clear_flag(zram, index, ZRAM_UNDER_WB);
710 zram_set_flag(zram, index, ZRAM_WB);
711 zram_set_element(zram, index, blk_idx);
712 blk_idx = 0;
713 atomic64_inc(&zram->stats.pages_stored);
714 spin_lock(&zram->wb_limit_lock);
715 if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
716 zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
717 spin_unlock(&zram->wb_limit_lock);
718 next:
719 zram_slot_unlock(zram, index);
720 }
721
722 if (blk_idx)
723 free_block_bdev(zram, blk_idx);
724 __free_page(page);
725 release_init_lock:
726 up_read(&zram->init_lock);
727
728 return ret;
729 }
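
/*
 * Usage sketch (assumed sysfs path): trigger writeback of idle and/or huge
 * slots to the backing device, typically after marking slots idle:
 *
 *   echo all > /sys/block/zram0/idle
 *   echo idle > /sys/block/zram0/writeback
 *   echo huge_idle > /sys/block/zram0/writeback
 *
 * -ENODEV is returned when no backing_dev is configured, and -EIO when the
 * writeback limit is enabled and already exhausted, as implemented above.
 */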
730
731 struct zram_work {
732 struct work_struct work;
733 struct zram *zram;
734 unsigned long entry;
735 struct page *page;
736 int error;
737 };
738
739 static void zram_sync_read(struct work_struct *work)
740 {
741 struct zram_work *zw = container_of(work, struct zram_work, work);
742 struct bio_vec bv;
743 struct bio bio;
744
745 bio_init(&bio, zw->zram->bdev, &bv, 1, REQ_OP_READ);
746 bio.bi_iter.bi_sector = zw->entry * (PAGE_SIZE >> 9);
747 __bio_add_page(&bio, zw->page, PAGE_SIZE, 0);
748 zw->error = submit_bio_wait(&bio);
749 }
750
751 /*
752 * The block layer wants one ->submit_bio to be active at a time, so if we
753 * chain this IO to a parent IO in the same context, we deadlock. To avoid that,
754 * use a worker thread context.
755 */
756 static int read_from_bdev_sync(struct zram *zram, struct page *page,
757 unsigned long entry)
758 {
759 struct zram_work work;
760
761 work.page = page;
762 work.zram = zram;
763 work.entry = entry;
764
765 INIT_WORK_ONSTACK(&work.work, zram_sync_read);
766 queue_work(system_unbound_wq, &work.work);
767 flush_work(&work.work);
768 destroy_work_on_stack(&work.work);
769
770 return work.error;
771 }
772
773 static int read_from_bdev(struct zram *zram, struct page *page,
774 unsigned long entry, struct bio *parent)
775 {
776 atomic64_inc(&zram->stats.bd_reads);
777 if (!parent) {
778 if (WARN_ON_ONCE(!IS_ENABLED(ZRAM_PARTIAL_IO)))
779 return -EIO;
780 return read_from_bdev_sync(zram, page, entry);
781 }
782 read_from_bdev_async(zram, page, entry, parent);
783 return 0;
784 }
785 #else
786 static inline void reset_bdev(struct zram *zram) {};
787 static int read_from_bdev(struct zram *zram, struct page *page,
788 unsigned long entry, struct bio *parent)
789 {
790 return -EIO;
791 }
792
793 static void free_block_bdev(struct zram *zram, unsigned long blk_idx) {};
794 #endif
795
796 #ifdef CONFIG_ZRAM_MEMORY_TRACKING
797
798 static struct dentry *zram_debugfs_root;
799
800 static void zram_debugfs_create(void)
801 {
802 zram_debugfs_root = debugfs_create_dir("zram", NULL);
803 }
804
805 static void zram_debugfs_destroy(void)
806 {
807 debugfs_remove_recursive(zram_debugfs_root);
808 }
809
810 static ssize_t read_block_state(struct file *file, char __user *buf,
811 size_t count, loff_t *ppos)
812 {
813 char *kbuf;
814 ssize_t index, written = 0;
815 struct zram *zram = file->private_data;
816 unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
817 struct timespec64 ts;
818
819 kbuf = kvmalloc(count, GFP_KERNEL);
820 if (!kbuf)
821 return -ENOMEM;
822
823 down_read(&zram->init_lock);
824 if (!init_done(zram)) {
825 up_read(&zram->init_lock);
826 kvfree(kbuf);
827 return -EINVAL;
828 }
829
830 for (index = *ppos; index < nr_pages; index++) {
831 int copied;
832
833 zram_slot_lock(zram, index);
834 if (!zram_allocated(zram, index))
835 goto next;
836
837 ts = ktime_to_timespec64(zram->table[index].ac_time);
838 copied = snprintf(kbuf + written, count,
839 "%12zd %12lld.%06lu %c%c%c%c%c%c\n",
840 index, (s64)ts.tv_sec,
841 ts.tv_nsec / NSEC_PER_USEC,
842 zram_test_flag(zram, index, ZRAM_SAME) ? 's' : '.',
843 zram_test_flag(zram, index, ZRAM_WB) ? 'w' : '.',
844 zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.',
845 zram_test_flag(zram, index, ZRAM_IDLE) ? 'i' : '.',
846 zram_get_priority(zram, index) ? 'r' : '.',
847 zram_test_flag(zram, index,
848 ZRAM_INCOMPRESSIBLE) ? 'n' : '.');
849
850 if (count <= copied) {
851 zram_slot_unlock(zram, index);
852 break;
853 }
854 written += copied;
855 count -= copied;
856 next:
857 zram_slot_unlock(zram, index);
858 *ppos += 1;
859 }
860
861 up_read(&zram->init_lock);
862 if (copy_to_user(buf, kbuf, written))
863 written = -EFAULT;
864 kvfree(kbuf);
865
866 return written;
867 }
868
869 static const struct file_operations proc_zram_block_state_op = {
870 .open = simple_open,
871 .read = read_block_state,
872 .llseek = default_llseek,
873 };
874
875 static void zram_debugfs_register(struct zram *zram)
876 {
877 if (!zram_debugfs_root)
878 return;
879
880 zram->debugfs_dir = debugfs_create_dir(zram->disk->disk_name,
881 zram_debugfs_root);
882 debugfs_create_file("block_state", 0400, zram->debugfs_dir,
883 zram, &proc_zram_block_state_op);
884 }
885
886 static void zram_debugfs_unregister(struct zram *zram)
887 {
888 debugfs_remove_recursive(zram->debugfs_dir);
889 }
890 #else
891 static void zram_debugfs_create(void) {};
892 static void zram_debugfs_destroy(void) {};
893 static void zram_debugfs_register(struct zram *zram) {};
894 static void zram_debugfs_unregister(struct zram *zram) {};
895 #endif
896
897 /*
898 * We switched to per-cpu streams and this attr is not needed anymore.
899 * However, we will keep it around for some time, because:
900 * a) we may revert per-cpu streams in the future
901 * b) it's visible to user space and we need to follow our 2 years
902 * retirement rule; but we already have a number of 'soon to be
903 * altered' attrs, so max_comp_streams needs to wait for the next
904 * layoff cycle.
905 */
906 static ssize_t max_comp_streams_show(struct device *dev,
907 struct device_attribute *attr, char *buf)
908 {
909 return scnprintf(buf, PAGE_SIZE, "%d\n", num_online_cpus());
910 }
911
912 static ssize_t max_comp_streams_store(struct device *dev,
913 struct device_attribute *attr, const char *buf, size_t len)
914 {
915 return len;
916 }
917
918 static void comp_algorithm_set(struct zram *zram, u32 prio, const char *alg)
919 {
920 /* Do not free statically defined compression algorithms */
921 if (zram->comp_algs[prio] != default_compressor)
922 kfree(zram->comp_algs[prio]);
923
924 zram->comp_algs[prio] = alg;
925 }
926
927 static ssize_t __comp_algorithm_show(struct zram *zram, u32 prio, char *buf)
928 {
929 ssize_t sz;
930
931 down_read(&zram->init_lock);
932 sz = zcomp_available_show(zram->comp_algs[prio], buf);
933 up_read(&zram->init_lock);
934
935 return sz;
936 }
937
938 static int __comp_algorithm_store(struct zram *zram, u32 prio, const char *buf)
939 {
940 char *compressor;
941 size_t sz;
942
943 sz = strlen(buf);
944 if (sz >= CRYPTO_MAX_ALG_NAME)
945 return -E2BIG;
946
947 compressor = kstrdup(buf, GFP_KERNEL);
948 if (!compressor)
949 return -ENOMEM;
950
951 /* ignore trailing newline */
952 if (sz > 0 && compressor[sz - 1] == '\n')
953 compressor[sz - 1] = 0x00;
954
955 if (!zcomp_available_algorithm(compressor)) {
956 kfree(compressor);
957 return -EINVAL;
958 }
959
960 down_write(&zram->init_lock);
961 if (init_done(zram)) {
962 up_write(&zram->init_lock);
963 kfree(compressor);
964 pr_info("Can't change algorithm for initialized device\n");
965 return -EBUSY;
966 }
967
968 comp_algorithm_set(zram, prio, compressor);
969 up_write(&zram->init_lock);
970 return 0;
971 }
972
973 static ssize_t comp_algorithm_show(struct device *dev,
974 struct device_attribute *attr,
975 char *buf)
976 {
977 struct zram *zram = dev_to_zram(dev);
978
979 return __comp_algorithm_show(zram, ZRAM_PRIMARY_COMP, buf);
980 }
981
982 static ssize_t comp_algorithm_store(struct device *dev,
983 struct device_attribute *attr,
984 const char *buf,
985 size_t len)
986 {
987 struct zram *zram = dev_to_zram(dev);
988 int ret;
989
990 ret = __comp_algorithm_store(zram, ZRAM_PRIMARY_COMP, buf);
991 return ret ? ret : len;
992 }
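
/*
 * Usage sketch (assumed sysfs path and algorithm availability): the primary
 * compressor can only be changed before the device is initialized, e.g.
 *
 *   echo zstd > /sys/block/zram0/comp_algorithm
 *   echo 4G > /sys/block/zram0/disksize
 *
 * Reading the attribute lists the available algorithms with the active one
 * marked (see zcomp_available_show()).
 */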
993
994 #ifdef CONFIG_ZRAM_MULTI_COMP
995 static ssize_t recomp_algorithm_show(struct device *dev,
996 struct device_attribute *attr,
997 char *buf)
998 {
999 struct zram *zram = dev_to_zram(dev);
1000 ssize_t sz = 0;
1001 u32 prio;
1002
1003 for (prio = ZRAM_SECONDARY_COMP; prio < ZRAM_MAX_COMPS; prio++) {
1004 if (!zram->comp_algs[prio])
1005 continue;
1006
1007 sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, "#%d: ", prio);
1008 sz += __comp_algorithm_show(zram, prio, buf + sz);
1009 }
1010
1011 return sz;
1012 }
1013
1014 static ssize_t recomp_algorithm_store(struct device *dev,
1015 struct device_attribute *attr,
1016 const char *buf,
1017 size_t len)
1018 {
1019 struct zram *zram = dev_to_zram(dev);
1020 int prio = ZRAM_SECONDARY_COMP;
1021 char *args, *param, *val;
1022 char *alg = NULL;
1023 int ret;
1024
1025 args = skip_spaces(buf);
1026 while (*args) {
1027 args = next_arg(args, &param, &val);
1028
1029 if (!val || !*val)
1030 return -EINVAL;
1031
1032 if (!strcmp(param, "algo")) {
1033 alg = val;
1034 continue;
1035 }
1036
1037 if (!strcmp(param, "priority")) {
1038 ret = kstrtoint(val, 10, &prio);
1039 if (ret)
1040 return ret;
1041 continue;
1042 }
1043 }
1044
1045 if (!alg)
1046 return -EINVAL;
1047
1048 if (prio < ZRAM_SECONDARY_COMP || prio >= ZRAM_MAX_COMPS)
1049 return -EINVAL;
1050
1051 ret = __comp_algorithm_store(zram, prio, alg);
1052 return ret ? ret : len;
1053 }
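
/*
 * Usage sketch (assumed sysfs path and algorithm availability): register a
 * secondary compressor for recompression, optionally at an explicit priority
 * in the [ZRAM_SECONDARY_COMP, ZRAM_MAX_COMPS) range:
 *
 *   echo "algo=zstd" > /sys/block/zram0/recomp_algorithm
 *   echo "algo=lz4 priority=2" > /sys/block/zram0/recomp_algorithm
 */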
1054 #endif
1055
1056 static ssize_t compact_store(struct device *dev,
1057 struct device_attribute *attr, const char *buf, size_t len)
1058 {
1059 struct zram *zram = dev_to_zram(dev);
1060
1061 down_read(&zram->init_lock);
1062 if (!init_done(zram)) {
1063 up_read(&zram->init_lock);
1064 return -EINVAL;
1065 }
1066
1067 zs_compact(zram->mem_pool);
1068 up_read(&zram->init_lock);
1069
1070 return len;
1071 }
1072
1073 static ssize_t io_stat_show(struct device *dev,
1074 struct device_attribute *attr, char *buf)
1075 {
1076 struct zram *zram = dev_to_zram(dev);
1077 ssize_t ret;
1078
1079 down_read(&zram->init_lock);
1080 ret = scnprintf(buf, PAGE_SIZE,
1081 "%8llu %8llu 0 %8llu\n",
1082 (u64)atomic64_read(&zram->stats.failed_reads),
1083 (u64)atomic64_read(&zram->stats.failed_writes),
1084 (u64)atomic64_read(&zram->stats.notify_free));
1085 up_read(&zram->init_lock);
1086
1087 return ret;
1088 }
1089
1090 static ssize_t mm_stat_show(struct device *dev,
1091 struct device_attribute *attr, char *buf)
1092 {
1093 struct zram *zram = dev_to_zram(dev);
1094 struct zs_pool_stats pool_stats;
1095 u64 orig_size, mem_used = 0;
1096 long max_used;
1097 ssize_t ret;
1098
1099 memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats));
1100
1101 down_read(&zram->init_lock);
1102 if (init_done(zram)) {
1103 mem_used = zs_get_total_pages(zram->mem_pool);
1104 zs_pool_stats(zram->mem_pool, &pool_stats);
1105 }
1106
1107 orig_size = atomic64_read(&zram->stats.pages_stored);
1108 max_used = atomic_long_read(&zram->stats.max_used_pages);
1109
1110 ret = scnprintf(buf, PAGE_SIZE,
1111 "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
1112 orig_size << PAGE_SHIFT,
1113 (u64)atomic64_read(&zram->stats.compr_data_size),
1114 mem_used << PAGE_SHIFT,
1115 zram->limit_pages << PAGE_SHIFT,
1116 max_used << PAGE_SHIFT,
1117 (u64)atomic64_read(&zram->stats.same_pages),
1118 atomic_long_read(&pool_stats.pages_compacted),
1119 (u64)atomic64_read(&zram->stats.huge_pages),
1120 (u64)atomic64_read(&zram->stats.huge_pages_since));
1121 up_read(&zram->init_lock);
1122
1123 return ret;
1124 }
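
/*
 * Illustrative note: the mm_stat columns produced above are, in order:
 * orig_data_size, compr_data_size, mem_used_total, mem_limit, mem_used_max,
 * same_pages, pages_compacted, huge_pages and huge_pages_since, with the
 * page counters scaled to bytes via PAGE_SHIFT where shown.
 */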
1125
1126 #ifdef CONFIG_ZRAM_WRITEBACK
1127 #define FOUR_K(x) ((x) * (1 << (PAGE_SHIFT - 12)))
1128 static ssize_t bd_stat_show(struct device *dev,
1129 struct device_attribute *attr, char *buf)
1130 {
1131 struct zram *zram = dev_to_zram(dev);
1132 ssize_t ret;
1133
1134 down_read(&zram->init_lock);
1135 ret = scnprintf(buf, PAGE_SIZE,
1136 "%8llu %8llu %8llu\n",
1137 FOUR_K((u64)atomic64_read(&zram->stats.bd_count)),
1138 FOUR_K((u64)atomic64_read(&zram->stats.bd_reads)),
1139 FOUR_K((u64)atomic64_read(&zram->stats.bd_writes)));
1140 up_read(&zram->init_lock);
1141
1142 return ret;
1143 }
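
/*
 * Illustrative note: bd_stat reports bd_count, bd_reads and bd_writes,
 * each converted to 4KB units by the FOUR_K() macro above.
 */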
1144 #endif
1145
1146 static ssize_t debug_stat_show(struct device *dev,
1147 struct device_attribute *attr, char *buf)
1148 {
1149 int version = 1;
1150 struct zram *zram = dev_to_zram(dev);
1151 ssize_t ret;
1152
1153 down_read(&zram->init_lock);
1154 ret = scnprintf(buf, PAGE_SIZE,
1155 "version: %d\n%8llu %8llu\n",
1156 version,
1157 (u64)atomic64_read(&zram->stats.writestall),
1158 (u64)atomic64_read(&zram->stats.miss_free));
1159 up_read(&zram->init_lock);
1160
1161 return ret;
1162 }
1163
1164 static DEVICE_ATTR_RO(io_stat);
1165 static DEVICE_ATTR_RO(mm_stat);
1166 #ifdef CONFIG_ZRAM_WRITEBACK
1167 static DEVICE_ATTR_RO(bd_stat);
1168 #endif
1169 static DEVICE_ATTR_RO(debug_stat);
1170
1171 #ifdef CONFIG_ZRAM_GROUP
1172 static ssize_t group_show(struct device *dev, struct device_attribute *attr, char *buf)
1173 {
1174 struct zram *zram = dev_to_zram(dev);
1175 int ret = 0;
1176
1177 down_read(&zram->init_lock);
1178 if (zram->zgrp_ctrl == ZGRP_NONE)
1179 ret = snprintf(buf, PAGE_SIZE - 1, "disable\n");
1180 else if (zram->zgrp_ctrl == ZGRP_TRACK)
1181 ret = snprintf(buf, PAGE_SIZE - 1, "readonly\n");
1182 #ifdef CONFIG_ZRAM_GROUP_WRITEBACK
1183 else if (zram->zgrp_ctrl == ZGRP_WRITE)
1184 ret = snprintf(buf, PAGE_SIZE - 1, "readwrite\n");
1185 #endif
1186 up_read(&zram->init_lock);
1187
1188 return ret;
1189 }
1190
1191 static ssize_t group_store(struct device *dev, struct device_attribute *attr,
1192 const char *buf, size_t len)
1193 {
1194 struct zram *zram = dev_to_zram(dev);
1195 int ret;
1196 #ifdef CONFIG_ZRAM_GROUP_DEBUG
1197 u32 op, gid, index;
1198
1199 ret = sscanf(buf, "%u %u %u", &op, &index, &gid);
1200 if (ret == 3) {
1201 pr_info("op[%u] index[%u] gid[%u].\n", op, index, gid);
1202 group_debug(zram, op, index, gid);
1203 return len;
1204 }
1205 #endif
1206
1207 ret = len;
1208 down_write(&zram->init_lock);
1209 if (init_done(zram)) {
1210 pr_info("Can't setup group ctrl for initialized device!\n");
1211 ret = -EBUSY;
1212 goto out;
1213 }
1214 if (!strcmp(buf, "disable\n"))
1215 zram->zgrp_ctrl = ZGRP_NONE;
1216 else if (!strcmp(buf, "readonly\n"))
1217 zram->zgrp_ctrl = ZGRP_TRACK;
1218 #ifdef CONFIG_ZRAM_GROUP_WRITEBACK
1219 else if (!strcmp(buf, "readwrite\n"))
1220 zram->zgrp_ctrl = ZGRP_WRITE;
1221 #endif
1222 else
1223 ret = -EINVAL;
1224 out:
1225 up_write(&zram->init_lock);
1226
1227 return ret;
1228 }
1229 #endif
1230
1231 static void zram_meta_free(struct zram *zram, u64 disksize)
1232 {
1233 size_t num_pages = disksize >> PAGE_SHIFT;
1234 size_t index;
1235
1236 if (!zram->table)
1237 return;
1238
1239 /* Free all pages that are still in this zram device */
1240 for (index = 0; index < num_pages; index++)
1241 zram_free_page(zram, index);
1242
1243 zs_destroy_pool(zram->mem_pool);
1244 vfree(zram->table);
1245 zram->table = NULL;
1246 #ifdef CONFIG_ZRAM_GROUP
1247 zram_group_deinit(zram);
1248 #endif
1249 }
1250
1251 static bool zram_meta_alloc(struct zram *zram, u64 disksize)
1252 {
1253 size_t num_pages;
1254
1255 num_pages = disksize >> PAGE_SHIFT;
1256 zram->table = vzalloc(array_size(num_pages, sizeof(*zram->table)));
1257 if (!zram->table)
1258 return false;
1259
1260 zram->mem_pool = zs_create_pool(zram->disk->disk_name);
1261 if (!zram->mem_pool) {
1262 vfree(zram->table);
1263 zram->table = NULL;
1264 return false;
1265 }
1266
1267 if (!huge_class_size)
1268 huge_class_size = zs_huge_class_size(zram->mem_pool);
1269 #ifdef CONFIG_ZRAM_GROUP
1270 zram_group_init(zram, num_pages);
1271 #endif
1272
1273 return true;
1274 }
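
/*
 * Illustrative sizing note: the table has one entry per PAGE_SIZE chunk of
 * the disksize, so e.g. a 1GB device with 4KB pages allocates
 * 1GB / 4KB = 262144 entries via vzalloc() above.
 */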
1275
1276 /*
1277 * To protect concurrent access to the same index entry,
1278 * the caller should hold this table entry's bit_spinlock to
1279 * indicate that the entry is being accessed.
1280 */
1281 static void zram_free_page(struct zram *zram, size_t index)
1282 {
1283 unsigned long handle;
1284
1285 #ifdef CONFIG_ZRAM_GROUP
1286 zram_group_untrack_obj(zram, index);
1287 #endif
1288
1289 #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME
1290 zram->table[index].ac_time = 0;
1291 #endif
1292 if (zram_test_flag(zram, index, ZRAM_IDLE))
1293 zram_clear_flag(zram, index, ZRAM_IDLE);
1294
1295 if (zram_test_flag(zram, index, ZRAM_HUGE)) {
1296 zram_clear_flag(zram, index, ZRAM_HUGE);
1297 atomic64_dec(&zram->stats.huge_pages);
1298 }
1299
1300 if (zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE))
1301 zram_clear_flag(zram, index, ZRAM_INCOMPRESSIBLE);
1302
1303 zram_set_priority(zram, index, 0);
1304
1305 if (zram_test_flag(zram, index, ZRAM_WB)) {
1306 zram_clear_flag(zram, index, ZRAM_WB);
1307 free_block_bdev(zram, zram_get_element(zram, index));
1308 goto out;
1309 }
1310
1311 /*
1312 * No memory is allocated for same-element-filled pages.
1313 * Simply clear the ZRAM_SAME flag.
1314 */
1315 if (zram_test_flag(zram, index, ZRAM_SAME)) {
1316 zram_clear_flag(zram, index, ZRAM_SAME);
1317 atomic64_dec(&zram->stats.same_pages);
1318 goto out;
1319 }
1320
1321 handle = zram_get_handle(zram, index);
1322 if (!handle)
1323 return;
1324
1325 zs_free(zram->mem_pool, handle);
1326
1327 atomic64_sub(zram_get_obj_size(zram, index),
1328 &zram->stats.compr_data_size);
1329 out:
1330 atomic64_dec(&zram->stats.pages_stored);
1331 zram_set_handle(zram, index, 0);
1332 zram_set_obj_size(zram, index, 0);
1333 WARN_ON_ONCE(zram->table[index].flags &
1334 ~(1UL << ZRAM_LOCK | 1UL << ZRAM_UNDER_WB));
1335 }
1336
1337 /*
1338 * Reads (decompresses if needed) a page from zspool (zsmalloc).
1339 * Corresponding ZRAM slot should be locked.
1340 */
1341 static int zram_read_from_zspool(struct zram *zram, struct page *page,
1342 u32 index)
1343 {
1344 struct zcomp_strm *zstrm;
1345 unsigned long handle;
1346 unsigned int size;
1347 void *src, *dst;
1348 u32 prio;
1349 int ret;
1350
1351 handle = zram_get_handle(zram, index);
1352 if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) {
1353 unsigned long value;
1354 void *mem;
1355
1356 value = handle ? zram_get_element(zram, index) : 0;
1357 mem = kmap_atomic(page);
1358 zram_fill_page(mem, PAGE_SIZE, value);
1359 kunmap_atomic(mem);
1360 return 0;
1361 }
1362
1363 size = zram_get_obj_size(zram, index);
1364
1365 if (size != PAGE_SIZE) {
1366 prio = zram_get_priority(zram, index);
1367 zstrm = zcomp_stream_get(zram->comps[prio]);
1368 }
1369
1370 src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
1371 if (size == PAGE_SIZE) {
1372 dst = kmap_atomic(page);
1373 memcpy(dst, src, PAGE_SIZE);
1374 kunmap_atomic(dst);
1375 ret = 0;
1376 } else {
1377 dst = kmap_atomic(page);
1378 ret = zcomp_decompress(zstrm, src, size, dst);
1379 kunmap_atomic(dst);
1380 zcomp_stream_put(zram->comps[prio]);
1381 }
1382 zs_unmap_object(zram->mem_pool, handle);
1383 return ret;
1384 }
1385
1386 static int zram_read_page(struct zram *zram, struct page *page, u32 index,
1387 struct bio *parent)
1388 {
1389 int ret;
1390
1391 zram_slot_lock(zram, index);
1392 #ifdef CONFIG_ZRAM_GROUP_WRITEBACK
1393 if (!parent) {
1394 ret = zram_group_fault_obj(zram, index);
1395 if (ret) {
1396 zram_slot_unlock(zram, index);
1397 return ret;
1398 }
1399 }
1400
1401 if (zram_test_flag(zram, index, ZRAM_GWB)) {
1402 zram_slot_unlock(zram, index);
1403 return -EIO;
1404 }
1405 #endif
1406 if (!zram_test_flag(zram, index, ZRAM_WB)) {
1407 /* Slot should be locked throughout the function call */
1408 ret = zram_read_from_zspool(zram, page, index);
1409 zram_slot_unlock(zram, index);
1410 } else {
1411 /*
1412 * The slot should be unlocked before reading from the backing
1413 * device.
1414 */
1415 zram_slot_unlock(zram, index);
1416
1417 ret = read_from_bdev(zram, page, zram_get_element(zram, index),
1418 parent);
1419 }
1420
1421 /* Should NEVER happen. Return bio error if it does. */
1422 if (WARN_ON(ret < 0))
1423 pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
1424
1425 return ret;
1426 }
1427
1428 /*
1429 * Use a temporary buffer to decompress the page, as the decompressor
1430 * always expects a full page for the output.
1431 */
1432 static int zram_bvec_read_partial(struct zram *zram, struct bio_vec *bvec,
1433 u32 index, int offset)
1434 {
1435 struct page *page = alloc_page(GFP_NOIO);
1436 int ret;
1437
1438 if (!page)
1439 return -ENOMEM;
1440 ret = zram_read_page(zram, page, index, NULL);
1441 if (likely(!ret))
1442 memcpy_to_bvec(bvec, page_address(page) + offset);
1443 __free_page(page);
1444 return ret;
1445 }
1446
1447 static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
1448 u32 index, int offset, struct bio *bio)
1449 {
1450 if (is_partial_io(bvec))
1451 return zram_bvec_read_partial(zram, bvec, index, offset);
1452 return zram_read_page(zram, bvec->bv_page, index, bio);
1453 }
1454
1455 static int zram_write_page(struct zram *zram, struct page *page, u32 index)
1456 {
1457 int ret = 0;
1458 unsigned long alloced_pages;
1459 unsigned long handle = -ENOMEM;
1460 unsigned int comp_len = 0;
1461 void *src, *dst, *mem;
1462 struct zcomp_strm *zstrm;
1463 unsigned long element = 0;
1464 enum zram_pageflags flags = 0;
1465
1466 mem = kmap_atomic(page);
1467 if (page_same_filled(mem, &element)) {
1468 kunmap_atomic(mem);
1469 /* Free memory associated with this sector now. */
1470 flags = ZRAM_SAME;
1471 atomic64_inc(&zram->stats.same_pages);
1472 goto out;
1473 }
1474 kunmap_atomic(mem);
1475
1476 compress_again:
1477 zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]);
1478 src = kmap_atomic(page);
1479 ret = zcomp_compress(zstrm, src, &comp_len);
1480 kunmap_atomic(src);
1481
1482 if (unlikely(ret)) {
1483 zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]);
1484 pr_err("Compression failed! err=%d\n", ret);
1485 zs_free(zram->mem_pool, handle);
1486 return ret;
1487 }
1488
1489 if (comp_len >= huge_class_size)
1490 comp_len = PAGE_SIZE;
1491 /*
1492 * Handle allocation has 2 paths:
1493 * a) the fast path is executed with preemption disabled (for
1494 *  per-cpu streams) and has the __GFP_DIRECT_RECLAIM bit clear,
1495 *  since we can't sleep;
1496 * b) the slow path enables preemption and attempts to allocate
1497 *  the page with the __GFP_DIRECT_RECLAIM bit set. We have to
1498 *  put the per-cpu compression stream and, thus, re-do
1499 *  the compression once the handle is allocated.
1500 *
1501 * If we already have a valid handle here then we are coming
1502 * from the slow path and the handle has already been allocated.
1503 */
1504 if (IS_ERR_VALUE(handle))
1505 handle = zs_malloc(zram->mem_pool, comp_len,
1506 __GFP_KSWAPD_RECLAIM |
1507 __GFP_NOWARN |
1508 __GFP_HIGHMEM |
1509 __GFP_MOVABLE);
1510 if (IS_ERR_VALUE(handle)) {
1511 zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]);
1512 atomic64_inc(&zram->stats.writestall);
1513 handle = zs_malloc(zram->mem_pool, comp_len,
1514 GFP_NOIO | __GFP_HIGHMEM |
1515 __GFP_MOVABLE);
1516 if (IS_ERR_VALUE(handle))
1517 return PTR_ERR((void *)handle);
1518
1519 if (comp_len != PAGE_SIZE)
1520 goto compress_again;
1521 /*
1522 * If the page is not compressible, we need to re-acquire the
1523 * per-cpu stream and execute the code below. The zcomp_stream_get()
1524 * call is needed to disable cpu hotplug and grab the
1525 * zstrm buffer back, so that the dereference of the zstrm
1526 * variable below is valid.
1527 */
1528 zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]);
1529 }
1530
1531 alloced_pages = zs_get_total_pages(zram->mem_pool);
1532 update_used_max(zram, alloced_pages);
1533
1534 if (zram->limit_pages && alloced_pages > zram->limit_pages) {
1535 zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]);
1536 zs_free(zram->mem_pool, handle);
1537 return -ENOMEM;
1538 }
1539
1540 dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
1541
1542 src = zstrm->buffer;
1543 if (comp_len == PAGE_SIZE)
1544 src = kmap_atomic(page);
1545 memcpy(dst, src, comp_len);
1546 if (comp_len == PAGE_SIZE)
1547 kunmap_atomic(src);
1548
1549 zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]);
1550 zs_unmap_object(zram->mem_pool, handle);
1551 atomic64_add(comp_len, &zram->stats.compr_data_size);
1552 out:
1553 /*
1554 * Free memory associated with this sector
1555 * before overwriting unused sectors.
1556 */
1557 zram_slot_lock(zram, index);
1558 zram_free_page(zram, index);
1559
1560 if (comp_len == PAGE_SIZE) {
1561 zram_set_flag(zram, index, ZRAM_HUGE);
1562 atomic64_inc(&zram->stats.huge_pages);
1563 atomic64_inc(&zram->stats.huge_pages_since);
1564 }
1565
1566 if (flags) {
1567 zram_set_flag(zram, index, flags);
1568 zram_set_element(zram, index, element);
1569 } else {
1570 zram_set_handle(zram, index, handle);
1571 zram_set_obj_size(zram, index, comp_len);
1572 }
1573 #ifdef CONFIG_ZRAM_GROUP
1574 zram_group_track_obj(zram, index, page_memcg(page));
1575 #endif
1576 zram_slot_unlock(zram, index);
1577
1578 /* Update stats */
1579 atomic64_inc(&zram->stats.pages_stored);
1580 return ret;
1581 }
1582
1583 /*
1584 * This is a partial IO. Read the full page before writing the changes.
1585 */
1586 static int zram_bvec_write_partial(struct zram *zram, struct bio_vec *bvec,
1587 u32 index, int offset, struct bio *bio)
1588 {
1589 struct page *page = alloc_page(GFP_NOIO);
1590 int ret;
1591
1592 if (!page)
1593 return -ENOMEM;
1594
1595 ret = zram_read_page(zram, page, index, bio);
1596 if (!ret) {
1597 memcpy_from_bvec(page_address(page) + offset, bvec);
1598 ret = zram_write_page(zram, page, index);
1599 }
1600 __free_page(page);
1601 return ret;
1602 }
1603
1604 static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
1605 u32 index, int offset, struct bio *bio)
1606 {
1607 if (is_partial_io(bvec))
1608 return zram_bvec_write_partial(zram, bvec, index, offset, bio);
1609 return zram_write_page(zram, bvec->bv_page, index);
1610 }
1611
1612 #ifdef CONFIG_ZRAM_MULTI_COMP
1613 /*
1614 * This function will decompress (unless it's ZRAM_HUGE) the page and then
1615 * attempt to compress it using provided compression algorithm priority
1616 * (which is potentially more effective).
1617 *
1618 * Corresponding ZRAM slot should be locked.
1619 */
1620 static int zram_recompress(struct zram *zram, u32 index, struct page *page,
1621 u32 threshold, u32 prio, u32 prio_max)
1622 {
1623 struct zcomp_strm *zstrm = NULL;
1624 unsigned long handle_old;
1625 unsigned long handle_new;
1626 unsigned int comp_len_old;
1627 unsigned int comp_len_new;
1628 unsigned int class_index_old;
1629 unsigned int class_index_new;
1630 u32 num_recomps = 0;
1631 void *src, *dst;
1632 int ret;
1633
1634 handle_old = zram_get_handle(zram, index);
1635 if (!handle_old)
1636 return -EINVAL;
1637
1638 comp_len_old = zram_get_obj_size(zram, index);
1639 /*
1640 * Do not recompress objects that are already "small enough".
1641 */
1642 if (comp_len_old < threshold)
1643 return 0;
1644
1645 ret = zram_read_from_zspool(zram, page, index);
1646 if (ret)
1647 return ret;
1648
1649 /*
1650 * We touched this entry so mark it as non-IDLE. This makes sure that
1651 * we don't preserve IDLE flag and don't incorrectly pick this entry
1652 * for a different post-processing type (e.g. writeback).
1653 */
1654 zram_clear_flag(zram, index, ZRAM_IDLE);
1655
1656 class_index_old = zs_lookup_class_index(zram->mem_pool, comp_len_old);
1657 /*
1658 * Iterate the secondary comp algorithms list (in order of priority)
1659 * and try to recompress the page.
1660 */
1661 for (; prio < prio_max; prio++) {
1662 if (!zram->comps[prio])
1663 continue;
1664
1665 /*
1666 * Skip if the object is already re-compressed with a higher
1667 * priority algorithm (or same algorithm).
1668 */
1669 if (prio <= zram_get_priority(zram, index))
1670 continue;
1671
1672 num_recomps++;
1673 zstrm = zcomp_stream_get(zram->comps[prio]);
1674 src = kmap_atomic(page);
1675 ret = zcomp_compress(zstrm, src, &comp_len_new);
1676 kunmap_atomic(src);
1677
1678 if (ret) {
1679 zcomp_stream_put(zram->comps[prio]);
1680 return ret;
1681 }
1682
1683 class_index_new = zs_lookup_class_index(zram->mem_pool,
1684 comp_len_new);
1685
1686 /* Continue until we make progress */
1687 if (class_index_new >= class_index_old ||
1688 (threshold && comp_len_new >= threshold)) {
1689 zcomp_stream_put(zram->comps[prio]);
1690 continue;
1691 }
1692
1693 /* Recompression was successful so break out */
1694 break;
1695 }
1696
1697 /*
1698 * We did not try to recompress, e.g. when we have only one
1699 * secondary algorithm and the page is already recompressed
1700 * using that algorithm
1701 */
1702 if (!zstrm)
1703 return 0;
1704
1705 if (class_index_new >= class_index_old) {
1706 /*
1707 * Secondary algorithms failed to re-compress the page
1708 * in a way that would save memory, so mark the object as
1709 * incompressible so that we will not try to compress
1710 * it again.
1711 *
1712 * We need to make sure that all secondary algorithms have
1713 * failed, so we test if the number of recompressions matches
1714 * the number of active secondary algorithms.
1715 */
1716 if (num_recomps == zram->num_active_comps - 1)
1717 zram_set_flag(zram, index, ZRAM_INCOMPRESSIBLE);
1718 return 0;
1719 }
1720
1721 /* Successful recompression but above threshold */
1722 if (threshold && comp_len_new >= threshold)
1723 return 0;
1724
1725 /*
1726 * No direct reclaim (slow path) for handle allocation and no
1727 * re-compression attempt (unlike in zram_write_bvec()) since
1728 * we have already stored that object in zsmalloc. If we cannot
1729 * allocate memory for the recompressed object then we bail out and
1730 * simply keep the old (existing) object in zsmalloc.
1731 */
1732 handle_new = zs_malloc(zram->mem_pool, comp_len_new,
1733 __GFP_KSWAPD_RECLAIM |
1734 __GFP_NOWARN |
1735 __GFP_HIGHMEM |
1736 __GFP_MOVABLE);
1737 if (IS_ERR_VALUE(handle_new)) {
1738 zcomp_stream_put(zram->comps[prio]);
1739 return PTR_ERR((void *)handle_new);
1740 }
1741
1742 dst = zs_map_object(zram->mem_pool, handle_new, ZS_MM_WO);
1743 memcpy(dst, zstrm->buffer, comp_len_new);
1744 zcomp_stream_put(zram->comps[prio]);
1745
1746 zs_unmap_object(zram->mem_pool, handle_new);
1747
1748 zram_free_page(zram, index);
1749 zram_set_handle(zram, index, handle_new);
1750 zram_set_obj_size(zram, index, comp_len_new);
1751 zram_set_priority(zram, index, prio);
1752
1753 atomic64_add(comp_len_new, &zram->stats.compr_data_size);
1754 atomic64_inc(&zram->stats.pages_stored);
1755
1756 return 0;
1757 }
1758
1759 #define RECOMPRESS_IDLE (1 << 0)
1760 #define RECOMPRESS_HUGE (1 << 1)
1761
1762 static ssize_t recompress_store(struct device *dev,
1763 struct device_attribute *attr,
1764 const char *buf, size_t len)
1765 {
1766 u32 prio = ZRAM_SECONDARY_COMP, prio_max = ZRAM_MAX_COMPS;
1767 struct zram *zram = dev_to_zram(dev);
1768 unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
1769 char *args, *param, *val, *algo = NULL;
1770 u32 mode = 0, threshold = 0;
1771 unsigned long index;
1772 struct page *page;
1773 ssize_t ret;
1774
1775 args = skip_spaces(buf);
1776 while (*args) {
1777 args = next_arg(args, &param, &val);
1778
1779 if (!val || !*val)
1780 return -EINVAL;
1781
1782 if (!strcmp(param, "type")) {
1783 if (!strcmp(val, "idle"))
1784 mode = RECOMPRESS_IDLE;
1785 if (!strcmp(val, "huge"))
1786 mode = RECOMPRESS_HUGE;
1787 if (!strcmp(val, "huge_idle"))
1788 mode = RECOMPRESS_IDLE | RECOMPRESS_HUGE;
1789 continue;
1790 }
1791
1792 if (!strcmp(param, "threshold")) {
1793 /*
1794 * We will re-compress only idle objects equal to or
1795 * greater in size than the watermark.
1796 */
1797 ret = kstrtouint(val, 10, &threshold);
1798 if (ret)
1799 return ret;
1800 continue;
1801 }
1802
1803 if (!strcmp(param, "algo")) {
1804 algo = val;
1805 continue;
1806 }
1807 }
1808
1809 if (threshold >= huge_class_size)
1810 return -EINVAL;
1811
1812 down_read(&zram->init_lock);
1813 if (!init_done(zram)) {
1814 ret = -EINVAL;
1815 goto release_init_lock;
1816 }
1817
1818 if (algo) {
1819 bool found = false;
1820
1821 for (; prio < ZRAM_MAX_COMPS; prio++) {
1822 if (!zram->comp_algs[prio])
1823 continue;
1824
1825 if (!strcmp(zram->comp_algs[prio], algo)) {
1826 prio_max = min(prio + 1, ZRAM_MAX_COMPS);
1827 found = true;
1828 break;
1829 }
1830 }
1831
1832 if (!found) {
1833 ret = -EINVAL;
1834 goto release_init_lock;
1835 }
1836 }
1837
1838 page = alloc_page(GFP_KERNEL);
1839 if (!page) {
1840 ret = -ENOMEM;
1841 goto release_init_lock;
1842 }
1843
1844 ret = len;
1845 for (index = 0; index < nr_pages; index++) {
1846 int err = 0;
1847
1848 zram_slot_lock(zram, index);
1849
1850 if (!zram_allocated(zram, index))
1851 goto next;
1852
1853 if (mode & RECOMPRESS_IDLE &&
1854 !zram_test_flag(zram, index, ZRAM_IDLE))
1855 goto next;
1856
1857 if (mode & RECOMPRESS_HUGE &&
1858 !zram_test_flag(zram, index, ZRAM_HUGE))
1859 goto next;
1860
1861 if (zram_test_flag(zram, index, ZRAM_WB) ||
1862 zram_test_flag(zram, index, ZRAM_UNDER_WB) ||
1863 zram_test_flag(zram, index, ZRAM_SAME) ||
1864 zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE))
1865 goto next;
1866
1867 err = zram_recompress(zram, index, page, threshold,
1868 prio, prio_max);
1869 next:
1870 zram_slot_unlock(zram, index);
1871 if (err) {
1872 ret = err;
1873 break;
1874 }
1875
1876 cond_resched();
1877 }
1878
1879 __free_page(page);
1880
1881 release_init_lock:
1882 up_read(&zram->init_lock);
1883 return ret;
1884 }
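
/*
 * Usage sketch (assumed sysfs path): recompress huge and/or idle slots with
 * the secondary algorithm(s), optionally bounded by a size threshold or
 * restricted to one registered algorithm:
 *
 *   echo "type=huge_idle" > /sys/block/zram0/recompress
 *   echo "type=idle threshold=3000" > /sys/block/zram0/recompress
 *   echo "type=huge algo=zstd" > /sys/block/zram0/recompress
 *
 * The threshold must be below huge_class_size, as checked above.
 */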
1885 #endif
1886
1887 static void zram_bio_discard(struct zram *zram, struct bio *bio)
1888 {
1889 size_t n = bio->bi_iter.bi_size;
1890 u32 index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
1891 u32 offset = (bio->bi_iter.bi_sector & (SECTORS_PER_PAGE - 1)) <<
1892 SECTOR_SHIFT;
1893
1894 /*
1895 * zram manages data in physical block size units. Because logical block
1896 * size isn't identical to physical block size on some architectures, we
1897 * could get a discard request pointing to a specific offset within a
1898 * certain physical block. Although we could handle such a request by
1899 * reading that physical block, decompressing it, partially zeroing it,
1900 * re-compressing it and then re-storing it, this isn't reasonable
1901 * because our intent with a discard request is to save memory. So
1902 * skipping this logical block is appropriate here.
1903 */
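	/*
	 * Worked example (illustrative, assuming PAGE_SIZE == 4096 and
	 * SECTOR_SHIFT == 9, so SECTORS_PER_PAGE == 8): a 12 KiB discard
	 * starting at sector 10 yields index = 10 >> 3 = 1 and
	 * offset = (10 & 7) << 9 = 1024. The 3072-byte partial head of
	 * page 1 is skipped, pages 2 and 3 are freed, and the trailing
	 * 1024 bytes that fall into page 4 are skipped as well.
	 */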
1904 if (offset) {
1905 if (n <= (PAGE_SIZE - offset))
1906 return;
1907
1908 n -= (PAGE_SIZE - offset);
1909 index++;
1910 }
1911
1912 while (n >= PAGE_SIZE) {
1913 zram_slot_lock(zram, index);
1914 zram_free_page(zram, index);
1915 zram_slot_unlock(zram, index);
1916 atomic64_inc(&zram->stats.notify_free);
1917 index++;
1918 n -= PAGE_SIZE;
1919 }
1920
1921 bio_endio(bio);
1922 }
1923
1924 static void zram_bio_read(struct zram *zram, struct bio *bio)
1925 {
1926 unsigned long start_time = bio_start_io_acct(bio);
1927 struct bvec_iter iter = bio->bi_iter;
1928
1929 do {
1930 u32 index = iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
1931 u32 offset = (iter.bi_sector & (SECTORS_PER_PAGE - 1)) <<
1932 SECTOR_SHIFT;
1933 struct bio_vec bv = bio_iter_iovec(bio, iter);
1934
1935 bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
1936
1937 if (zram_bvec_read(zram, &bv, index, offset, bio) < 0) {
1938 atomic64_inc(&zram->stats.failed_reads);
1939 bio->bi_status = BLK_STS_IOERR;
1940 break;
1941 }
1942 flush_dcache_page(bv.bv_page);
1943
1944 zram_slot_lock(zram, index);
1945 zram_accessed(zram, index);
1946 zram_slot_unlock(zram, index);
1947
1948 bio_advance_iter_single(bio, &iter, bv.bv_len);
1949 } while (iter.bi_size);
1950
1951 bio_end_io_acct(bio, start_time);
1952 bio_endio(bio);
1953 }
1954
1955 static void zram_bio_write(struct zram *zram, struct bio *bio)
1956 {
1957 unsigned long start_time = bio_start_io_acct(bio);
1958 struct bvec_iter iter = bio->bi_iter;
1959
1960 do {
1961 u32 index = iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
1962 u32 offset = (iter.bi_sector & (SECTORS_PER_PAGE - 1)) <<
1963 SECTOR_SHIFT;
1964 struct bio_vec bv = bio_iter_iovec(bio, iter);
1965
1966 bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
1967
1968 if (zram_bvec_write(zram, &bv, index, offset, bio) < 0) {
1969 atomic64_inc(&zram->stats.failed_writes);
1970 bio->bi_status = BLK_STS_IOERR;
1971 break;
1972 }
1973
1974 zram_slot_lock(zram, index);
1975 zram_accessed(zram, index);
1976 zram_slot_unlock(zram, index);
1977
1978 bio_advance_iter_single(bio, &iter, bv.bv_len);
1979 } while (iter.bi_size);
1980
1981 bio_end_io_acct(bio, start_time);
1982 bio_endio(bio);
1983 }
1984
1985 /*
1986 * Handler function for all zram I/O requests.
1987 */
1988 static void zram_submit_bio(struct bio *bio)
1989 {
1990 struct zram *zram = bio->bi_bdev->bd_disk->private_data;
1991
1992 switch (bio_op(bio)) {
1993 case REQ_OP_READ:
1994 zram_bio_read(zram, bio);
1995 break;
1996 case REQ_OP_WRITE:
1997 zram_bio_write(zram, bio);
1998 break;
1999 case REQ_OP_DISCARD:
2000 case REQ_OP_WRITE_ZEROES:
2001 zram_bio_discard(zram, bio);
2002 break;
2003 default:
2004 WARN_ON_ONCE(1);
2005 bio_endio(bio);
2006 }
2007 }
2008
2009 static void zram_slot_free_notify(struct block_device *bdev,
2010 unsigned long index)
2011 {
2012 struct zram *zram;
2013
2014 zram = bdev->bd_disk->private_data;
2015
2016 atomic64_inc(&zram->stats.notify_free);
2017 if (!zram_slot_trylock(zram, index)) {
2018 atomic64_inc(&zram->stats.miss_free);
2019 return;
2020 }
2021
2022 zram_free_page(zram, index);
2023 zram_slot_unlock(zram, index);
2024 }
2025
2026 static void zram_destroy_comps(struct zram *zram)
2027 {
2028 u32 prio;
2029
2030 for (prio = 0; prio < ZRAM_MAX_COMPS; prio++) {
2031 struct zcomp *comp = zram->comps[prio];
2032
2033 zram->comps[prio] = NULL;
2034 if (!comp)
2035 continue;
2036 zcomp_destroy(comp);
2037 zram->num_active_comps--;
2038 }
2039
2040 for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++) {
2041 /* Do not free statically defined compression algorithms */
2042 if (zram->comp_algs[prio] != default_compressor)
2043 kfree(zram->comp_algs[prio]);
2044 zram->comp_algs[prio] = NULL;
2045 }
2046 }
2047
2048 static void zram_reset_device(struct zram *zram)
2049 {
2050 down_write(&zram->init_lock);
2051
2052 zram->limit_pages = 0;
2053
2054 set_capacity_and_notify(zram->disk, 0);
2055 part_stat_set_all(zram->disk->part0, 0);
2056
2057 /* I/O operations on all CPUs are done, so it's safe to free */
2058 zram_meta_free(zram, zram->disksize);
2059 zram->disksize = 0;
2060 zram_destroy_comps(zram);
2061 memset(&zram->stats, 0, sizeof(zram->stats));
2062 reset_bdev(zram);
2063
2064 comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor);
2065 up_write(&zram->init_lock);
2066 }
2067
2068 static ssize_t disksize_store(struct device *dev,
2069 struct device_attribute *attr, const char *buf, size_t len)
2070 {
2071 u64 disksize;
2072 struct zcomp *comp;
2073 struct zram *zram = dev_to_zram(dev);
2074 int err;
2075 u32 prio;
2076
2077 disksize = memparse(buf, NULL);
2078 if (!disksize)
2079 return -EINVAL;
2080
2081 down_write(&zram->init_lock);
2082 if (init_done(zram)) {
2083 pr_info("Cannot change disksize for initialized device\n");
2084 err = -EBUSY;
2085 goto out_unlock;
2086 }
2087
2088 disksize = PAGE_ALIGN(disksize);
2089 if (!zram_meta_alloc(zram, disksize)) {
2090 err = -ENOMEM;
2091 goto out_unlock;
2092 }
2093
2094 for (prio = 0; prio < ZRAM_MAX_COMPS; prio++) {
2095 if (!zram->comp_algs[prio])
2096 continue;
2097
2098 comp = zcomp_create(zram->comp_algs[prio]);
2099 if (IS_ERR(comp)) {
2100 pr_err("Cannot initialise %s compressing backend\n",
2101 zram->comp_algs[prio]);
2102 err = PTR_ERR(comp);
2103 goto out_free_comps;
2104 }
2105
2106 zram->comps[prio] = comp;
2107 zram->num_active_comps++;
2108 }
2109 zram->disksize = disksize;
2110 set_capacity_and_notify(zram->disk, zram->disksize >> SECTOR_SHIFT);
2111 up_write(&zram->init_lock);
2112
2113 return len;
2114
2115 out_free_comps:
2116 zram_destroy_comps(zram);
2117 zram_meta_free(zram, disksize);
2118 out_unlock:
2119 up_write(&zram->init_lock);
2120 return err;
2121 }
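
/*
 * Illustrative sysfs usage (not part of the original source; zram0 is an
 * example device name). The size string is parsed by memparse(), so
 * suffixes such as K, M and G are accepted:
 *
 *   echo 1G > /sys/block/zram0/disksize
 *   mkswap /dev/zram0 && swapon /dev/zram0
 */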
2122
2123 static ssize_t reset_store(struct device *dev,
2124 struct device_attribute *attr, const char *buf, size_t len)
2125 {
2126 int ret;
2127 unsigned short do_reset;
2128 struct zram *zram;
2129 struct gendisk *disk;
2130
2131 ret = kstrtou16(buf, 10, &do_reset);
2132 if (ret)
2133 return ret;
2134
2135 if (!do_reset)
2136 return -EINVAL;
2137
2138 zram = dev_to_zram(dev);
2139 disk = zram->disk;
2140
2141 mutex_lock(&disk->open_mutex);
2142 /* Do not reset an active or claimed device */
2143 if (disk_openers(disk) || zram->claim) {
2144 mutex_unlock(&disk->open_mutex);
2145 return -EBUSY;
2146 }
2147
2148 /* From now on, no one can open /dev/zram[0-9] */
2149 zram->claim = true;
2150 mutex_unlock(&disk->open_mutex);
2151
2152 /* Make sure all pending I/O has finished */
2153 sync_blockdev(disk->part0);
2154 zram_reset_device(zram);
2155
2156 mutex_lock(&disk->open_mutex);
2157 zram->claim = false;
2158 mutex_unlock(&disk->open_mutex);
2159
2160 return len;
2161 }
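
/*
 * Illustrative usage (not part of the original source): the device must be
 * idle (no openers) before it can be reset, e.g.
 *
 *   swapoff /dev/zram0          # or umount it, if used as a plain disk
 *   echo 1 > /sys/block/zram0/reset
 */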
2162
2163 static int zram_open(struct gendisk *disk, blk_mode_t mode)
2164 {
2165 struct zram *zram = disk->private_data;
2166
2167 WARN_ON(!mutex_is_locked(&disk->open_mutex));
2168
2169 /* zram was claimed for reset, so the open request fails */
2170 if (zram->claim)
2171 return -EBUSY;
2172 return 0;
2173 }
2174
2175 static const struct block_device_operations zram_devops = {
2176 .open = zram_open,
2177 .submit_bio = zram_submit_bio,
2178 .swap_slot_free_notify = zram_slot_free_notify,
2179 .owner = THIS_MODULE
2180 };
2181
2182 static DEVICE_ATTR_WO(compact);
2183 static DEVICE_ATTR_RW(disksize);
2184 static DEVICE_ATTR_RO(initstate);
2185 static DEVICE_ATTR_WO(reset);
2186 static DEVICE_ATTR_WO(mem_limit);
2187 static DEVICE_ATTR_WO(mem_used_max);
2188 static DEVICE_ATTR_WO(idle);
2189 static DEVICE_ATTR_RW(max_comp_streams);
2190 static DEVICE_ATTR_RW(comp_algorithm);
2191 #ifdef CONFIG_ZRAM_WRITEBACK
2192 static DEVICE_ATTR_RW(backing_dev);
2193 static DEVICE_ATTR_WO(writeback);
2194 static DEVICE_ATTR_RW(writeback_limit);
2195 static DEVICE_ATTR_RW(writeback_limit_enable);
2196 #endif
2197 #ifdef CONFIG_ZRAM_MULTI_COMP
2198 static DEVICE_ATTR_RW(recomp_algorithm);
2199 static DEVICE_ATTR_WO(recompress);
2200 #endif
2201 #ifdef CONFIG_ZRAM_GROUP
2202 static DEVICE_ATTR_RW(group);
2203 #endif
2204
2205 static struct attribute *zram_disk_attrs[] = {
2206 &dev_attr_disksize.attr,
2207 &dev_attr_initstate.attr,
2208 &dev_attr_reset.attr,
2209 &dev_attr_compact.attr,
2210 &dev_attr_mem_limit.attr,
2211 &dev_attr_mem_used_max.attr,
2212 &dev_attr_idle.attr,
2213 &dev_attr_max_comp_streams.attr,
2214 &dev_attr_comp_algorithm.attr,
2215 #ifdef CONFIG_ZRAM_WRITEBACK
2216 &dev_attr_backing_dev.attr,
2217 &dev_attr_writeback.attr,
2218 &dev_attr_writeback_limit.attr,
2219 &dev_attr_writeback_limit_enable.attr,
2220 #endif
2221 &dev_attr_io_stat.attr,
2222 &dev_attr_mm_stat.attr,
2223 #ifdef CONFIG_ZRAM_WRITEBACK
2224 &dev_attr_bd_stat.attr,
2225 #endif
2226 &dev_attr_debug_stat.attr,
2227 #ifdef CONFIG_ZRAM_MULTI_COMP
2228 &dev_attr_recomp_algorithm.attr,
2229 &dev_attr_recompress.attr,
2230 #endif
2231 #ifdef CONFIG_ZRAM_GROUP
2232 &dev_attr_group.attr,
2233 #endif
2234 NULL,
2235 };
2236
2237 ATTRIBUTE_GROUPS(zram_disk);
2238
2239 /*
2240 * Allocate and initialize a new zram device. The function returns
2241 * a device_id ('>= 0') upon success, and a negative value otherwise.
2242 */
2243 static int zram_add(void)
2244 {
2245 struct zram *zram;
2246 int ret, device_id;
2247
2248 zram = kzalloc(sizeof(struct zram), GFP_KERNEL);
2249 if (!zram)
2250 return -ENOMEM;
2251
2252 ret = idr_alloc(&zram_index_idr, zram, 0, 0, GFP_KERNEL);
2253 if (ret < 0)
2254 goto out_free_dev;
2255 device_id = ret;
2256
2257 init_rwsem(&zram->init_lock);
2258 #ifdef CONFIG_ZRAM_WRITEBACK
2259 spin_lock_init(&zram->wb_limit_lock);
2260 #endif
2261
2262 /* gendisk structure */
2263 zram->disk = blk_alloc_disk(NUMA_NO_NODE);
2264 if (!zram->disk) {
2265 pr_err("Error allocating disk structure for device %d\n",
2266 device_id);
2267 ret = -ENOMEM;
2268 goto out_free_idr;
2269 }
2270
2271 zram->disk->major = zram_major;
2272 zram->disk->first_minor = device_id;
2273 zram->disk->minors = 1;
2274 zram->disk->flags |= GENHD_FL_NO_PART;
2275 zram->disk->fops = &zram_devops;
2276 zram->disk->private_data = zram;
2277 snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
2278
2279 comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor);
2280
2281 /* Actual capacity is set via sysfs (/sys/block/zram<id>/disksize) */
2282 set_capacity(zram->disk, 0);
2283 /* zram devices sort of resemble non-rotational disks */
2284 blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue);
2285 blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue);
2286
2287 /*
2288 * To ensure that we always get PAGE_SIZE-aligned
2289 * and n*PAGE_SIZE-sized I/O requests.
2290 */
2291 blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE);
2292 blk_queue_logical_block_size(zram->disk->queue,
2293 ZRAM_LOGICAL_BLOCK_SIZE);
2294 blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
2295 blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
2296 zram->disk->queue->limits.discard_granularity = PAGE_SIZE;
2297 blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX);
2298
2299 /*
2300 * zram_bio_discard() will clear all logical blocks if the logical block
2301 * size is identical to the physical block size (PAGE_SIZE). But if they
2302 * differ, we skip discarding the parts of logical blocks that fall in
2303 * the portion of the request range which isn't aligned to the physical
2304 * block size. So we can't guarantee that all discarded logical blocks
2305 * are zeroed.
2306 */
2307 if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
2308 blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
2309
2310 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue);
2311 ret = device_add_disk(NULL, zram->disk, zram_disk_groups);
2312 if (ret)
2313 goto out_cleanup_disk;
2314
2315 zram_debugfs_register(zram);
2316 pr_info("Added device: %s\n", zram->disk->disk_name);
2317 return device_id;
2318
2319 out_cleanup_disk:
2320 put_disk(zram->disk);
2321 out_free_idr:
2322 idr_remove(&zram_index_idr, device_id);
2323 out_free_dev:
2324 kfree(zram);
2325 return ret;
2326 }
2327
2328 static int zram_remove(struct zram *zram)
2329 {
2330 bool claimed;
2331
2332 mutex_lock(&zram->disk->open_mutex);
2333 if (disk_openers(zram->disk)) {
2334 mutex_unlock(&zram->disk->open_mutex);
2335 return -EBUSY;
2336 }
2337
2338 claimed = zram->claim;
2339 if (!claimed)
2340 zram->claim = true;
2341 mutex_unlock(&zram->disk->open_mutex);
2342
2343 zram_debugfs_unregister(zram);
2344
2345 if (claimed) {
2346 /*
2347 * If we were claimed by reset_store(), del_gendisk() will
2348 * wait until reset_store() is done, so there is nothing to do.
2349 */
2350 ;
2351 } else {
2352 /* Make sure all pending I/O has finished */
2353 sync_blockdev(zram->disk->part0);
2354 zram_reset_device(zram);
2355 }
2356
2357 pr_info("Removed device: %s\n", zram->disk->disk_name);
2358
2359 del_gendisk(zram->disk);
2360
2361 /* del_gendisk drains pending reset_store */
2362 WARN_ON_ONCE(claimed && zram->claim);
2363
2364 /*
2365 * disksize_store() may be called in between zram_reset_device()
2366 * and del_gendisk(), so run one final reset to avoid leaking
2367 * anything allocated by disksize_store().
2368 */
2369 zram_reset_device(zram);
2370
2371 put_disk(zram->disk);
2372 kfree(zram);
2373 return 0;
2374 }
2375
2376 /* zram-control sysfs attributes */
2377
2378 /*
2379 * NOTE: hot_add is not a usual read-only sysfs attribute, in the sense
2380 * that reading from this file does alter the state of the system: it
2381 * creates a new uninitialized zram device and returns that device's
2382 * device_id (or an error code if it fails to create a new device).
2383 */
2384 static ssize_t hot_add_show(const struct class *class,
2385 const struct class_attribute *attr,
2386 char *buf)
2387 {
2388 int ret;
2389
2390 mutex_lock(&zram_index_mutex);
2391 ret = zram_add();
2392 mutex_unlock(&zram_index_mutex);
2393
2394 if (ret < 0)
2395 return ret;
2396 return scnprintf(buf, PAGE_SIZE, "%d\n", ret);
2397 }
2398 /* This attribute must be set to 0400, so CLASS_ATTR_RO() cannot be used */
2399 static struct class_attribute class_attr_hot_add =
2400 __ATTR(hot_add, 0400, hot_add_show, NULL);
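
/*
 * Illustrative usage (not part of the original source): reading hot_add
 * allocates a new device and prints its id, which can then be configured
 * under /sys/block/zram<id>/:
 *
 *   cat /sys/class/zram-control/hot_add
 *   1
 */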
2401
2402 static ssize_t hot_remove_store(const struct class *class,
2403 const struct class_attribute *attr,
2404 const char *buf,
2405 size_t count)
2406 {
2407 struct zram *zram;
2408 int ret, dev_id;
2409
2410 /* dev_id is gendisk->first_minor, which is `int' */
2411 ret = kstrtoint(buf, 10, &dev_id);
2412 if (ret)
2413 return ret;
2414 if (dev_id < 0)
2415 return -EINVAL;
2416
2417 mutex_lock(&zram_index_mutex);
2418
2419 zram = idr_find(&zram_index_idr, dev_id);
2420 if (zram) {
2421 ret = zram_remove(zram);
2422 if (!ret)
2423 idr_remove(&zram_index_idr, dev_id);
2424 } else {
2425 ret = -ENODEV;
2426 }
2427
2428 mutex_unlock(&zram_index_mutex);
2429 return ret ? ret : count;
2430 }
2431 static CLASS_ATTR_WO(hot_remove);
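
/*
 * Illustrative usage (not part of the original source): removing the device
 * created above. Removal fails with -EBUSY while the device is still open:
 *
 *   echo 1 > /sys/class/zram-control/hot_remove
 */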
2432
2433 static struct attribute *zram_control_class_attrs[] = {
2434 &class_attr_hot_add.attr,
2435 &class_attr_hot_remove.attr,
2436 NULL,
2437 };
2438 ATTRIBUTE_GROUPS(zram_control_class);
2439
2440 static struct class zram_control_class = {
2441 .name = "zram-control",
2442 .class_groups = zram_control_class_groups,
2443 };
2444
2445 static int zram_remove_cb(int id, void *ptr, void *data)
2446 {
2447 WARN_ON_ONCE(zram_remove(ptr));
2448 return 0;
2449 }
2450
2451 static void destroy_devices(void)
2452 {
2453 class_unregister(&zram_control_class);
2454 idr_for_each(&zram_index_idr, &zram_remove_cb, NULL);
2455 zram_debugfs_destroy();
2456 idr_destroy(&zram_index_idr);
2457 unregister_blkdev(zram_major, "zram");
2458 cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
2459 }
2460
2461 static int __init zram_init(void)
2462 {
2463 int ret;
2464
2465 BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > BITS_PER_LONG);
2466
2467 ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare",
2468 zcomp_cpu_up_prepare, zcomp_cpu_dead);
2469 if (ret < 0)
2470 return ret;
2471
2472 ret = class_register(&zram_control_class);
2473 if (ret) {
2474 pr_err("Unable to register zram-control class\n");
2475 cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
2476 return ret;
2477 }
2478
2479 zram_debugfs_create();
2480 zram_major = register_blkdev(0, "zram");
2481 if (zram_major <= 0) {
2482 pr_err("Unable to get major number\n");
2483 class_unregister(&zram_control_class);
2484 cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
2485 return -EBUSY;
2486 }
2487
2488 while (num_devices != 0) {
2489 mutex_lock(&zram_index_mutex);
2490 ret = zram_add();
2491 mutex_unlock(&zram_index_mutex);
2492 if (ret < 0)
2493 goto out_error;
2494 num_devices--;
2495 }
2496
2497 return 0;
2498
2499 out_error:
2500 destroy_devices();
2501 return ret;
2502 }
2503
2504 static void __exit zram_exit(void)
2505 {
2506 destroy_devices();
2507 }
2508
2509 module_init(zram_init);
2510 module_exit(zram_exit);
2511
2512 module_param(num_devices, uint, 0);
2513 MODULE_PARM_DESC(num_devices, "Number of pre-created zram devices");
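
/*
 * Illustrative usage (not part of the original source):
 *
 *   modprobe zram num_devices=4
 *
 * creates /dev/zram0 .. /dev/zram3 at load time; additional devices can be
 * added later through /sys/class/zram-control/hot_add.
 */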
2514
2515 MODULE_LICENSE("Dual BSD/GPL");
2516 MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
2517 MODULE_DESCRIPTION("Compressed RAM Block Device");
2518