1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/memcontrol.h>
3 #include <linux/rwsem.h>
4 #include <linux/shrinker.h>
5 #include <linux/rculist.h>
6 #include <trace/events/vmscan.h>
7
8 #undef CREATE_TRACE_POINTS
9 #include <trace/hooks/vmscan.h>
10
11 #include "internal.h"
12
/* Global list of registered shrinkers; walked under RCU by shrink_slab(). */
LIST_HEAD(shrinker_list);
/* Serializes shrinker (un)registration and shrinker_info expansion. */
DEFINE_MUTEX(shrinker_mutex);
15
16 #ifdef CONFIG_MEMCG
17 static int shrinker_nr_max;
18
shrinker_unit_size(int nr_items)19 static inline int shrinker_unit_size(int nr_items)
20 {
21 return (DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS) * sizeof(struct shrinker_info_unit *));
22 }
23
shrinker_unit_free(struct shrinker_info * info,int start)24 static inline void shrinker_unit_free(struct shrinker_info *info, int start)
25 {
26 struct shrinker_info_unit **unit;
27 int nr, i;
28
29 if (!info)
30 return;
31
32 unit = info->unit;
33 nr = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS);
34
35 for (i = start; i < nr; i++) {
36 if (!unit[i])
37 break;
38
39 kfree(unit[i]);
40 unit[i] = NULL;
41 }
42 }
43
shrinker_unit_alloc(struct shrinker_info * new,struct shrinker_info * old,int nid)44 static inline int shrinker_unit_alloc(struct shrinker_info *new,
45 struct shrinker_info *old, int nid)
46 {
47 struct shrinker_info_unit *unit;
48 int nr = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS);
49 int start = old ? DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS) : 0;
50 int i;
51
52 for (i = start; i < nr; i++) {
53 unit = kzalloc_node(sizeof(*unit), GFP_KERNEL, nid);
54 if (!unit) {
55 shrinker_unit_free(new, start);
56 return -ENOMEM;
57 }
58
59 new->unit[i] = unit;
60 }
61
62 return 0;
63 }
64
/*
 * Free the per-node shrinker_info of @memcg.
 *
 * Uses rcu_dereference_protected(..., true) and immediate kvfree() rather
 * than RCU-deferred freeing — NOTE(review): this assumes no reader can
 * still be walking these shrinker_info structures (memcg teardown);
 * confirm against the callers.
 */
void free_shrinker_info(struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_node *pn;
	struct shrinker_info *info;
	int nid;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		/* condition 'true': caller guarantees exclusive access */
		info = rcu_dereference_protected(pn->shrinker_info, true);
		shrinker_unit_free(info, 0);
		kvfree(info);
		rcu_assign_pointer(pn->shrinker_info, NULL);
	}
}
79
alloc_shrinker_info(struct mem_cgroup * memcg)80 int alloc_shrinker_info(struct mem_cgroup *memcg)
81 {
82 int nid, ret = 0;
83 int array_size = 0;
84
85 mutex_lock(&shrinker_mutex);
86 array_size = shrinker_unit_size(shrinker_nr_max);
87 for_each_node(nid) {
88 struct shrinker_info *info = kvzalloc_node(sizeof(*info) + array_size,
89 GFP_KERNEL, nid);
90 if (!info)
91 goto err;
92 info->map_nr_max = shrinker_nr_max;
93 if (shrinker_unit_alloc(info, NULL, nid)) {
94 kvfree(info);
95 goto err;
96 }
97 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
98 }
99 mutex_unlock(&shrinker_mutex);
100
101 return ret;
102
103 err:
104 mutex_unlock(&shrinker_mutex);
105 free_shrinker_info(memcg);
106 return -ENOMEM;
107 }
108
/*
 * Dereference @memcg's per-node shrinker_info; legal only while holding
 * shrinker_mutex (enforced via lockdep).
 */
static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
						     int nid)
{
	return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
					 lockdep_is_held(&shrinker_mutex));
}
115
/*
 * Grow @memcg's per-node shrinker_info to hold @new_nr_max shrinker ids.
 *
 * For each node: allocate a larger info, copy the existing unit pointers
 * (@old_size bytes), allocate units for the new tail, then publish the new
 * info with rcu_assign_pointer() and free the old one after a grace period
 * via kvfree_rcu() — lockless readers in shrink_slab_memcg() may still
 * hold the old pointer.
 *
 * Caller must hold shrinker_mutex. Returns 0 or -ENOMEM.
 */
static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size,
				    int old_size, int new_nr_max)
{
	struct shrinker_info *new, *old;
	struct mem_cgroup_per_node *pn;
	int nid;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		old = shrinker_info_protected(memcg, nid);
		/* Not yet online memcg */
		if (!old)
			return 0;

		/* Already expanded this shrinker_info */
		if (new_nr_max <= old->map_nr_max)
			continue;

		new = kvzalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid);
		if (!new)
			return -ENOMEM;

		new->map_nr_max = new_nr_max;

		/* carry over the already-allocated unit pointers */
		memcpy(new->unit, old->unit, old_size);
		if (shrinker_unit_alloc(new, old, nid)) {
			kvfree(new);
			return -ENOMEM;
		}

		rcu_assign_pointer(pn->shrinker_info, new);
		/* old info may still be in use by RCU readers */
		kvfree_rcu(old, rcu);
	}

	return 0;
}
152
/*
 * Expand the shrinker_info of every memcg so it can hold shrinker id
 * @new_id; the new capacity is rounded up to a whole unit
 * (SHRINKER_UNIT_BITS). On success (or when there is no root memcg yet)
 * shrinker_nr_max is bumped to the new capacity.
 *
 * Caller must hold shrinker_mutex.
 */
static int expand_shrinker_info(int new_id)
{
	int ret = 0;
	int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS);
	int new_size, old_size = 0;
	struct mem_cgroup *memcg;

	/* no memcgs exist yet; just record the larger capacity */
	if (!root_mem_cgroup)
		goto out;

	lockdep_assert_held(&shrinker_mutex);

	new_size = shrinker_unit_size(new_nr_max);
	old_size = shrinker_unit_size(shrinker_nr_max);

	/* walk the whole memcg hierarchy */
	memcg = mem_cgroup_iter(NULL, NULL, NULL);
	do {
		ret = expand_one_shrinker_info(memcg, new_size, old_size,
					       new_nr_max);
		if (ret) {
			mem_cgroup_iter_break(NULL, memcg);
			goto out;
		}
	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
out:
	if (!ret)
		shrinker_nr_max = new_nr_max;

	return ret;
}
183
/* Index of the shrinker_info_unit that holds @shrinker_id. */
static inline int shrinker_id_to_index(int shrinker_id)
{
	return shrinker_id / SHRINKER_UNIT_BITS;
}
188
/* Bit offset of @shrinker_id within its shrinker_info_unit. */
static inline int shrinker_id_to_offset(int shrinker_id)
{
	return shrinker_id % SHRINKER_UNIT_BITS;
}
193
/* Inverse of the two helpers above: (unit index, bit offset) -> shrinker id. */
static inline int calc_shrinker_id(int index, int offset)
{
	return index * SHRINKER_UNIT_BITS + offset;
}
198
set_shrinker_bit(struct mem_cgroup * memcg,int nid,int shrinker_id)199 void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
200 {
201 if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
202 struct shrinker_info *info;
203 struct shrinker_info_unit *unit;
204
205 rcu_read_lock();
206 info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
207 unit = info->unit[shrinker_id_to_index(shrinker_id)];
208 if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
209 /* Pairs with smp mb in shrink_slab() */
210 smp_mb__before_atomic();
211 set_bit(shrinker_id_to_offset(shrinker_id), unit->map);
212 }
213 rcu_read_unlock();
214 }
215 }
216
217 static DEFINE_IDR(shrinker_idr);
218
shrinker_memcg_alloc(struct shrinker * shrinker)219 static int shrinker_memcg_alloc(struct shrinker *shrinker)
220 {
221 int id, ret = -ENOMEM;
222
223 if (mem_cgroup_disabled())
224 return -ENOSYS;
225
226 mutex_lock(&shrinker_mutex);
227 id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
228 if (id < 0)
229 goto unlock;
230
231 if (id >= shrinker_nr_max) {
232 if (expand_shrinker_info(id)) {
233 idr_remove(&shrinker_idr, id);
234 goto unlock;
235 }
236 }
237 shrinker->id = id;
238 ret = 0;
239 unlock:
240 mutex_unlock(&shrinker_mutex);
241 return ret;
242 }
243
/*
 * Release @shrinker's id back to the IDR. Caller must hold shrinker_mutex
 * (asserted below); the id must have been assigned by shrinker_memcg_alloc().
 */
static void shrinker_memcg_remove(struct shrinker *shrinker)
{
	int id = shrinker->id;

	BUG_ON(id < 0);

	lockdep_assert_held(&shrinker_mutex);

	idr_remove(&shrinker_idr, id);
}
254
xchg_nr_deferred_memcg(int nid,struct shrinker * shrinker,struct mem_cgroup * memcg)255 static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
256 struct mem_cgroup *memcg)
257 {
258 struct shrinker_info *info;
259 struct shrinker_info_unit *unit;
260 long nr_deferred;
261
262 rcu_read_lock();
263 info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
264 unit = info->unit[shrinker_id_to_index(shrinker->id)];
265 nr_deferred = atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0);
266 rcu_read_unlock();
267
268 return nr_deferred;
269 }
270
add_nr_deferred_memcg(long nr,int nid,struct shrinker * shrinker,struct mem_cgroup * memcg)271 static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
272 struct mem_cgroup *memcg)
273 {
274 struct shrinker_info *info;
275 struct shrinker_info_unit *unit;
276 long nr_deferred;
277
278 rcu_read_lock();
279 info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
280 unit = info->unit[shrinker_id_to_index(shrinker->id)];
281 nr_deferred =
282 atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]);
283 rcu_read_unlock();
284
285 return nr_deferred;
286 }
287
/*
 * Fold a dying @memcg's deferred-scan counts into its parent (or the root
 * memcg if there is no parent), so deferred reclaim work is not lost when
 * the cgroup goes away.
 *
 * shrinker_mutex is held to keep both shrinker_info structures stable
 * against concurrent expansion while they are walked.
 */
void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
	int nid, index, offset;
	long nr;
	struct mem_cgroup *parent;
	struct shrinker_info *child_info, *parent_info;
	struct shrinker_info_unit *child_unit, *parent_unit;

	parent = parent_mem_cgroup(memcg);
	if (!parent)
		parent = root_mem_cgroup;

	/* Prevent from concurrent shrinker_info expand */
	mutex_lock(&shrinker_mutex);
	for_each_node(nid) {
		child_info = shrinker_info_protected(memcg, nid);
		parent_info = shrinker_info_protected(parent, nid);
		for (index = 0; index < shrinker_id_to_index(child_info->map_nr_max); index++) {
			child_unit = child_info->unit[index];
			parent_unit = parent_info->unit[index];
			for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) {
				nr = atomic_long_read(&child_unit->nr_deferred[offset]);
				atomic_long_add(nr, &parent_unit->nr_deferred[offset]);
			}
		}
	}
	mutex_unlock(&shrinker_mutex);
}
316 #else
/* !CONFIG_MEMCG stub: memcg-aware shrinkers are not supported. */
static int shrinker_memcg_alloc(struct shrinker *shrinker)
{
	return -ENOSYS;
}
321
/* !CONFIG_MEMCG stub: nothing to remove. */
static void shrinker_memcg_remove(struct shrinker *shrinker)
{
}
325
/* !CONFIG_MEMCG stub: no per-memcg deferred counts. */
static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	return 0;
}
331
/* !CONFIG_MEMCG stub: no per-memcg deferred counts. */
static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	return 0;
}
337 #endif /* CONFIG_MEMCG */
338
xchg_nr_deferred(struct shrinker * shrinker,struct shrink_control * sc)339 static long xchg_nr_deferred(struct shrinker *shrinker,
340 struct shrink_control *sc)
341 {
342 int nid = sc->nid;
343
344 if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
345 nid = 0;
346
347 if (sc->memcg &&
348 (shrinker->flags & SHRINKER_MEMCG_AWARE))
349 return xchg_nr_deferred_memcg(nid, shrinker,
350 sc->memcg);
351
352 return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
353 }
354
355
add_nr_deferred(long nr,struct shrinker * shrinker,struct shrink_control * sc)356 static long add_nr_deferred(long nr, struct shrinker *shrinker,
357 struct shrink_control *sc)
358 {
359 int nid = sc->nid;
360
361 if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
362 nid = 0;
363
364 if (sc->memcg &&
365 (shrinker->flags & SHRINKER_MEMCG_AWARE))
366 return add_nr_deferred_memcg(nr, nid, shrinker,
367 sc->memcg);
368
369 return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
370 }
371
372 #define SHRINK_BATCH 128
373
/*
 * do_shrink_slab - run one shrinker for one (node, memcg) context.
 *
 * Computes a scan target from the shrinker's freeable-object count, the
 * reclaim @priority and previously deferred work, then calls
 * ->scan_objects() in batches until the target is met or the shrinker
 * returns SHRINK_STOP. Unfinished work is re-deferred for the next pass.
 *
 * Returns the number of objects freed, or SHRINK_EMPTY when the shrinker
 * reports nothing freeable.
 */
static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
				    struct shrinker *shrinker, int priority)
{
	unsigned long freed = 0;
	unsigned long long delta;
	long total_scan;
	long freeable;
	long nr;
	long new_nr;
	long batch_size = shrinker->batch ? shrinker->batch
					  : SHRINK_BATCH;
	long scanned = 0, next_deferred;

	freeable = shrinker->count_objects(shrinker, shrinkctl);
	/* Android vendor hooks may adjust the reported freeable count */
	trace_android_vh_do_shrink_slab(shrinker, &freeable);
	trace_android_vh_do_shrink_slab_ex(shrinkctl, shrinker, &freeable, priority);
	if (freeable == 0 || freeable == SHRINK_EMPTY)
		return freeable;

	/*
	 * copy the current shrinker scan count into a local variable
	 * and zero it so that other concurrent shrinker invocations
	 * don't also do this scanning work.
	 */
	nr = xchg_nr_deferred(shrinker, shrinkctl);

	if (shrinker->seeks) {
		/* scale the scan target by priority and the cost of recreating objects */
		delta = freeable >> priority;
		delta *= 4;
		do_div(delta, shrinker->seeks);
	} else {
		/*
		 * These objects don't require any IO to create. Trim
		 * them aggressively under memory pressure to keep
		 * them from causing refetches in the IO caches.
		 */
		delta = freeable / 2;
	}

	total_scan = nr >> priority;
	total_scan += delta;
	/* cap the target so one pass never scans more than 2x the cache */
	total_scan = min(total_scan, (2 * freeable));

	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
				   freeable, delta, total_scan, priority);

	/*
	 * Normally, we should not scan less than batch_size objects in one
	 * pass to avoid too frequent shrinker calls, but if the slab has less
	 * than batch_size objects in total and we are really tight on memory,
	 * we will try to reclaim all available objects, otherwise we can end
	 * up failing allocations although there are plenty of reclaimable
	 * objects spread over several slabs with usage less than the
	 * batch_size.
	 *
	 * We detect the "tight on memory" situations by looking at the total
	 * number of objects we want to scan (total_scan). If it is greater
	 * than the total number of objects on slab (freeable), we must be
	 * scanning at high prio and therefore should try to reclaim as much as
	 * possible.
	 */
	while (total_scan >= batch_size ||
	       total_scan >= freeable) {
		unsigned long ret;
		unsigned long nr_to_scan = min(batch_size, total_scan);

		shrinkctl->nr_to_scan = nr_to_scan;
		shrinkctl->nr_scanned = nr_to_scan;
		ret = shrinker->scan_objects(shrinker, shrinkctl);
		if (ret == SHRINK_STOP)
			break;
		freed += ret;

		/* the shrinker may report scanning fewer objects than asked */
		count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
		total_scan -= shrinkctl->nr_scanned;
		scanned += shrinkctl->nr_scanned;

		cond_resched();
	}

	/*
	 * The deferred work is increased by any new work (delta) that wasn't
	 * done, decreased by old deferred work that was done now.
	 *
	 * And it is capped to two times of the freeable items.
	 */
	next_deferred = max_t(long, (nr + delta - scanned), 0);
	next_deferred = min(next_deferred, (2 * freeable));

	/*
	 * move the unused scan count back into the shrinker in a
	 * manner that handles concurrent updates.
	 */
	new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);

	trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
	return freed;
}
472
473 #ifdef CONFIG_MEMCG
/*
 * shrink_slab_memcg - shrink the memcg-aware shrinkers flagged for @memcg
 * on node @nid.
 *
 * Walks the memcg's shrinker bitmap; for each set bit, looks the shrinker
 * up in shrinker_idr, pins it with shrinker_try_get() and runs
 * do_shrink_slab(). Returns the total number of objects freed.
 */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
				       struct mem_cgroup *memcg, int priority)
{
	struct shrinker_info *info;
	unsigned long ret, freed = 0;
	int offset, index = 0;

	if (!mem_cgroup_online(memcg))
		return 0;

	/*
	 * lockless algorithm of memcg shrink.
	 *
	 * The shrinker_info may be freed asynchronously via RCU in the
	 * expand_one_shrinker_info(), so the rcu_read_lock() needs to be used
	 * to ensure the existence of the shrinker_info.
	 *
	 * The shrinker_info_unit is never freed unless its corresponding memcg
	 * is destroyed. Here we already hold the refcount of memcg, so the
	 * memcg will not be destroyed, and of course shrinker_info_unit will
	 * not be freed.
	 *
	 * So in the memcg shrink:
	 * step 1: use rcu_read_lock() to guarantee existence of the
	 *         shrinker_info.
	 * step 2: after getting shrinker_info_unit we can safely release the
	 *         RCU lock.
	 * step 3: traverse the bitmap and calculate shrinker_id
	 * step 4: use rcu_read_lock() to guarantee existence of the shrinker.
	 * step 5: use shrinker_id to find the shrinker, then use
	 *         shrinker_try_get() to guarantee existence of the shrinker,
	 *         then we can release the RCU lock to do do_shrink_slab() that
	 *         may sleep.
	 * step 6: do shrinker_put() paired with step 5 to put the refcount,
	 *         if the refcount reaches 0, then wake up the waiter in
	 *         shrinker_free() by calling complete().
	 *         Note: here is different from the global shrink, we don't
	 *               need to acquire the RCU lock to guarantee existence of
	 *               the shrinker, because we don't need to use this
	 *               shrinker to traverse the next shrinker in the bitmap.
	 * step 7: we have already exited the read-side of rcu critical section
	 *         before calling do_shrink_slab(), the shrinker_info may be
	 *         released in expand_one_shrinker_info(), so go back to step 1
	 *         to reacquire the shrinker_info.
	 */
again:
	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	if (unlikely(!info))
		goto unlock;

	if (index < shrinker_id_to_index(info->map_nr_max)) {
		struct shrinker_info_unit *unit;

		unit = info->unit[index];

		rcu_read_unlock();

		for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) {
			struct shrink_control sc = {
				.gfp_mask = gfp_mask,
				.nid = nid,
				.memcg = memcg,
			};
			struct shrinker *shrinker;
			int shrinker_id = calc_shrinker_id(index, offset);

			rcu_read_lock();
			shrinker = idr_find(&shrinker_idr, shrinker_id);
			if (unlikely(!shrinker || !shrinker_try_get(shrinker))) {
				clear_bit(offset, unit->map);
				rcu_read_unlock();
				continue;
			}
			rcu_read_unlock();

			/* Call non-slab shrinkers even though kmem is disabled */
			if (!memcg_kmem_online() &&
			    !(shrinker->flags & SHRINKER_NONSLAB)) {
				/*
				 * Drop the reference taken by
				 * shrinker_try_get() above. Skipping it
				 * leaked a reference per pass, so a skipped
				 * shrinker's refcount could never reach zero
				 * and shrinker_free() would block forever in
				 * wait_for_completion().
				 */
				shrinker_put(shrinker);
				continue;
			}

			ret = do_shrink_slab(&sc, shrinker, priority);
			if (ret == SHRINK_EMPTY) {
				clear_bit(offset, unit->map);
				/*
				 * After the shrinker reported that it had no objects to
				 * free, but before we cleared the corresponding bit in
				 * the memcg shrinker map, a new object might have been
				 * added. To make sure, we have the bit set in this
				 * case, we invoke the shrinker one more time and reset
				 * the bit if it reports that it is not empty anymore.
				 * The memory barrier here pairs with the barrier in
				 * set_shrinker_bit():
				 *
				 * list_lru_add()        shrink_slab_memcg()
				 *   list_add_tail()       clear_bit()
				 *   <MB>                  <MB>
				 *   set_bit()             do_shrink_slab()
				 */
				smp_mb__after_atomic();
				ret = do_shrink_slab(&sc, shrinker, priority);
				if (ret == SHRINK_EMPTY)
					ret = 0;
				else
					set_shrinker_bit(memcg, nid, shrinker_id);
			}
			freed += ret;
			shrinker_put(shrinker);
		}

		index++;
		goto again;
	}
unlock:
	rcu_read_unlock();
	return freed;
}
591 #else /* !CONFIG_MEMCG */
/* !CONFIG_MEMCG stub: there are no memcg-aware shrinkers to run. */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
				       struct mem_cgroup *memcg, int priority)
{
	return 0;
}
597 #endif /* CONFIG_MEMCG */
598
599 /**
600 * shrink_slab - shrink slab caches
601 * @gfp_mask: allocation context
602 * @nid: node whose slab caches to target
603 * @memcg: memory cgroup whose slab caches to target
604 * @priority: the reclaim priority
605 *
606 * Call the shrink functions to age shrinkable caches.
607 *
608 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
609 * unaware shrinkers will receive a node id of 0 instead.
610 *
611 * @memcg specifies the memory cgroup to target. Unaware shrinkers
612 * are called only if it is the root cgroup.
613 *
614 * @priority is sc->priority, we take the number of objects and >> by priority
615 * in order to get the scan target.
616 *
617 * Returns the number of reclaimed slab objects.
618 */
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
			  int priority)
{
	unsigned long ret, freed = 0;
	struct shrinker *shrinker;
	bool bypass = false;

	/* Android vendor hook: a module may veto this shrink pass entirely */
	trace_android_vh_shrink_slab_bypass(gfp_mask, nid, memcg, priority, &bypass);
	if (bypass)
		return 0;

	/*
	 * The root memcg might be allocated even though memcg is disabled
	 * via "cgroup_disable=memory" boot parameter. This could make
	 * mem_cgroup_is_root() return false, then just run memcg slab
	 * shrink, but skip global shrink. This may result in premature
	 * oom.
	 */
	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

	/*
	 * lockless algorithm of global shrink.
	 *
	 * In the unregistration setp, the shrinker will be freed asynchronously
	 * via RCU after its refcount reaches 0. So both rcu_read_lock() and
	 * shrinker_try_get() can be used to ensure the existence of the shrinker.
	 *
	 * So in the global shrink:
	 * step 1: use rcu_read_lock() to guarantee existence of the shrinker
	 *         and the validity of the shrinker_list walk.
	 * step 2: use shrinker_try_get() to try get the refcount, if successful,
	 *         then the existence of the shrinker can also be guaranteed,
	 *         so we can release the RCU lock to do do_shrink_slab() that
	 *         may sleep.
	 * step 3: *MUST* to reacquire the RCU lock before calling shrinker_put(),
	 *         which ensures that neither this shrinker nor the next shrinker
	 *         will be freed in the next traversal operation.
	 * step 4: do shrinker_put() paired with step 2 to put the refcount,
	 *         if the refcount reaches 0, then wake up the waiter in
	 *         shrinker_free() by calling complete().
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
		struct shrink_control sc = {
			.gfp_mask = gfp_mask,
			.nid = nid,
			.memcg = memcg,
		};

		/* registration not complete, or shrinker being freed: skip */
		if (!shrinker_try_get(shrinker))
			continue;

		rcu_read_unlock();

		ret = do_shrink_slab(&sc, shrinker, priority);
		if (ret == SHRINK_EMPTY)
			ret = 0;
		freed += ret;

		/* see step 3 above: reacquire RCU before dropping our ref */
		rcu_read_lock();
		shrinker_put(shrinker);
	}

	rcu_read_unlock();
	cond_resched();
	return freed;
}
687 EXPORT_SYMBOL_GPL(shrink_slab);
688
/*
 * shrinker_alloc - allocate and partially initialize a shrinker.
 * @flags: SHRINKER_* behavior flags
 * @fmt:   printf-style name for debugfs
 *
 * For SHRINKER_MEMCG_AWARE shrinkers, assigns a memcg shrinker id; when
 * memcg is unavailable (-ENOSYS) the flag is cleared and the shrinker
 * falls back to a plain per-node nr_deferred array. The caller completes
 * setup and then calls shrinker_register().
 *
 * Returns the new shrinker, or NULL on allocation failure.
 */
struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...)
{
	struct shrinker *shrinker;
	unsigned int size;
	va_list ap;
	int err;

	shrinker = kzalloc(sizeof(struct shrinker), GFP_KERNEL);
	if (!shrinker)
		return NULL;

	va_start(ap, fmt);
	err = shrinker_debugfs_name_alloc(shrinker, fmt, ap);
	va_end(ap);
	if (err)
		goto err_name;

	shrinker->flags = flags | SHRINKER_ALLOCATED;
	shrinker->seeks = DEFAULT_SEEKS;

	if (flags & SHRINKER_MEMCG_AWARE) {
		err = shrinker_memcg_alloc(shrinker);
		if (err == -ENOSYS) {
			/* Memcg is not supported, fallback to non-memcg-aware shrinker. */
			shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
			goto non_memcg;
		}

		if (err)
			goto err_flags;

		/* memcg-aware: nr_deferred lives in per-memcg shrinker_info */
		return shrinker;
	}

non_memcg:
	/*
	 * The nr_deferred is available on per memcg level for memcg aware
	 * shrinkers, so only allocate nr_deferred in the following cases:
	 *  - non-memcg-aware shrinkers
	 *  - !CONFIG_MEMCG
	 *  - memcg is disabled by kernel command line
	 */
	size = sizeof(*shrinker->nr_deferred);
	if (flags & SHRINKER_NUMA_AWARE)
		size *= nr_node_ids;

	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
	if (!shrinker->nr_deferred)
		goto err_flags;

	return shrinker;

err_flags:
	shrinker_debugfs_name_free(shrinker);
err_name:
	kfree(shrinker);
	return NULL;
}
EXPORT_SYMBOL_GPL(shrinker_alloc);
748
shrinker_register(struct shrinker * shrinker)749 void shrinker_register(struct shrinker *shrinker)
750 {
751 if (unlikely(!(shrinker->flags & SHRINKER_ALLOCATED))) {
752 pr_warn("Must use shrinker_alloc() to dynamically allocate the shrinker");
753 return;
754 }
755
756 mutex_lock(&shrinker_mutex);
757 list_add_tail_rcu(&shrinker->list, &shrinker_list);
758 shrinker->flags |= SHRINKER_REGISTERED;
759 shrinker_debugfs_add(shrinker);
760 mutex_unlock(&shrinker_mutex);
761
762 init_completion(&shrinker->done);
763 /*
764 * Now the shrinker is fully set up, take the first reference to it to
765 * indicate that lookup operations are now allowed to use it via
766 * shrinker_try_get().
767 */
768 refcount_set(&shrinker->refcount, 1);
769 }
770 EXPORT_SYMBOL_GPL(shrinker_register);
771
/*
 * RCU callback for shrinker_free(): actually releases the shrinker and
 * its per-node nr_deferred array after the grace period expires.
 */
static void shrinker_free_rcu_cb(struct rcu_head *head)
{
	struct shrinker *shrinker = container_of(head, struct shrinker, rcu);

	kfree(shrinker->nr_deferred);
	kfree(shrinker);
}
779
/*
 * shrinker_free - unregister (if registered) and free a shrinker.
 *
 * Drops the initial reference taken by shrinker_register(), waits until
 * every in-flight lookup has released its reference, then removes the
 * shrinker from all lookup structures and frees it after an RCU grace
 * period. Safe to call on a never-registered shrinker. NULL is a no-op.
 */
void shrinker_free(struct shrinker *shrinker)
{
	struct dentry *debugfs_entry = NULL;
	int debugfs_id;

	if (!shrinker)
		return;

	if (shrinker->flags & SHRINKER_REGISTERED) {
		/* drop the initial refcount */
		shrinker_put(shrinker);
		/*
		 * Wait for all lookups of the shrinker to complete, after that,
		 * no shrinker is running or will run again, then we can safely
		 * free it asynchronously via RCU and safely free the structure
		 * where the shrinker is located, such as super_block etc.
		 */
		wait_for_completion(&shrinker->done);
	}

	mutex_lock(&shrinker_mutex);
	if (shrinker->flags & SHRINKER_REGISTERED) {
		/*
		 * Now we can safely remove it from the shrinker_list and then
		 * free it.
		 */
		list_del_rcu(&shrinker->list);
		debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
		shrinker->flags &= ~SHRINKER_REGISTERED;
	}

	shrinker_debugfs_name_free(shrinker);

	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
		shrinker_memcg_remove(shrinker);
	mutex_unlock(&shrinker_mutex);

	/* debugfs removal may sleep; do it outside shrinker_mutex */
	if (debugfs_entry)
		shrinker_debugfs_remove(debugfs_entry, debugfs_id);

	/* readers may still hold the list pointer; free after grace period */
	call_rcu(&shrinker->rcu, shrinker_free_rcu_cb);
}
EXPORT_SYMBOL_GPL(shrinker_free);
823