// SPDX-License-Identifier: GPL-2.0
/*
 * Lockless hierarchical page accounting & limiting
 *
 * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
 */

#include <linux/page_counter.h>
#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/bug.h>
#include <asm/page.h>
#include <trace/hooks/cgroup.h>

static void propagate_protected_usage(struct page_counter *c,
				      unsigned long usage)
{
	unsigned long protected, old_protected;
	unsigned long low, min;
	long delta;

	if (!c->parent)
		return;

	min = READ_ONCE(c->min);
	if (min || atomic_long_read(&c->min_usage)) {
		protected = min(usage, min);
		old_protected = atomic_long_xchg(&c->min_usage, protected);
		delta = protected - old_protected;
		if (delta)
			atomic_long_add(delta, &c->parent->children_min_usage);
	}

	low = READ_ONCE(c->low);
	if (low || atomic_long_read(&c->low_usage)) {
		protected = min(usage, low);
		old_protected = atomic_long_xchg(&c->low_usage, protected);
		delta = protected - old_protected;
		if (delta)
			atomic_long_add(delta, &c->parent->children_low_usage);
	}
}
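
/*
 * Illustrative worked example (not part of the original source): with
 * c->min == 100 pages and usage rising from 20 to 80 pages, the protected
 * amount moves from min(20, 100) == 20 to min(80, 100) == 80, so a delta
 * of 60 is added to the parent's children_min_usage. The c->low branch
 * works the same way for the best-effort protection.
 */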

/**
 * page_counter_cancel - take pages out of the local counter
 * @counter: counter
 * @nr_pages: number of pages to cancel
 */
void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
{
	long new;

	new = atomic_long_sub_return(nr_pages, &counter->usage);
	/* More uncharges than charges? */
	if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n",
		      new, nr_pages)) {
		new = 0;
		atomic_long_set(&counter->usage, new);
	}
	propagate_protected_usage(counter, new);
}

/**
 * page_counter_charge - hierarchically charge pages
 * @counter: counter
 * @nr_pages: number of pages to charge
 *
 * NOTE: This does not consider any configured counter limits.
 */
void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *c;

	for (c = counter; c; c = c->parent) {
		long new;

		new = atomic_long_add_return(nr_pages, &c->usage);
		propagate_protected_usage(c, new);
		/*
		 * This is indeed racy, but we can live with some
		 * inaccuracy in the watermark.
		 */
		if (new > READ_ONCE(c->watermark))
			WRITE_ONCE(c->watermark, new);
		trace_android_rvh_update_watermark(new, c);
	}
}
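
/*
 * Illustrative (hypothetical) caller sketch, not from the original file:
 * page_counter_charge() is the unconditional variant, so usage can end up
 * above the configured limit. Callers that must respect the limit use
 * page_counter_try_charge() instead:
 *
 *	page_counter_charge(pc, nr_pages);	(always succeeds)
 *	...
 *	page_counter_uncharge(pc, nr_pages);	(undo when done)
 */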

/**
 * page_counter_try_charge - try to hierarchically charge pages
 * @counter: counter
 * @nr_pages: number of pages to charge
 * @fail: points first counter to hit its limit, if any
 *
 * Returns %true on success, or %false and @fail if the counter or one
 * of its ancestors has hit its configured limit.
 */
bool page_counter_try_charge(struct page_counter *counter,
			     unsigned long nr_pages,
			     struct page_counter **fail)
{
	struct page_counter *c;

	for (c = counter; c; c = c->parent) {
		long new;
		/*
		 * Charge speculatively to avoid an expensive CAS. If
		 * a bigger charge fails, it might falsely lock out a
		 * racing smaller charge and send it into reclaim
		 * early, but the error is limited to the difference
		 * between the two sizes, which is less than 2M/4M in
		 * case of a THP locking out a regular page charge.
		 *
		 * The atomic_long_add_return() implies a full memory
		 * barrier between incrementing the count and reading
		 * the limit. When racing with page_counter_set_max(),
		 * we either see the new limit or the setter sees the
		 * counter has changed and retries.
		 */
		new = atomic_long_add_return(nr_pages, &c->usage);
		if (new > c->max) {
			atomic_long_sub(nr_pages, &c->usage);
			propagate_protected_usage(c, new);
			/*
			 * This is racy, but we can live with some
			 * inaccuracy in the failcnt which is only used
			 * to report stats.
			 */
			data_race(c->failcnt++);
			*fail = c;
			goto failed;
		}
		propagate_protected_usage(c, new);
		/*
		 * Just like with failcnt, we can live with some
		 * inaccuracy in the watermark.
		 */
		if (new > READ_ONCE(c->watermark))
			WRITE_ONCE(c->watermark, new);
		trace_android_rvh_update_watermark(new, c);
	}
	return true;

failed:
	for (c = counter; c != *fail; c = c->parent)
		page_counter_cancel(c, nr_pages);

	return false;
}
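
/*
 * Illustrative (hypothetical) usage sketch, not part of the original file:
 *
 *	struct page_counter *fail;
 *
 *	if (!page_counter_try_charge(pc, nr_pages, &fail)) {
 *		'fail' is the first ancestor that hit its limit;
 *		reclaim against it or return -ENOMEM here
 *	}
 *	...
 *	page_counter_uncharge(pc, nr_pages);
 *
 * On failure the speculative charge has already been backed out of every
 * level, so the caller must not uncharge.
 */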

/**
 * page_counter_uncharge - hierarchically uncharge pages
 * @counter: counter
 * @nr_pages: number of pages to uncharge
 */
void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *c;

	for (c = counter; c; c = c->parent)
		page_counter_cancel(c, nr_pages);
}

/**
 * page_counter_set_max - set the maximum number of pages allowed
 * @counter: counter
 * @nr_pages: limit to set
 *
 * Returns 0 on success, -EBUSY if the current number of pages on the
 * counter already exceeds the specified limit.
 *
 * The caller must serialize invocations on the same counter.
 */
int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
{
	for (;;) {
		unsigned long old;
		long usage;

		/*
		 * Update the limit while making sure that it's not
		 * below the concurrently-changing counter value.
		 *
		 * The xchg implies two full memory barriers before
		 * and after, so the read-swap-read is ordered and
		 * ensures coherency with page_counter_try_charge():
		 * that function modifies the count before checking
		 * the limit, so if it sees the old limit, we see the
		 * modified counter and retry.
		 */
		usage = page_counter_read(counter);

		if (usage > nr_pages)
			return -EBUSY;

		old = xchg(&counter->max, nr_pages);

		if (page_counter_read(counter) <= usage)
			return 0;

		counter->max = old;
		cond_resched();
	}
}
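
/*
 * Illustrative (hypothetical) caller sketch, not from the original file:
 * a limit write typically parses the user string first and then handles
 * -EBUSY from the setter:
 *
 *	unsigned long nr_pages;
 *
 *	if (page_counter_memparse(buf, "max", &nr_pages))
 *		return -EINVAL;
 *	return page_counter_set_max(pc, nr_pages);	(0 or -EBUSY)
 *
 * The caller is responsible for serializing set_max() calls on 'pc'.
 */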

/**
 * page_counter_set_min - set the amount of protected memory
 * @counter: counter
 * @nr_pages: value to set
 *
 * The caller must serialize invocations on the same counter.
 */
void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *c;

	WRITE_ONCE(counter->min, nr_pages);

	for (c = counter; c; c = c->parent)
		propagate_protected_usage(c, atomic_long_read(&c->usage));
}

/**
 * page_counter_set_low - set the amount of protected memory
 * @counter: counter
 * @nr_pages: value to set
 *
 * The caller must serialize invocations on the same counter.
 */
void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *c;

	WRITE_ONCE(counter->low, nr_pages);

	for (c = counter; c; c = c->parent)
		propagate_protected_usage(c, atomic_long_read(&c->usage));
}

/**
 * page_counter_memparse - memparse() for page counter limits
 * @buf: string to parse
 * @max: string meaning maximum possible value
 * @nr_pages: returns the result in number of pages
 *
 * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be
 * limited to %PAGE_COUNTER_MAX.
 */
int page_counter_memparse(const char *buf, const char *max,
			  unsigned long *nr_pages)
{
	char *end;
	u64 bytes;

	if (!strcmp(buf, max)) {
		*nr_pages = PAGE_COUNTER_MAX;
		return 0;
	}

	bytes = memparse(buf, &end);
	if (*end != '\0')
		return -EINVAL;

	*nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);

	return 0;
}
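
/*
 * Illustrative examples (assuming a 4K PAGE_SIZE; not part of the original
 * file): page_counter_memparse("100M", "max", &n) stores 25600 pages in n,
 * page_counter_memparse("max", "max", &n) stores PAGE_COUNTER_MAX, and
 * "100X" is rejected with -EINVAL because of the trailing garbage.
 */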