• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  /* SPDX-License-Identifier: GPL-2.0
2   *
3   * page_pool.c
4   *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
5   *	Copyright (C) 2016 Red Hat, Inc.
6   */
7  
8  #include <linux/types.h>
9  #include <linux/kernel.h>
10  #include <linux/slab.h>
11  #include <linux/device.h>
12  
13  #include <net/page_pool.h>
14  #include <linux/dma-direction.h>
15  #include <linux/dma-mapping.h>
16  #include <linux/page-flags.h>
17  #include <linux/mm.h> /* for __put_page() */
18  
19  #include <trace/events/page_pool.h>
20  
21  #define DEFER_TIME (msecs_to_jiffies(1000))
22  #define DEFER_WARN_INTERVAL (60 * HZ)
23  
page_pool_init(struct page_pool * pool,const struct page_pool_params * params)24  static int page_pool_init(struct page_pool *pool,
25  			  const struct page_pool_params *params)
26  {
27  	unsigned int ring_qsize = 1024; /* Default */
28  
29  	memcpy(&pool->p, params, sizeof(pool->p));
30  
31  	/* Validate only known flags were used */
32  	if (pool->p.flags & ~(PP_FLAG_ALL))
33  		return -EINVAL;
34  
35  	if (pool->p.pool_size)
36  		ring_qsize = pool->p.pool_size;
37  
38  	/* Sanity limit mem that can be pinned down */
39  	if (ring_qsize > 32768)
40  		return -E2BIG;
41  
42  	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
43  	 * DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
44  	 * which is the XDP_TX use-case.
45  	 */
46  	if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
47  	    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
48  		return -EINVAL;
49  
50  	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
51  		return -ENOMEM;
52  
53  	atomic_set(&pool->pages_state_release_cnt, 0);
54  
55  	/* Driver calling page_pool_create() also call page_pool_destroy() */
56  	refcount_set(&pool->user_cnt, 1);
57  
58  	if (pool->p.flags & PP_FLAG_DMA_MAP)
59  		get_device(pool->p.dev);
60  
61  	return 0;
62  }
63  
page_pool_create(const struct page_pool_params * params)64  struct page_pool *page_pool_create(const struct page_pool_params *params)
65  {
66  	struct page_pool *pool;
67  	int err;
68  
69  	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
70  	if (!pool)
71  		return ERR_PTR(-ENOMEM);
72  
73  	err = page_pool_init(pool, params);
74  	if (err < 0) {
75  		pr_warn("%s() gave up with errno %d\n", __func__, err);
76  		kfree(pool);
77  		return ERR_PTR(err);
78  	}
79  
80  	return pool;
81  }
82  EXPORT_SYMBOL(page_pool_create);
83  
84  /* fast path */
/* Fetch a recycled page for the allocation fast-path.
 *
 * In softirq context the per-pool alloc cache is used lock-free;
 * otherwise (or when that cache is empty) fall back to the locked
 * ptr_ring, optionally bulk-refilling the alloc cache.
 *
 * Returns NULL when no recycled page is available.
 */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
	struct ptr_ring *r = &pool->ring;
	bool refill = false;
	struct page *page;

	/* Test for safe-context, caller should provide this guarantee */
	if (likely(in_serving_softirq())) {
		if (likely(pool->alloc.count)) {
			/* Fast-path */
			page = pool->alloc.cache[--pool->alloc.count];
			return page;
		}
		/* Cache empty but context is safe: refill it from the
		 * ring below while we hold the consumer lock anyway.
		 */
		refill = true;
	}

	/* Quicker fallback, avoid locks when ring is empty */
	if (__ptr_ring_empty(r))
		return NULL;

	/* Slow-path: Get page from locked ring queue,
	 * refill alloc array if requested.
	 */
	spin_lock(&r->consumer_lock);
	page = __ptr_ring_consume(r);
	if (refill)
		pool->alloc.count = __ptr_ring_consume_batched(r,
							pool->alloc.cache,
							PP_ALLOC_CACHE_REFILL);
	spin_unlock(&r->consumer_lock);
	return page;
}
117  
118  /* slow path */
/* Slow-path allocation: called when both the alloc cache and the
 * ptr_ring came up empty.  Allocates one page (or a higher-order
 * compound page) from the page allocator, optionally DMA-maps it,
 * and accounts it as in-flight.
 *
 * Returns NULL on allocation or DMA-mapping failure.
 */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
						 gfp_t _gfp)
{
	struct page *page;
	gfp_t gfp = _gfp;
	dma_addr_t dma;

	/* We could always set __GFP_COMP, and avoid this branch, as
	 * prep_new_page() can handle order-0 with __GFP_COMP.
	 */
	if (pool->p.order)
		gfp |= __GFP_COMP;

	/* FUTURE development:
	 *
	 * Current slow-path essentially falls back to single page
	 * allocations, which doesn't improve performance.  This code
	 * need bulk allocation support from the page allocator code.
	 */

	/* Cache was empty, do real allocation */
	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
	if (!page)
		return NULL;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		goto skip_dma_map;

	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
	 * into page private data (i.e 32bit cpu with 64bit DMA caps)
	 * This mapping is kept for lifetime of page, until leaving pool.
	 * SKIP_CPU_SYNC: the driver syncs only the region it touches.
	 */
	dma = dma_map_page_attrs(pool->p.dev, page, 0,
				 (PAGE_SIZE << pool->p.order),
				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
	if (dma_mapping_error(pool->p.dev, dma)) {
		put_page(page);
		return NULL;
	}
	page->dma_addr = dma;

skip_dma_map:
	/* Track how many pages are held 'in-flight' */
	pool->pages_state_hold_cnt++;

	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);

	/* A page just alloc'ed should/must have refcnt 1. */
	return page;
}
171  
172  /* For using page_pool replace: alloc_pages() API calls, but provide
173   * synchronization guarantee for allocation side.
174   */
page_pool_alloc_pages(struct page_pool * pool,gfp_t gfp)175  struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
176  {
177  	struct page *page;
178  
179  	/* Fast-path: Get a page from cache */
180  	page = __page_pool_get_cached(pool);
181  	if (page)
182  		return page;
183  
184  	/* Slow-path: cache empty, do real allocation */
185  	page = __page_pool_alloc_pages_slow(pool, gfp);
186  	return page;
187  }
188  EXPORT_SYMBOL(page_pool_alloc_pages);
189  
190  /* Calculate distance between two u32 values, valid if distance is below 2^(31)
191   *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
192   */
193  #define _distance(a, b)	(s32)((a) - (b))
194  
/* Count pages still outstanding: allocated (hold) minus returned
 * (release), using serial-number arithmetic so u32 wrap-around is
 * handled (valid while the distance stays below 2^31).
 *
 * NOTE(review): release_cnt is read before hold_cnt — this ordering
 * looks deliberate (avoids a spuriously negative distance from racy
 * reads); confirm before reordering.
 */
static s32 page_pool_inflight(struct page_pool *pool)
{
	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
	s32 inflight;

	inflight = _distance(hold_cnt, release_cnt);

	trace_page_pool_inflight(pool, inflight, hold_cnt, release_cnt);
	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);

	return inflight;
}
208  
209  /* Cleanup page_pool state from page */
/* Disconnect @page from @pool: undo the DMA mapping (when the pool
 * owns one) and bump the release counter.  After atomic_inc_return()
 * a concurrent shutdown may free the pool, so @pool must not be
 * dereferenced afterwards.
 */
static void __page_pool_clean_page(struct page_pool *pool,
				   struct page *page)
{
	dma_addr_t dma;
	int count;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		goto skip_dma_unmap;

	dma = page->dma_addr;
	/* DMA unmap */
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC);
	page->dma_addr = 0;
skip_dma_unmap:
	/* This may be the last page returned, releasing the pool, so
	 * it is not safe to reference pool afterwards.
	 */
	count = atomic_inc_return(&pool->pages_state_release_cnt);
	trace_page_pool_state_release(pool, page, count);
}
232  
233  /* unmap the page and clean our state */
/* Unmap @page and clean our pool state.
 *
 * An unmapped page will not be returned to the page_pool again; its
 * DMA mapping and in-flight accounting are dropped here.
 */
void page_pool_unmap_page(struct page_pool *pool, struct page *page)
{
	__page_pool_clean_page(pool, page);
}
EXPORT_SYMBOL(page_pool_unmap_page);
242  
243  /* Return a page to the page allocator, cleaning up our state */
/* Give a page back to the page allocator, after clearing pool state */
static void __page_pool_return_page(struct page_pool *pool, struct page *page)
{
	__page_pool_clean_page(pool, page);

	/* An optimization would be to call __free_pages(page, pool->p.order)
	 * knowing page is not part of page-cache (thus avoiding a
	 * __page_cache_release() call).
	 */
	put_page(page);
}
254  
__page_pool_recycle_into_ring(struct page_pool * pool,struct page * page)255  static bool __page_pool_recycle_into_ring(struct page_pool *pool,
256  				   struct page *page)
257  {
258  	int ret;
259  	/* BH protection not needed if current is serving softirq */
260  	if (in_serving_softirq())
261  		ret = ptr_ring_produce(&pool->ring, page);
262  	else
263  		ret = ptr_ring_produce_bh(&pool->ring, page);
264  
265  	return (ret == 0) ? true : false;
266  }
267  
268  /* Only allow direct recycling in special circumstances, into the
269   * alloc side cache.  E.g. during RX-NAPI processing for XDP_DROP use-case.
270   *
271   * Caller must provide appropriate safe context.
272   */
__page_pool_recycle_direct(struct page * page,struct page_pool * pool)273  static bool __page_pool_recycle_direct(struct page *page,
274  				       struct page_pool *pool)
275  {
276  	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
277  		return false;
278  
279  	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
280  	pool->alloc.cache[pool->alloc.count++] = page;
281  	return true;
282  }
283  
/* Return a page to the pool, or to the page allocator.
 *
 * With refcnt == 1 the page_pool still owns the page and recycles it:
 * first into the per-pool alloc cache (when @allow_direct and running
 * in softirq), then into the ptr_ring, finally freeing it when the
 * ring is full.  With an elevated refcnt the driver is doing its own
 * refcnt-based recycling, so only clean pool state and drop our ref.
 */
void __page_pool_put_page(struct page_pool *pool,
			  struct page *page, bool allow_direct)
{
	/* This allocator is optimized for the XDP mode that uses
	 * one-frame-per-page, but have fallbacks that act like the
	 * regular page allocator APIs.
	 *
	 * refcnt == 1 means page_pool owns page, and can recycle it.
	 */
	if (likely(page_ref_count(page) == 1)) {
		/* Read barrier done in page_ref_count / READ_ONCE */

		if (allow_direct && in_serving_softirq())
			if (__page_pool_recycle_direct(page, pool))
				return;

		if (!__page_pool_recycle_into_ring(pool, page)) {
			/* Cache full, fallback to free pages */
			__page_pool_return_page(pool, page);
		}
		return;
	}
	/* Fallback/non-XDP mode: API user have elevated refcnt.
	 *
	 * Many drivers split up the page into fragments, and some
	 * want to keep doing this to save memory and do refcnt based
	 * recycling. Support this use case too, to ease drivers
	 * switching between XDP/non-XDP.
	 *
	 * In-case page_pool maintains the DMA mapping, API user must
	 * call page_pool_put_page once.  In this elevated refcnt
	 * case, the DMA is unmapped/released, as driver is likely
	 * doing refcnt based recycle tricks, meaning another process
	 * will be invoking put_page.
	 */
	__page_pool_clean_page(pool, page);
	put_page(page);
}
EXPORT_SYMBOL(__page_pool_put_page);
323  
__page_pool_empty_ring(struct page_pool * pool)324  static void __page_pool_empty_ring(struct page_pool *pool)
325  {
326  	struct page *page;
327  
328  	/* Empty recycle ring */
329  	while ((page = ptr_ring_consume_bh(&pool->ring))) {
330  		/* Verify the refcnt invariant of cached pages */
331  		if (!(page_ref_count(page) == 1))
332  			pr_crit("%s() page_pool refcnt %d violation\n",
333  				__func__, page_ref_count(page));
334  
335  		__page_pool_return_page(pool, page);
336  	}
337  }
338  
page_pool_free(struct page_pool * pool)339  static void page_pool_free(struct page_pool *pool)
340  {
341  	if (pool->disconnect)
342  		pool->disconnect(pool);
343  
344  	ptr_ring_cleanup(&pool->ring, NULL);
345  
346  	if (pool->p.flags & PP_FLAG_DMA_MAP)
347  		put_device(pool->p.dev);
348  
349  	kfree(pool);
350  }
351  
page_pool_scrub(struct page_pool * pool)352  static void page_pool_scrub(struct page_pool *pool)
353  {
354  	struct page *page;
355  
356  	/* Empty alloc cache, assume caller made sure this is
357  	 * no-longer in use, and page_pool_alloc_pages() cannot be
358  	 * call concurrently.
359  	 */
360  	while (pool->alloc.count) {
361  		page = pool->alloc.cache[--pool->alloc.count];
362  		__page_pool_return_page(pool, page);
363  	}
364  
365  	/* No more consumers should exist, but producers could still
366  	 * be in-flight.
367  	 */
368  	__page_pool_empty_ring(pool);
369  }
370  
/* Scrub all caches, then free the pool if no pages remain in-flight.
 * Returns the number of pages still outstanding (0 = pool freed).
 */
static int page_pool_release(struct page_pool *pool)
{
	int inflight;

	page_pool_scrub(pool);

	inflight = page_pool_inflight(pool);
	if (inflight == 0)
		page_pool_free(pool);

	return inflight;
}
382  
page_pool_release_retry(struct work_struct * wq)383  static void page_pool_release_retry(struct work_struct *wq)
384  {
385  	struct delayed_work *dwq = to_delayed_work(wq);
386  	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
387  	int inflight;
388  
389  	inflight = page_pool_release(pool);
390  	if (!inflight)
391  		return;
392  
393  	/* Periodic warning */
394  	if (time_after_eq(jiffies, pool->defer_warn)) {
395  		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
396  
397  		pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
398  			__func__, inflight, sec);
399  		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
400  	}
401  
402  	/* Still not ready to be disconnected, retry later */
403  	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
404  }
405  
/* Bind the pool to an XDP memory model: take an extra user reference
 * (released via the disconnect path) and register the @disconnect
 * callback invoked from page_pool_free().
 */
void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *))
{
	refcount_inc(&pool->user_cnt);
	pool->disconnect = disconnect;
}
411  
/* Drop a user reference on @pool and, when this was the last user,
 * release it.  If pages are still in-flight, defer the final free to
 * a self-rescheduling delayed work (page_pool_release_retry).
 * NULL is tolerated as a no-op.
 */
void page_pool_destroy(struct page_pool *pool)
{
	if (!pool)
		return;

	/* Not the last user reference: nothing more to do here */
	if (!page_pool_put(pool))
		return;

	/* No pages in-flight: pool already freed by page_pool_release() */
	if (!page_pool_release(pool))
		return;

	pool->defer_start = jiffies;
	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;

	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}
EXPORT_SYMBOL(page_pool_destroy);
430