/* flow.c: Generic flow cache.
 *
 * Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru)
 * Copyright (C) 2003 David S. Miller (davem@redhat.com)
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/jhash.h>
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/completion.h>
#include <linux/percpu.h>
#include <linux/bitops.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/mutex.h>
#include <net/flow.h>
#include <linux/atomic.h>
#include <linux/security.h>
#include <net/net_namespace.h>

struct flow_cache_entry {
	union {
		struct hlist_node	hlist;
		struct list_head	gc_list;
	} u;
	struct net			*net;
	u16				family;
	u8				dir;
	u32				genid;
	struct flowi			key;
	struct flow_cache_object	*object;
};

struct flow_flush_info {
	struct flow_cache		*cache;
	atomic_t			cpuleft;
	struct completion		completion;
};

static struct kmem_cache *flow_cachep __read_mostly;

#define flow_cache_hash_size(cache)	(1 << (cache)->hash_shift)
#define FLOW_HASH_RND_PERIOD		(10 * 60 * HZ)

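/* Timer callback: ask every per-cpu cache to pick a fresh hash seed on its
 * next lookup, then re-arm the timer for the next period.
 */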
static void flow_cache_new_hashrnd(unsigned long arg)
{
	struct flow_cache *fc = (void *) arg;
	int i;

	for_each_possible_cpu(i)
		per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1;

	fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
	add_timer(&fc->rnd_timer);
}

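/* An entry is only valid while its genid matches the global generation
 * counter and its cached object (if any) still passes ops->check().
 */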
static int flow_entry_valid(struct flow_cache_entry *fle,
				struct netns_xfrm *xfrm)
{
	if (atomic_read(&xfrm->flow_cache_genid) != fle->genid)
		return 0;
	if (fle->object && !fle->object->ops->check(fle->object))
		return 0;
	return 1;
}

static void flow_entry_kill(struct flow_cache_entry *fle,
				struct netns_xfrm *xfrm)
{
	if (fle->object)
		fle->object->ops->delete(fle->object);
	kmem_cache_free(flow_cachep, fle);
}

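/* Deferred reclaim: splice the pending entries off the per-netns GC list
 * under the lock, then kill them outside of it.
 */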
static void flow_cache_gc_task(struct work_struct *work)
{
	struct list_head gc_list;
	struct flow_cache_entry *fce, *n;
	struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
						flow_cache_gc_work);

	INIT_LIST_HEAD(&gc_list);
	spin_lock_bh(&xfrm->flow_cache_gc_lock);
	list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list);
	spin_unlock_bh(&xfrm->flow_cache_gc_lock);

	list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
		flow_entry_kill(fce, xfrm);
}

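/* Hand a batch of unhashed entries over to the GC work item and adjust the
 * per-cpu entry count accordingly.
 */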
static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
				     int deleted, struct list_head *gc_list,
				     struct netns_xfrm *xfrm)
{
	if (deleted) {
		fcp->hash_count -= deleted;
		spin_lock_bh(&xfrm->flow_cache_gc_lock);
		list_splice_tail(gc_list, &xfrm->flow_cache_gc_list);
		spin_unlock_bh(&xfrm->flow_cache_gc_lock);
		schedule_work(&xfrm->flow_cache_gc_work);
	}
}

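/* Walk every hash chain on this cpu and keep at most shrink_to still-valid
 * entries per bucket; everything else is queued for garbage collection.
 * Called with shrink_to == 0 to empty the cache entirely.
 */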
static void __flow_cache_shrink(struct flow_cache *fc,
				struct flow_cache_percpu *fcp,
				int shrink_to)
{
	struct flow_cache_entry *fle;
	struct hlist_node *tmp;
	LIST_HEAD(gc_list);
	int i, deleted = 0;
	struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
						flow_cache_global);

	for (i = 0; i < flow_cache_hash_size(fc); i++) {
		int saved = 0;

		hlist_for_each_entry_safe(fle, tmp,
					  &fcp->hash_table[i], u.hlist) {
			if (saved < shrink_to &&
			    flow_entry_valid(fle, xfrm)) {
				saved++;
			} else {
				deleted++;
				hlist_del(&fle->u.hlist);
				list_add_tail(&fle->u.gc_list, &gc_list);
			}
		}
	}

	flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
}

static void flow_cache_shrink(struct flow_cache *fc,
			      struct flow_cache_percpu *fcp)
{
	int shrink_to = fc->low_watermark / flow_cache_hash_size(fc);

	__flow_cache_shrink(fc, fcp, shrink_to);
}

static void flow_new_hash_rnd(struct flow_cache *fc,
			      struct flow_cache_percpu *fcp)
{
	get_random_bytes(&fcp->hash_rnd, sizeof(u32));
	fcp->hash_rnd_recalc = 0;
	__flow_cache_shrink(fc, fcp, 0);
}

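/* Hash the raw key words with the per-cpu random seed and mask the result
 * down to the hash table size.
 */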
static u32 flow_hash_code(struct flow_cache *fc,
			  struct flow_cache_percpu *fcp,
			  const struct flowi *key,
			  size_t keysize)
{
	const u32 *k = (const u32 *) key;
	const u32 length = keysize * sizeof(flow_compare_t) / sizeof(u32);

	return jhash2(k, length, fcp->hash_rnd)
		& (flow_cache_hash_size(fc) - 1);
}

/* I hear what you're saying, use memcmp.  But memcmp cannot make
 * important assumptions that we can here, such as alignment.
 */
static int flow_key_compare(const struct flowi *key1, const struct flowi *key2,
			    size_t keysize)
{
	const flow_compare_t *k1, *k1_lim, *k2;

	k1 = (const flow_compare_t *) key1;
	k1_lim = k1 + keysize;

	k2 = (const flow_compare_t *) key2;

	do {
		if (*k1++ != *k2++)
			return 1;
	} while (k1 < k1_lim);

	return 0;
}

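/* Look up (or create) the cache entry for @key on the local cpu.  A fresh
 * entry whose genid matches the current generation returns its cached object
 * directly; a miss or a stale hit falls through to the resolver, whose result
 * is cached on success.  Runs with bottom halves disabled so the per-cpu
 * state stays consistent.
 */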
struct flow_cache_object *
flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
		  flow_resolve_t resolver, void *ctx)
{
	struct flow_cache *fc = &net->xfrm.flow_cache_global;
	struct flow_cache_percpu *fcp;
	struct flow_cache_entry *fle, *tfle;
	struct flow_cache_object *flo;
	size_t keysize;
	unsigned int hash;

	local_bh_disable();
	fcp = this_cpu_ptr(fc->percpu);

	fle = NULL;
	flo = NULL;

	keysize = flow_key_size(family);
	if (!keysize)
		goto nocache;

	/* Packet really early in init?  Making flow_cache_init a
	 * pre-smp initcall would solve this.  --RR */
	if (!fcp->hash_table)
		goto nocache;

	if (fcp->hash_rnd_recalc)
		flow_new_hash_rnd(fc, fcp);

	hash = flow_hash_code(fc, fcp, key, keysize);
	hlist_for_each_entry(tfle, &fcp->hash_table[hash], u.hlist) {
		if (tfle->net == net &&
		    tfle->family == family &&
		    tfle->dir == dir &&
		    flow_key_compare(key, &tfle->key, keysize) == 0) {
			fle = tfle;
			break;
		}
	}

	if (unlikely(!fle)) {
		if (fcp->hash_count > fc->high_watermark)
			flow_cache_shrink(fc, fcp);

		fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
		if (fle) {
			fle->net = net;
			fle->family = family;
			fle->dir = dir;
			memcpy(&fle->key, key, keysize * sizeof(flow_compare_t));
			fle->object = NULL;
			hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
			fcp->hash_count++;
		}
	} else if (likely(fle->genid == atomic_read(&net->xfrm.flow_cache_genid))) {
		flo = fle->object;
		if (!flo)
			goto ret_object;
		flo = flo->ops->get(flo);
		if (flo)
			goto ret_object;
	} else if (fle->object) {
		flo = fle->object;
		flo->ops->delete(flo);
		fle->object = NULL;
	}

nocache:
	flo = NULL;
	if (fle) {
		flo = fle->object;
		fle->object = NULL;
	}
	flo = resolver(net, key, family, dir, flo, ctx);
	if (fle) {
		fle->genid = atomic_read(&net->xfrm.flow_cache_genid);
		if (!IS_ERR(flo))
			fle->object = flo;
		else
			fle->genid--;
	} else {
		if (!IS_ERR_OR_NULL(flo))
			flo->ops->delete(flo);
	}
ret_object:
	local_bh_enable();
	return flo;
}
EXPORT_SYMBOL(flow_cache_lookup);

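/* Per-cpu flush: drop every entry on this cpu that is no longer valid, and
 * signal the waiter once the last cpu has finished.
 */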
static void flow_cache_flush_tasklet(unsigned long data)
{
	struct flow_flush_info *info = (void *)data;
	struct flow_cache *fc = info->cache;
	struct flow_cache_percpu *fcp;
	struct flow_cache_entry *fle;
	struct hlist_node *tmp;
	LIST_HEAD(gc_list);
	int i, deleted = 0;
	struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
						flow_cache_global);

	fcp = this_cpu_ptr(fc->percpu);
	for (i = 0; i < flow_cache_hash_size(fc); i++) {
		hlist_for_each_entry_safe(fle, tmp,
					  &fcp->hash_table[i], u.hlist) {
			if (flow_entry_valid(fle, xfrm))
				continue;

			deleted++;
			hlist_del(&fle->u.hlist);
			list_add_tail(&fle->u.gc_list, &gc_list);
		}
	}

	flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);

	if (atomic_dec_and_test(&info->cpuleft))
		complete(&info->completion);
}

/*
 * Return whether a cpu needs flushing.  Conservatively, we assume
 * the presence of any entries means the core may require flushing,
 * since the flow_cache_ops.check() function may assume it's running
 * on the same core as the per-cpu cache component.
 */
static int flow_cache_percpu_empty(struct flow_cache *fc, int cpu)
{
	struct flow_cache_percpu *fcp;
	int i;

	fcp = per_cpu_ptr(fc->percpu, cpu);
	for (i = 0; i < flow_cache_hash_size(fc); i++)
		if (!hlist_empty(&fcp->hash_table[i]))
			return 0;
	return 1;
}

static void flow_cache_flush_per_cpu(void *data)
{
	struct flow_flush_info *info = data;
	struct tasklet_struct *tasklet;

	tasklet = &this_cpu_ptr(info->cache->percpu)->flush_tasklet;
	tasklet->data = (unsigned long)info;
	tasklet_schedule(tasklet);
}

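/* Synchronously flush the flow cache for @net: schedule the flush tasklet on
 * every cpu that holds entries and wait until they have all completed.
 */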
void flow_cache_flush(struct net *net)
{
	struct flow_flush_info info;
	cpumask_var_t mask;
	int i, self;

	/* Track which cpus need flushing to avoid disturbing all cores. */
	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return;
	cpumask_clear(mask);

	/* Don't want cpus going down or up during this. */
	get_online_cpus();
	mutex_lock(&net->xfrm.flow_flush_sem);
	info.cache = &net->xfrm.flow_cache_global;
	for_each_online_cpu(i)
		if (!flow_cache_percpu_empty(info.cache, i))
			cpumask_set_cpu(i, mask);
	atomic_set(&info.cpuleft, cpumask_weight(mask));
	if (atomic_read(&info.cpuleft) == 0)
		goto done;

	init_completion(&info.completion);

	local_bh_disable();
	self = cpumask_test_and_clear_cpu(smp_processor_id(), mask);
	on_each_cpu_mask(mask, flow_cache_flush_per_cpu, &info, 0);
	if (self)
		flow_cache_flush_tasklet((unsigned long)&info);
	local_bh_enable();

	wait_for_completion(&info.completion);

done:
	mutex_unlock(&net->xfrm.flow_flush_sem);
	put_online_cpus();
	free_cpumask_var(mask);
}

static void flow_cache_flush_task(struct work_struct *work)
{
	struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
						flow_cache_flush_work);
	struct net *net = container_of(xfrm, struct net, xfrm);

	flow_cache_flush(net);
}

void flow_cache_flush_deferred(struct net *net)
{
	schedule_work(&net->xfrm.flow_cache_flush_work);
}

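/* Allocate and initialise the hash table for one cpu's cache, if it does not
 * exist yet.
 */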
static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
{
	struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
	size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc);

	if (!fcp->hash_table) {
		fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
		if (!fcp->hash_table) {
			pr_err("NET: failed to allocate flow cache sz %zu\n", sz);
			return -ENOMEM;
		}
		fcp->hash_rnd_recalc = 1;
		fcp->hash_count = 0;
		tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
	}
	return 0;
}

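/* CPU hotplug notifier: set up the per-cpu hash table before a cpu comes up
 * and drop all of its entries once it is dead.
 */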
static int flow_cache_cpu(struct notifier_block *nfb,
			  unsigned long action,
			  void *hcpu)
{
	struct flow_cache *fc = container_of(nfb, struct flow_cache,
						hotcpu_notifier);
	int res, cpu = (unsigned long) hcpu;
	struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		res = flow_cache_cpu_prepare(fc, cpu);
		if (res)
			return notifier_from_errno(res);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		__flow_cache_shrink(fc, fcp, 0);
		break;
	}
	return NOTIFY_OK;
}

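/* Per-namespace setup: create the entry slab (once), the per-cpu hash tables,
 * the GC and flush work items, the hotplug notifier and the hash seed timer.
 */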
int flow_cache_init(struct net *net)
{
	int i;
	struct flow_cache *fc = &net->xfrm.flow_cache_global;

	if (!flow_cachep)
		flow_cachep = kmem_cache_create("flow_cache",
						sizeof(struct flow_cache_entry),
						0, SLAB_PANIC, NULL);
	spin_lock_init(&net->xfrm.flow_cache_gc_lock);
	INIT_LIST_HEAD(&net->xfrm.flow_cache_gc_list);
	INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task);
	INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task);
	mutex_init(&net->xfrm.flow_flush_sem);

	fc->hash_shift = 10;
	fc->low_watermark = 2 * flow_cache_hash_size(fc);
	fc->high_watermark = 4 * flow_cache_hash_size(fc);

	fc->percpu = alloc_percpu(struct flow_cache_percpu);
	if (!fc->percpu)
		return -ENOMEM;

	cpu_notifier_register_begin();

	for_each_online_cpu(i) {
		if (flow_cache_cpu_prepare(fc, i))
			goto err;
	}
	fc->hotcpu_notifier = (struct notifier_block){
		.notifier_call = flow_cache_cpu,
	};
	__register_hotcpu_notifier(&fc->hotcpu_notifier);

	cpu_notifier_register_done();

	setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
		    (unsigned long) fc);
	fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
	add_timer(&fc->rnd_timer);

	return 0;

err:
	for_each_possible_cpu(i) {
		struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
		kfree(fcp->hash_table);
		fcp->hash_table = NULL;
	}

	cpu_notifier_register_done();

	free_percpu(fc->percpu);
	fc->percpu = NULL;

	return -ENOMEM;
}
EXPORT_SYMBOL(flow_cache_init);

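/* Per-namespace teardown: stop the hash seed timer, unregister the hotplug
 * notifier and free every cpu's hash table along with the per-cpu state.
 */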
void flow_cache_fini(struct net *net)
{
	int i;
	struct flow_cache *fc = &net->xfrm.flow_cache_global;

	del_timer_sync(&fc->rnd_timer);
	unregister_hotcpu_notifier(&fc->hotcpu_notifier);

	for_each_possible_cpu(i) {
		struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
		kfree(fcp->hash_table);
		fcp->hash_table = NULL;
	}

	free_percpu(fc->percpu);
	fc->percpu = NULL;
}
EXPORT_SYMBOL(flow_cache_fini);