1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Connection state tracking for netfilter. This is separated from,
3 but required by, the NAT layer; it can also be used by an iptables
4 extension. */
5
6 /* (C) 1999-2001 Paul `Rusty' Russell
7 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
8 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
9 * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
10 */
11
12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13
14 #include <linux/types.h>
15 #include <linux/netfilter.h>
16 #include <linux/module.h>
17 #include <linux/sched.h>
18 #include <linux/skbuff.h>
19 #include <linux/proc_fs.h>
20 #include <linux/vmalloc.h>
21 #include <linux/stddef.h>
22 #include <linux/slab.h>
23 #include <linux/random.h>
24 #include <linux/jhash.h>
25 #include <linux/siphash.h>
26 #include <linux/err.h>
27 #include <linux/percpu.h>
28 #include <linux/moduleparam.h>
29 #include <linux/notifier.h>
30 #include <linux/kernel.h>
31 #include <linux/netdevice.h>
32 #include <linux/socket.h>
33 #include <linux/mm.h>
34 #include <linux/nsproxy.h>
35 #include <linux/rculist_nulls.h>
36
37 #include <net/netfilter/nf_conntrack.h>
38 #include <net/netfilter/nf_conntrack_l4proto.h>
39 #include <net/netfilter/nf_conntrack_expect.h>
40 #include <net/netfilter/nf_conntrack_helper.h>
41 #include <net/netfilter/nf_conntrack_seqadj.h>
42 #include <net/netfilter/nf_conntrack_core.h>
43 #include <net/netfilter/nf_conntrack_extend.h>
44 #include <net/netfilter/nf_conntrack_acct.h>
45 #include <net/netfilter/nf_conntrack_ecache.h>
46 #include <net/netfilter/nf_conntrack_zones.h>
47 #include <net/netfilter/nf_conntrack_timestamp.h>
48 #include <net/netfilter/nf_conntrack_timeout.h>
49 #include <net/netfilter/nf_conntrack_labels.h>
50 #include <net/netfilter/nf_conntrack_synproxy.h>
51 #include <net/netfilter/nf_nat.h>
52 #include <net/netfilter/nf_nat_helper.h>
53 #include <net/netns/hash.h>
54 #include <net/ip.h>
55
56 #include "nf_internals.h"
57
58 __cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
59 EXPORT_SYMBOL_GPL(nf_conntrack_locks);
60
61 __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
62 EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
63
64 struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
65 EXPORT_SYMBOL_GPL(nf_conntrack_hash);
66
67 struct conntrack_gc_work {
68 struct delayed_work dwork;
69 u32 next_bucket;
70 bool exiting;
71 bool early_drop;
72 };
73
74 static __read_mostly struct kmem_cache *nf_conntrack_cachep;
75 static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
76 static __read_mostly bool nf_conntrack_locks_all;
77
78 /* serialize hash resizes and nf_ct_iterate_cleanup */
79 static DEFINE_MUTEX(nf_conntrack_mutex);
80
81 #define GC_SCAN_INTERVAL (120u * HZ)
82 #define GC_SCAN_MAX_DURATION msecs_to_jiffies(10)
83
84 static struct conntrack_gc_work conntrack_gc_work;
85
86 void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
87 {
88 /* 1) Acquire the lock */
89 spin_lock(lock);
90
91 /* 2) read nf_conntrack_locks_all, with ACQUIRE semantics
92 * It pairs with the smp_store_release() in nf_conntrack_all_unlock()
93 */
94 if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false))
95 return;
96
97 /* fast path failed, unlock */
98 spin_unlock(lock);
99
100 /* Slow path 1) get global lock */
101 spin_lock(&nf_conntrack_locks_all_lock);
102
103 /* Slow path 2) get the lock we want */
104 spin_lock(lock);
105
106 /* Slow path 3) release the global lock */
107 spin_unlock(&nf_conntrack_locks_all_lock);
108 }
109 EXPORT_SYMBOL_GPL(nf_conntrack_lock);
110
111 static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
112 {
113 h1 %= CONNTRACK_LOCKS;
114 h2 %= CONNTRACK_LOCKS;
115 spin_unlock(&nf_conntrack_locks[h1]);
116 if (h1 != h2)
117 spin_unlock(&nf_conntrack_locks[h2]);
118 }
119
120 /* return true if we need to recompute hashes (in case hash table was resized) */
121 static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
122 unsigned int h2, unsigned int sequence)
123 {
124 h1 %= CONNTRACK_LOCKS;
125 h2 %= CONNTRACK_LOCKS;
126 if (h1 <= h2) {
127 nf_conntrack_lock(&nf_conntrack_locks[h1]);
128 if (h1 != h2)
129 spin_lock_nested(&nf_conntrack_locks[h2],
130 SINGLE_DEPTH_NESTING);
131 } else {
132 nf_conntrack_lock(&nf_conntrack_locks[h2]);
133 spin_lock_nested(&nf_conntrack_locks[h1],
134 SINGLE_DEPTH_NESTING);
135 }
136 if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
137 nf_conntrack_double_unlock(h1, h2);
138 return true;
139 }
140 return false;
141 }
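/* Typical usage sketch of the double-lock helpers (mirrors the pattern in
 * nf_ct_delete_from_lists() and nf_conntrack_hash_check_insert() below):
 *
 *	local_bh_disable();
 *	do {
 *		sequence = read_seqcount_begin(&nf_conntrack_generation);
 *		hash = hash_conntrack(net, &orig_tuple);
 *		reply_hash = hash_conntrack(net, &reply_tuple);
 *	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 *
 * If the table was resized between hash computation and lock acquisition,
 * nf_conntrack_double_lock() returns true and the hashes are recomputed.
 */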
142
143 static void nf_conntrack_all_lock(void)
144 __acquires(&nf_conntrack_locks_all_lock)
145 {
146 int i;
147
148 spin_lock(&nf_conntrack_locks_all_lock);
149
150 nf_conntrack_locks_all = true;
151
152 for (i = 0; i < CONNTRACK_LOCKS; i++) {
153 spin_lock(&nf_conntrack_locks[i]);
154
155 /* This spin_unlock provides the "release" to ensure that
156 * nf_conntrack_locks_all==true is visible to everyone that
157 * acquired spin_lock(&nf_conntrack_locks[]).
158 */
159 spin_unlock(&nf_conntrack_locks[i]);
160 }
161 }
162
163 static void nf_conntrack_all_unlock(void)
164 __releases(&nf_conntrack_locks_all_lock)
165 {
166 /* All prior stores must be complete before we clear
167 * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
168 * might observe the false value but not the entire
169 * critical section.
170 * It pairs with the smp_load_acquire() in nf_conntrack_lock()
171 */
172 smp_store_release(&nf_conntrack_locks_all, false);
173 spin_unlock(&nf_conntrack_locks_all_lock);
174 }
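/* Ordering summary for the global-lock handshake (a sketch only):
 * nf_conntrack_all_lock() sets nf_conntrack_locks_all and then takes and
 * drops every bucket lock, so any holder of a bucket lock either finishes
 * before the flag becomes visible or observes it via smp_load_acquire() in
 * nf_conntrack_lock() and falls back to the slow path that serializes on
 * nf_conntrack_locks_all_lock. nf_conntrack_all_unlock() publishes the end
 * of the global section with smp_store_release() before dropping the
 * global spinlock.
 */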
175
176 unsigned int nf_conntrack_htable_size __read_mostly;
177 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
178
179 unsigned int nf_conntrack_max __read_mostly;
180 EXPORT_SYMBOL_GPL(nf_conntrack_max);
181 seqcount_spinlock_t nf_conntrack_generation __read_mostly;
182 static unsigned int nf_conntrack_hash_rnd __read_mostly;
183
184 static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
185 const struct net *net)
186 {
187 unsigned int n;
188 u32 seed;
189
190 get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));
191
192 /* The direction must be ignored, so we hash everything up to the
193 * destination ports (which is a multiple of 4) and treat the last
194 * three bytes manually.
195 */
196 seed = nf_conntrack_hash_rnd ^ net_hash_mix(net);
197 n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
198 return jhash2((u32 *)tuple, n, seed ^
199 (((__force __u16)tuple->dst.u.all << 16) |
200 tuple->dst.protonum));
201 }
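/* Worked example of the "last three bytes" handling above: jhash2() covers
 * tuple->src (address, source port, l3num) and the destination address
 * (dst.u3), i.e. a whole number of u32 words. The remaining three bytes --
 * the 16-bit destination port (dst.u.all) and the 8-bit protocol number --
 * are folded into the seed instead, roughly:
 *
 *	seed ^= ((__force __u16)tuple->dst.u.all << 16) | tuple->dst.protonum;
 *
 * so tuples differing only in destination port or protocol still hash to
 * different values, while dst.dir is excluded from the hash entirely.
 */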
202
203 static u32 scale_hash(u32 hash)
204 {
205 return reciprocal_scale(hash, nf_conntrack_htable_size);
206 }
207
208 static u32 __hash_conntrack(const struct net *net,
209 const struct nf_conntrack_tuple *tuple,
210 unsigned int size)
211 {
212 return reciprocal_scale(hash_conntrack_raw(tuple, net), size);
213 }
214
215 static u32 hash_conntrack(const struct net *net,
216 const struct nf_conntrack_tuple *tuple)
217 {
218 return scale_hash(hash_conntrack_raw(tuple, net));
219 }
220
221 static bool nf_ct_get_tuple_ports(const struct sk_buff *skb,
222 unsigned int dataoff,
223 struct nf_conntrack_tuple *tuple)
224 { struct {
225 __be16 sport;
226 __be16 dport;
227 } _inet_hdr, *inet_hdr;
228
229 /* Actually only need first 4 bytes to get ports. */
230 inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr);
231 if (!inet_hdr)
232 return false;
233
234 tuple->src.u.udp.port = inet_hdr->sport;
235 tuple->dst.u.udp.port = inet_hdr->dport;
236 return true;
237 }
238
239 static bool
240 nf_ct_get_tuple(const struct sk_buff *skb,
241 unsigned int nhoff,
242 unsigned int dataoff,
243 u_int16_t l3num,
244 u_int8_t protonum,
245 struct net *net,
246 struct nf_conntrack_tuple *tuple)
247 {
248 unsigned int size;
249 const __be32 *ap;
250 __be32 _addrs[8];
251
252 memset(tuple, 0, sizeof(*tuple));
253
254 tuple->src.l3num = l3num;
255 switch (l3num) {
256 case NFPROTO_IPV4:
257 nhoff += offsetof(struct iphdr, saddr);
258 size = 2 * sizeof(__be32);
259 break;
260 case NFPROTO_IPV6:
261 nhoff += offsetof(struct ipv6hdr, saddr);
262 size = sizeof(_addrs);
263 break;
264 default:
265 return true;
266 }
267
268 ap = skb_header_pointer(skb, nhoff, size, _addrs);
269 if (!ap)
270 return false;
271
272 switch (l3num) {
273 case NFPROTO_IPV4:
274 tuple->src.u3.ip = ap[0];
275 tuple->dst.u3.ip = ap[1];
276 break;
277 case NFPROTO_IPV6:
278 memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
279 memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
280 break;
281 }
282
283 tuple->dst.protonum = protonum;
284 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
285
286 switch (protonum) {
287 #if IS_ENABLED(CONFIG_IPV6)
288 case IPPROTO_ICMPV6:
289 return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple);
290 #endif
291 case IPPROTO_ICMP:
292 return icmp_pkt_to_tuple(skb, dataoff, net, tuple);
293 #ifdef CONFIG_NF_CT_PROTO_GRE
294 case IPPROTO_GRE:
295 return gre_pkt_to_tuple(skb, dataoff, net, tuple);
296 #endif
297 case IPPROTO_TCP:
298 case IPPROTO_UDP: /* fallthrough */
299 return nf_ct_get_tuple_ports(skb, dataoff, tuple);
300 #ifdef CONFIG_NF_CT_PROTO_UDPLITE
301 case IPPROTO_UDPLITE:
302 return nf_ct_get_tuple_ports(skb, dataoff, tuple);
303 #endif
304 #ifdef CONFIG_NF_CT_PROTO_SCTP
305 case IPPROTO_SCTP:
306 return nf_ct_get_tuple_ports(skb, dataoff, tuple);
307 #endif
308 #ifdef CONFIG_NF_CT_PROTO_DCCP
309 case IPPROTO_DCCP:
310 return nf_ct_get_tuple_ports(skb, dataoff, tuple);
311 #endif
312 default:
313 break;
314 }
315
316 return true;
317 }
318
319 static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
320 u_int8_t *protonum)
321 {
322 int dataoff = -1;
323 const struct iphdr *iph;
324 struct iphdr _iph;
325
326 iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
327 if (!iph)
328 return -1;
329
330 /* Conntrack defragments packets, we might still see fragments
331 * inside ICMP packets though.
332 */
333 if (iph->frag_off & htons(IP_OFFSET))
334 return -1;
335
336 dataoff = nhoff + (iph->ihl << 2);
337 *protonum = iph->protocol;
338
339 /* Check bogus IP headers */
340 if (dataoff > skb->len) {
341 pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
342 nhoff, iph->ihl << 2, skb->len);
343 return -1;
344 }
345 return dataoff;
346 }
347
348 #if IS_ENABLED(CONFIG_IPV6)
349 static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
350 u8 *protonum)
351 {
352 int protoff = -1;
353 unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
354 __be16 frag_off;
355 u8 nexthdr;
356
357 if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
358 &nexthdr, sizeof(nexthdr)) != 0) {
359 pr_debug("can't get nexthdr\n");
360 return -1;
361 }
362 protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
363 /*
364 * (protoff == skb->len) means the packet has no data, just
365 * IPv6 and possibly extension headers, but it is tracked anyway
366 */
367 if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
368 pr_debug("can't find proto in pkt\n");
369 return -1;
370 }
371
372 *protonum = nexthdr;
373 return protoff;
374 }
375 #endif
376
377 static int get_l4proto(const struct sk_buff *skb,
378 unsigned int nhoff, u8 pf, u8 *l4num)
379 {
380 switch (pf) {
381 case NFPROTO_IPV4:
382 return ipv4_get_l4proto(skb, nhoff, l4num);
383 #if IS_ENABLED(CONFIG_IPV6)
384 case NFPROTO_IPV6:
385 return ipv6_get_l4proto(skb, nhoff, l4num);
386 #endif
387 default:
388 *l4num = 0;
389 break;
390 }
391 return -1;
392 }
393
394 bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
395 u_int16_t l3num,
396 struct net *net, struct nf_conntrack_tuple *tuple)
397 {
398 u8 protonum;
399 int protoff;
400
401 protoff = get_l4proto(skb, nhoff, l3num, &protonum);
402 if (protoff <= 0)
403 return false;
404
405 return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple);
406 }
407 EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);
408
409 bool
410 nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
411 const struct nf_conntrack_tuple *orig)
412 {
413 memset(inverse, 0, sizeof(*inverse));
414
415 inverse->src.l3num = orig->src.l3num;
416
417 switch (orig->src.l3num) {
418 case NFPROTO_IPV4:
419 inverse->src.u3.ip = orig->dst.u3.ip;
420 inverse->dst.u3.ip = orig->src.u3.ip;
421 break;
422 case NFPROTO_IPV6:
423 inverse->src.u3.in6 = orig->dst.u3.in6;
424 inverse->dst.u3.in6 = orig->src.u3.in6;
425 break;
426 default:
427 break;
428 }
429
430 inverse->dst.dir = !orig->dst.dir;
431
432 inverse->dst.protonum = orig->dst.protonum;
433
434 switch (orig->dst.protonum) {
435 case IPPROTO_ICMP:
436 return nf_conntrack_invert_icmp_tuple(inverse, orig);
437 #if IS_ENABLED(CONFIG_IPV6)
438 case IPPROTO_ICMPV6:
439 return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
440 #endif
441 }
442
443 inverse->src.u.all = orig->dst.u.all;
444 inverse->dst.u.all = orig->src.u.all;
445 return true;
446 }
447 EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
448
449 /* Generate an almost-unique pseudo-id for a given conntrack.
450 *
451 * intentionally doesn't re-use any of the seeds used for hash
452 * table location, we assume id gets exposed to userspace.
453 *
454 * Following nf_conn items do not change throughout lifetime
455 * of the nf_conn:
456 *
457 * 1. nf_conn address
458 * 2. nf_conn->master address (normally NULL)
459 * 3. the associated net namespace
460 * 4. the original direction tuple
461 */
462 u32 nf_ct_get_id(const struct nf_conn *ct)
463 {
464 static __read_mostly siphash_key_t ct_id_seed;
465 unsigned long a, b, c, d;
466
467 net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));
468
469 a = (unsigned long)ct;
470 b = (unsigned long)ct->master;
471 c = (unsigned long)nf_ct_net(ct);
472 d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
473 sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple),
474 &ct_id_seed);
475 #ifdef CONFIG_64BIT
476 return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed);
477 #else
478 return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed);
479 #endif
480 }
481 EXPORT_SYMBOL_GPL(nf_ct_get_id);
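/* Note on id stability (informal): for a live nf_conn the four inputs above
 * are constant, so nf_ct_get_id() is a pure function of the entry and can be
 * recomputed at any time, e.g.:
 *
 *	u32 id = nf_ct_get_id(ct);
 *	WARN_ON(id != nf_ct_get_id(ct));	// same live entry, same id
 *
 * Because ct_id_seed is randomized once per boot, ids are only meaningful
 * within a single boot and are not comparable across hosts.
 */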
482
483 static void
484 clean_from_lists(struct nf_conn *ct)
485 {
486 pr_debug("clean_from_lists(%p)\n", ct);
487 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
488 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
489
490 /* Destroy all pending expectations */
491 nf_ct_remove_expectations(ct);
492 }
493
494 /* must be called with local_bh_disable */
495 static void nf_ct_add_to_dying_list(struct nf_conn *ct)
496 {
497 struct ct_pcpu *pcpu;
498
499 /* add this conntrack to the (per cpu) dying list */
500 ct->cpu = smp_processor_id();
501 pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
502
503 spin_lock(&pcpu->lock);
504 hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
505 &pcpu->dying);
506 spin_unlock(&pcpu->lock);
507 }
508
509 /* must be called with local_bh_disable */
510 static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
511 {
512 struct ct_pcpu *pcpu;
513
514 /* add this conntrack to the (per cpu) unconfirmed list */
515 ct->cpu = smp_processor_id();
516 pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
517
518 spin_lock(&pcpu->lock);
519 hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
520 &pcpu->unconfirmed);
521 spin_unlock(&pcpu->lock);
522 }
523
524 /* must be called with local_bh_disable */
525 static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
526 {
527 struct ct_pcpu *pcpu;
528
529 /* We overload first tuple to link into unconfirmed or dying list.*/
530 pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
531
532 spin_lock(&pcpu->lock);
533 BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
534 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
535 spin_unlock(&pcpu->lock);
536 }
537
538 #define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)
539
540 /* Released via destroy_conntrack() */
541 struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
542 const struct nf_conntrack_zone *zone,
543 gfp_t flags)
544 {
545 struct nf_conn *tmpl, *p;
546
547 if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
548 tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
549 if (!tmpl)
550 return NULL;
551
552 p = tmpl;
553 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
554 if (tmpl != p) {
555 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
556 tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
557 }
558 } else {
559 tmpl = kzalloc(sizeof(*tmpl), flags);
560 if (!tmpl)
561 return NULL;
562 }
563
564 tmpl->status = IPS_TEMPLATE;
565 write_pnet(&tmpl->ct_net, net);
566 nf_ct_zone_add(tmpl, zone);
567 atomic_set(&tmpl->ct_general.use, 0);
568
569 return tmpl;
570 }
571 EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
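/* Alignment sketch for the template path above: skb->_nfct stores the
 * conntrack pointer with the low NFCT_INFOMASK bits reused for ctinfo, so a
 * template must be aligned to (NFCT_INFOMASK + 1). When kzalloc() cannot
 * guarantee that, the allocation is padded and the pointer rounded up:
 *
 *	p    = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
 *	tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
 *	tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
 *
 * nf_ct_tmpl_free() subtracts tmpl_padto again to recover the original
 * kzalloc() pointer before freeing.
 */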
572
573 void nf_ct_tmpl_free(struct nf_conn *tmpl)
574 {
575 nf_ct_ext_destroy(tmpl);
576
577 if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
578 kfree((char *)tmpl - tmpl->proto.tmpl_padto);
579 else
580 kfree(tmpl);
581 }
582 EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);
583
584 static void destroy_gre_conntrack(struct nf_conn *ct)
585 {
586 #ifdef CONFIG_NF_CT_PROTO_GRE
587 struct nf_conn *master = ct->master;
588
589 if (master)
590 nf_ct_gre_keymap_destroy(master);
591 #endif
592 }
593
594 static void
595 destroy_conntrack(struct nf_conntrack *nfct)
596 {
597 struct nf_conn *ct = (struct nf_conn *)nfct;
598
599 pr_debug("destroy_conntrack(%p)\n", ct);
600 WARN_ON(atomic_read(&nfct->use) != 0);
601
602 if (unlikely(nf_ct_is_template(ct))) {
603 nf_ct_tmpl_free(ct);
604 return;
605 }
606
607 if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
608 destroy_gre_conntrack(ct);
609
610 local_bh_disable();
611 /* Expectations will have been removed in clean_from_lists,
612 * except TFTP can create an expectation on the first packet,
613 * before connection is in the list, so we need to clean here,
614 * too.
615 */
616 nf_ct_remove_expectations(ct);
617
618 nf_ct_del_from_dying_or_unconfirmed_list(ct);
619
620 local_bh_enable();
621
622 if (ct->master)
623 nf_ct_put(ct->master);
624
625 pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
626 nf_conntrack_free(ct);
627 }
628
629 static void nf_ct_delete_from_lists(struct nf_conn *ct)
630 {
631 struct net *net = nf_ct_net(ct);
632 unsigned int hash, reply_hash;
633 unsigned int sequence;
634
635 nf_ct_helper_destroy(ct);
636
637 local_bh_disable();
638 do {
639 sequence = read_seqcount_begin(&nf_conntrack_generation);
640 hash = hash_conntrack(net,
641 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
642 reply_hash = hash_conntrack(net,
643 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
644 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
645
646 clean_from_lists(ct);
647 nf_conntrack_double_unlock(hash, reply_hash);
648
649 nf_ct_add_to_dying_list(ct);
650
651 local_bh_enable();
652 }
653
654 bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
655 {
656 struct nf_conn_tstamp *tstamp;
657
658 if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
659 return false;
660
661 tstamp = nf_conn_tstamp_find(ct);
662 if (tstamp) {
663 s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp;
664
665 tstamp->stop = ktime_get_real_ns();
666 if (timeout < 0)
667 tstamp->stop -= jiffies_to_nsecs(-timeout);
668 }
669
670 if (nf_conntrack_event_report(IPCT_DESTROY, ct,
671 portid, report) < 0) {
672 /* destroy event was not delivered. nf_ct_put will
673 * be done by event cache worker on redelivery.
674 */
675 nf_ct_delete_from_lists(ct);
676 nf_conntrack_ecache_delayed_work(nf_ct_net(ct));
677 return false;
678 }
679
680 nf_conntrack_ecache_work(nf_ct_net(ct));
681 nf_ct_delete_from_lists(ct);
682 nf_ct_put(ct);
683 return true;
684 }
685 EXPORT_SYMBOL_GPL(nf_ct_delete);
686
687 static inline bool
688 nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
689 const struct nf_conntrack_tuple *tuple,
690 const struct nf_conntrack_zone *zone,
691 const struct net *net)
692 {
693 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
694
695 /* A conntrack can be recreated with the equal tuple,
696 * so we need to check that the conntrack is confirmed
697 */
698 return nf_ct_tuple_equal(tuple, &h->tuple) &&
699 nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
700 nf_ct_is_confirmed(ct) &&
701 net_eq(net, nf_ct_net(ct));
702 }
703
704 static inline bool
705 nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
706 {
707 return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
708 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
709 nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
710 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
711 nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
712 nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
713 net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
714 }
715
716 /* caller must hold rcu readlock and none of the nf_conntrack_locks */
717 static void nf_ct_gc_expired(struct nf_conn *ct)
718 {
719 if (!atomic_inc_not_zero(&ct->ct_general.use))
720 return;
721
722 if (nf_ct_should_gc(ct))
723 nf_ct_kill(ct);
724
725 nf_ct_put(ct);
726 }
727
728 /*
729 * Warning:
730 * - Caller must take a reference on returned object
731 * and recheck nf_ct_tuple_equal(tuple, &h->tuple)
732 */
733 static struct nf_conntrack_tuple_hash *
734 ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
735 const struct nf_conntrack_tuple *tuple, u32 hash)
736 {
737 struct nf_conntrack_tuple_hash *h;
738 struct hlist_nulls_head *ct_hash;
739 struct hlist_nulls_node *n;
740 unsigned int bucket, hsize;
741
742 begin:
743 nf_conntrack_get_ht(&ct_hash, &hsize);
744 bucket = reciprocal_scale(hash, hsize);
745
746 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
747 struct nf_conn *ct;
748
749 ct = nf_ct_tuplehash_to_ctrack(h);
750 if (nf_ct_is_expired(ct)) {
751 nf_ct_gc_expired(ct);
752 continue;
753 }
754
755 if (nf_ct_key_equal(h, tuple, zone, net))
756 return h;
757 }
758 /*
759 * if the nulls value we got at the end of this lookup is
760 * not the expected one, we must restart lookup.
761 * We probably met an item that was moved to another chain.
762 */
763 if (get_nulls_value(n) != bucket) {
764 NF_CT_STAT_INC_ATOMIC(net, search_restart);
765 goto begin;
766 }
767
768 return NULL;
769 }
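/* Lookup restart sketch: the conntrack cache uses SLAB_TYPESAFE_BY_RCU, so
 * an entry seen under rcu_read_lock() may be freed and reused for another
 * tuple (possibly hashed into another bucket) while we walk the chain. The
 * hlist_nulls end marker encodes the bucket it terminates; if the marker we
 * reach does not match the bucket we started in, we walked onto a recycled
 * node and must restart:
 *
 *	if (get_nulls_value(n) != bucket)
 *		goto begin;
 *
 * Callers still have to take a reference and re-check the tuple, see
 * __nf_conntrack_find_get() below.
 */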
770
771 /* Find a connection corresponding to a tuple. */
772 static struct nf_conntrack_tuple_hash *
773 __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
774 const struct nf_conntrack_tuple *tuple, u32 hash)
775 {
776 struct nf_conntrack_tuple_hash *h;
777 struct nf_conn *ct;
778
779 rcu_read_lock();
780
781 h = ____nf_conntrack_find(net, zone, tuple, hash);
782 if (h) {
783 /* We have a candidate that matches the tuple we're interested
784 * in, try to obtain a reference and re-check tuple
785 */
786 ct = nf_ct_tuplehash_to_ctrack(h);
787 if (likely(atomic_inc_not_zero(&ct->ct_general.use))) {
788 if (likely(nf_ct_key_equal(h, tuple, zone, net)))
789 goto found;
790
791 /* TYPESAFE_BY_RCU recycled the candidate */
792 nf_ct_put(ct);
793 }
794
795 h = NULL;
796 }
797 found:
798 rcu_read_unlock();
799
800 return h;
801 }
802
803 struct nf_conntrack_tuple_hash *
804 nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
805 const struct nf_conntrack_tuple *tuple)
806 {
807 return __nf_conntrack_find_get(net, zone, tuple,
808 hash_conntrack_raw(tuple, net));
809 }
810 EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
811
812 static void __nf_conntrack_hash_insert(struct nf_conn *ct,
813 unsigned int hash,
814 unsigned int reply_hash)
815 {
816 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
817 &nf_conntrack_hash[hash]);
818 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
819 &nf_conntrack_hash[reply_hash]);
820 }
821
822 int
823 nf_conntrack_hash_check_insert(struct nf_conn *ct)
824 {
825 const struct nf_conntrack_zone *zone;
826 struct net *net = nf_ct_net(ct);
827 unsigned int hash, reply_hash;
828 struct nf_conntrack_tuple_hash *h;
829 struct hlist_nulls_node *n;
830 unsigned int sequence;
831
832 zone = nf_ct_zone(ct);
833
834 local_bh_disable();
835 do {
836 sequence = read_seqcount_begin(&nf_conntrack_generation);
837 hash = hash_conntrack(net,
838 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
839 reply_hash = hash_conntrack(net,
840 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
841 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
842
843 /* See if there's one in the list already, including reverse */
844 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
845 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
846 zone, net))
847 goto out;
848
849 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
850 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
851 zone, net))
852 goto out;
853
854 smp_wmb();
855 /* The caller holds a reference to this object */
856 atomic_set(&ct->ct_general.use, 2);
857 __nf_conntrack_hash_insert(ct, hash, reply_hash);
858 nf_conntrack_double_unlock(hash, reply_hash);
859 NF_CT_STAT_INC(net, insert);
860 local_bh_enable();
861 return 0;
862
863 out:
864 nf_conntrack_double_unlock(hash, reply_hash);
865 local_bh_enable();
866 return -EEXIST;
867 }
868 EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
869
870 void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets,
871 unsigned int bytes)
872 {
873 struct nf_conn_acct *acct;
874
875 acct = nf_conn_acct_find(ct);
876 if (acct) {
877 struct nf_conn_counter *counter = acct->counter;
878
879 atomic64_add(packets, &counter[dir].packets);
880 atomic64_add(bytes, &counter[dir].bytes);
881 }
882 }
883 EXPORT_SYMBOL_GPL(nf_ct_acct_add);
884
885 static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
886 const struct nf_conn *loser_ct)
887 {
888 struct nf_conn_acct *acct;
889
890 acct = nf_conn_acct_find(loser_ct);
891 if (acct) {
892 struct nf_conn_counter *counter = acct->counter;
893 unsigned int bytes;
894
895 /* u32 should be fine since we must have seen one packet. */
896 bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
897 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
898 }
899 }
900
901 static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
902 {
903 struct nf_conn_tstamp *tstamp;
904
905 atomic_inc(&ct->ct_general.use);
906 ct->status |= IPS_CONFIRMED;
907
908 /* set conntrack timestamp, if enabled. */
909 tstamp = nf_conn_tstamp_find(ct);
910 if (tstamp)
911 tstamp->start = ktime_get_real_ns();
912 }
913
914 /* caller must hold locks to prevent concurrent changes */
915 static int __nf_ct_resolve_clash(struct sk_buff *skb,
916 struct nf_conntrack_tuple_hash *h)
917 {
918 /* This is the conntrack entry already in hashes that won race. */
919 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
920 enum ip_conntrack_info ctinfo;
921 struct nf_conn *loser_ct;
922
923 loser_ct = nf_ct_get(skb, &ctinfo);
924
925 if (nf_ct_is_dying(ct))
926 return NF_DROP;
927
928 if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
929 nf_ct_match(ct, loser_ct)) {
930 struct net *net = nf_ct_net(ct);
931
932 nf_conntrack_get(&ct->ct_general);
933
934 nf_ct_acct_merge(ct, ctinfo, loser_ct);
935 nf_ct_add_to_dying_list(loser_ct);
936 nf_conntrack_put(&loser_ct->ct_general);
937 nf_ct_set(skb, ct, ctinfo);
938
939 NF_CT_STAT_INC(net, clash_resolve);
940 return NF_ACCEPT;
941 }
942
943 return NF_DROP;
944 }
945
946 /**
947 * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
948 *
949 * @skb: skb that causes the collision
950 * @repl_idx: hash slot for reply direction
951 *
952 * Called when origin or reply direction had a clash.
953 * The skb can be handled without packet drop provided the reply direction
954 * is unique or the existing entry has the identical tuple in both
955 * directions.
956 *
957 * Caller must hold conntrack table locks to prevent concurrent updates.
958 *
959 * Returns NF_DROP if the clash could not be handled.
960 */
961 static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
962 {
963 struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
964 const struct nf_conntrack_zone *zone;
965 struct nf_conntrack_tuple_hash *h;
966 struct hlist_nulls_node *n;
967 struct net *net;
968
969 zone = nf_ct_zone(loser_ct);
970 net = nf_ct_net(loser_ct);
971
972 /* Reply direction must never result in a clash, unless both origin
973 * and reply tuples are identical.
974 */
975 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
976 if (nf_ct_key_equal(h,
977 &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
978 zone, net))
979 return __nf_ct_resolve_clash(skb, h);
980 }
981
982 /* We want the clashing entry to go away real soon: 1 second timeout. */
983 WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ);
984
985 /* IPS_NAT_CLASH removes the entry automatically on the first
986 * reply. Also prevents UDP tracker from moving the entry to
987 * ASSURED state, i.e. the entry can always be evicted under
988 * pressure.
989 */
990 loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;
991
992 __nf_conntrack_insert_prepare(loser_ct);
993
994 /* fake add for ORIGINAL dir: we want lookups to only find the entry
995 * already in the table. This also hides the clashing entry from
996 * ctnetlink iteration, i.e. conntrack -L won't show them.
997 */
998 hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
999
1000 hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
1001 &nf_conntrack_hash[repl_idx]);
1002
1003 NF_CT_STAT_INC(net, clash_resolve);
1004 return NF_ACCEPT;
1005 }
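/* Illustrative scenario for the "harder" path above (assuming NAT): two
 * threads race to send the first UDP packet of the same flow through a
 * masquerade rule. Each packet gets its own unconfirmed conntrack; NAT picks
 * different source ports, so at confirm time the ORIGINAL tuples clash while
 * the REPLY tuples remain unique. The losing entry is then inserted only in
 * the REPLY direction, with IPS_FIXED_TIMEOUT | IPS_NAT_CLASH and a one
 * second timeout, so replies already addressed to the losing port are still
 * delivered instead of the packet being dropped.
 */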
1006
1007 /**
1008 * nf_ct_resolve_clash - attempt to handle clash without packet drop
1009 *
1010 * @skb: skb that causes the clash
1011 * @h: tuplehash of the clashing entry already in table
1012 * @reply_hash: hash slot for reply direction
1013 *
1014 * A conntrack entry can be inserted to the connection tracking table
1015 * if there is no existing entry with an identical tuple.
1016 *
1017 * If there is one, @skb (and the associated, unconfirmed conntrack) has
1018 * to be dropped. In case @skb is retransmitted, next conntrack lookup
1019 * will find the already-existing entry.
1020 *
1021 * The major problem with such packet drop is the extra delay added by
1022 * the packet loss -- it will take some time for a retransmit to occur
1023 * (or the sender to time out when waiting for a reply).
1024 *
1025 * This function attempts to handle the situation without packet drop.
1026 *
1027 * If @skb has no NAT transformation or if the colliding entries are
1028 * exactly the same, only the to-be-confirmed conntrack entry is discarded
1029 * and @skb is associated with the conntrack entry already in the table.
1030 *
1031 * Failing that, the new, unconfirmed conntrack is still added to the table
1032 * provided that the collision only occurs in the ORIGINAL direction.
1033 * The new entry will be added only in the non-clashing REPLY direction,
1034 * so packets in the ORIGINAL direction will continue to match the existing
1035 * entry. The new entry will also have a fixed timeout so it expires --
1036 * due to the collision, it will only see reply traffic.
1037 *
1038 * Returns NF_DROP if the clash could not be resolved.
1039 */
1040 static __cold noinline int
1041 nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
1042 u32 reply_hash)
1043 {
1044 /* This is the conntrack entry already in hashes that won race. */
1045 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
1046 const struct nf_conntrack_l4proto *l4proto;
1047 enum ip_conntrack_info ctinfo;
1048 struct nf_conn *loser_ct;
1049 struct net *net;
1050 int ret;
1051
1052 loser_ct = nf_ct_get(skb, &ctinfo);
1053 net = nf_ct_net(loser_ct);
1054
1055 l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
1056 if (!l4proto->allow_clash)
1057 goto drop;
1058
1059 ret = __nf_ct_resolve_clash(skb, h);
1060 if (ret == NF_ACCEPT)
1061 return ret;
1062
1063 ret = nf_ct_resolve_clash_harder(skb, reply_hash);
1064 if (ret == NF_ACCEPT)
1065 return ret;
1066
1067 drop:
1068 nf_ct_add_to_dying_list(loser_ct);
1069 NF_CT_STAT_INC(net, drop);
1070 NF_CT_STAT_INC(net, insert_failed);
1071 return NF_DROP;
1072 }
1073
1074 /* Confirm a connection given skb; places it in hash table */
1075 int
1076 __nf_conntrack_confirm(struct sk_buff *skb)
1077 {
1078 const struct nf_conntrack_zone *zone;
1079 unsigned int hash, reply_hash;
1080 struct nf_conntrack_tuple_hash *h;
1081 struct nf_conn *ct;
1082 struct nf_conn_help *help;
1083 struct hlist_nulls_node *n;
1084 enum ip_conntrack_info ctinfo;
1085 struct net *net;
1086 unsigned int sequence;
1087 int ret = NF_DROP;
1088
1089 ct = nf_ct_get(skb, &ctinfo);
1090 net = nf_ct_net(ct);
1091
1092 /* ipt_REJECT uses nf_conntrack_attach to attach related
1093 ICMP/TCP RST packets in other direction. Actual packet
1094 which created connection will be IP_CT_NEW or for an
1095 expected connection, IP_CT_RELATED. */
1096 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
1097 return NF_ACCEPT;
1098
1099 zone = nf_ct_zone(ct);
1100 local_bh_disable();
1101
1102 do {
1103 sequence = read_seqcount_begin(&nf_conntrack_generation);
1104 /* reuse the hash saved before */
1105 hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
1106 hash = scale_hash(hash);
1107 reply_hash = hash_conntrack(net,
1108 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
1109
1110 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
1111
1112 /* We're not in hash table, and we refuse to set up related
1113 * connections for unconfirmed conns. But packet copies and
1114 * REJECT will give spurious warnings here.
1115 */
1116
1117 /* Another skb with the same unconfirmed conntrack may
1118 * win the race. This may happen with bridge (br_flood) or
1119 * broadcast/multicast packets, which are skb_clone()d while the
1120 * conntrack is still unconfirmed.
1121 */
1122 if (unlikely(nf_ct_is_confirmed(ct))) {
1123 WARN_ON_ONCE(1);
1124 nf_conntrack_double_unlock(hash, reply_hash);
1125 local_bh_enable();
1126 return NF_DROP;
1127 }
1128
1129 pr_debug("Confirming conntrack %p\n", ct);
1130 /* We have to check the DYING flag after unlink to prevent
1131 * a race against nf_ct_get_next_corpse() possibly called from
1132 * user context, else we insert an already 'dead' hash, blocking
1133 * further use of that particular connection -JM.
1134 */
1135 nf_ct_del_from_dying_or_unconfirmed_list(ct);
1136
1137 if (unlikely(nf_ct_is_dying(ct))) {
1138 nf_ct_add_to_dying_list(ct);
1139 NF_CT_STAT_INC(net, insert_failed);
1140 goto dying;
1141 }
1142
1143 /* See if there's one in the list already, including reverse:
1144 NAT could have grabbed it without realizing, since we're
1145 not in the hash. If there is, we lost the race. */
1146 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
1147 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
1148 zone, net))
1149 goto out;
1150
1151 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
1152 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
1153 zone, net))
1154 goto out;
1155
1156 /* Timer relative to confirmation time, not original
1157 setting time, otherwise we'd get timer wrap in
1158 weird delay cases. */
1159 ct->timeout += nfct_time_stamp;
1160
1161 __nf_conntrack_insert_prepare(ct);
1162
1163 /* Since the lookup is lockless, hash insertion must be done after
1164 * starting the timer and setting the CONFIRMED bit. The RCU barriers
1165 * guarantee that no other CPU can find the conntrack before the above
1166 * stores are visible.
1167 */
1168 __nf_conntrack_hash_insert(ct, hash, reply_hash);
1169 nf_conntrack_double_unlock(hash, reply_hash);
1170 local_bh_enable();
1171
1172 help = nfct_help(ct);
1173 if (help && help->helper)
1174 nf_conntrack_event_cache(IPCT_HELPER, ct);
1175
1176 nf_conntrack_event_cache(master_ct(ct) ?
1177 IPCT_RELATED : IPCT_NEW, ct);
1178 return NF_ACCEPT;
1179
1180 out:
1181 ret = nf_ct_resolve_clash(skb, h, reply_hash);
1182 dying:
1183 nf_conntrack_double_unlock(hash, reply_hash);
1184 local_bh_enable();
1185 return ret;
1186 }
1187 EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
1188
1189 /* Returns true if a connection corresponds to the tuple (required
1190 for NAT). */
1191 int
1192 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
1193 const struct nf_conn *ignored_conntrack)
1194 {
1195 struct net *net = nf_ct_net(ignored_conntrack);
1196 const struct nf_conntrack_zone *zone;
1197 struct nf_conntrack_tuple_hash *h;
1198 struct hlist_nulls_head *ct_hash;
1199 unsigned int hash, hsize;
1200 struct hlist_nulls_node *n;
1201 struct nf_conn *ct;
1202
1203 zone = nf_ct_zone(ignored_conntrack);
1204
1205 rcu_read_lock();
1206 begin:
1207 nf_conntrack_get_ht(&ct_hash, &hsize);
1208 hash = __hash_conntrack(net, tuple, hsize);
1209
1210 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
1211 ct = nf_ct_tuplehash_to_ctrack(h);
1212
1213 if (ct == ignored_conntrack)
1214 continue;
1215
1216 if (nf_ct_is_expired(ct)) {
1217 nf_ct_gc_expired(ct);
1218 continue;
1219 }
1220
1221 if (nf_ct_key_equal(h, tuple, zone, net)) {
1222 /* Tuple is taken already, so caller will need to find
1223 * a new source port to use.
1224 *
1225 * Only exception:
1226 * If the *original tuples* are identical, then both
1227 * conntracks refer to the same flow.
1228 * This is a rare situation, it can occur e.g. when
1229 * more than one UDP packet is sent from same socket
1230 * in different threads.
1231 *
1232 * Let nf_ct_resolve_clash() deal with this later.
1233 */
1234 if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
1235 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
1236 nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL))
1237 continue;
1238
1239 NF_CT_STAT_INC_ATOMIC(net, found);
1240 rcu_read_unlock();
1241 return 1;
1242 }
1243 }
1244
1245 if (get_nulls_value(n) != hash) {
1246 NF_CT_STAT_INC_ATOMIC(net, search_restart);
1247 goto begin;
1248 }
1249
1250 rcu_read_unlock();
1251
1252 return 0;
1253 }
1254 EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
1255
1256 #define NF_CT_EVICTION_RANGE 8
1257
1258 /* There's a small race here where we may free a just-assured
1259 connection. Too bad: we're in trouble anyway. */
1260 static unsigned int early_drop_list(struct net *net,
1261 struct hlist_nulls_head *head)
1262 {
1263 struct nf_conntrack_tuple_hash *h;
1264 struct hlist_nulls_node *n;
1265 unsigned int drops = 0;
1266 struct nf_conn *tmp;
1267
1268 hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
1269 tmp = nf_ct_tuplehash_to_ctrack(h);
1270
1271 if (test_bit(IPS_OFFLOAD_BIT, &tmp->status))
1272 continue;
1273
1274 if (nf_ct_is_expired(tmp)) {
1275 nf_ct_gc_expired(tmp);
1276 continue;
1277 }
1278
1279 if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
1280 !net_eq(nf_ct_net(tmp), net) ||
1281 nf_ct_is_dying(tmp))
1282 continue;
1283
1284 if (!atomic_inc_not_zero(&tmp->ct_general.use))
1285 continue;
1286
1287 /* kill only if still in same netns -- might have moved due to
1288 * SLAB_TYPESAFE_BY_RCU rules.
1289 *
1290 * We steal the timer reference. If that fails timer has
1291 * already fired or someone else deleted it. Just drop ref
1292 * and move to next entry.
1293 */
1294 if (net_eq(nf_ct_net(tmp), net) &&
1295 nf_ct_is_confirmed(tmp) &&
1296 nf_ct_delete(tmp, 0, 0))
1297 drops++;
1298
1299 nf_ct_put(tmp);
1300 }
1301
1302 return drops;
1303 }
1304
1305 static noinline int early_drop(struct net *net, unsigned int hash)
1306 {
1307 unsigned int i, bucket;
1308
1309 for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
1310 struct hlist_nulls_head *ct_hash;
1311 unsigned int hsize, drops;
1312
1313 rcu_read_lock();
1314 nf_conntrack_get_ht(&ct_hash, &hsize);
1315 if (!i)
1316 bucket = reciprocal_scale(hash, hsize);
1317 else
1318 bucket = (bucket + 1) % hsize;
1319
1320 drops = early_drop_list(net, &ct_hash[bucket]);
1321 rcu_read_unlock();
1322
1323 if (drops) {
1324 NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
1325 return true;
1326 }
1327 }
1328
1329 return false;
1330 }
1331
1332 static bool gc_worker_skip_ct(const struct nf_conn *ct)
1333 {
1334 return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct);
1335 }
1336
1337 static bool gc_worker_can_early_drop(const struct nf_conn *ct)
1338 {
1339 const struct nf_conntrack_l4proto *l4proto;
1340
1341 if (!test_bit(IPS_ASSURED_BIT, &ct->status))
1342 return true;
1343
1344 l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
1345 if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
1346 return true;
1347
1348 return false;
1349 }
1350
1351 static void gc_worker(struct work_struct *work)
1352 {
1353 unsigned long end_time = jiffies + GC_SCAN_MAX_DURATION;
1354 unsigned int i, hashsz, nf_conntrack_max95 = 0;
1355 unsigned long next_run = GC_SCAN_INTERVAL;
1356 struct conntrack_gc_work *gc_work;
1357 gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
1358
1359 i = gc_work->next_bucket;
1360 if (gc_work->early_drop)
1361 nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
1362
1363 do {
1364 struct nf_conntrack_tuple_hash *h;
1365 struct hlist_nulls_head *ct_hash;
1366 struct hlist_nulls_node *n;
1367 struct nf_conn *tmp;
1368
1369 rcu_read_lock();
1370
1371 nf_conntrack_get_ht(&ct_hash, &hashsz);
1372 if (i >= hashsz) {
1373 rcu_read_unlock();
1374 break;
1375 }
1376
1377 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
1378 struct net *net;
1379
1380 tmp = nf_ct_tuplehash_to_ctrack(h);
1381
1382 if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
1383 nf_ct_offload_timeout(tmp);
1384 continue;
1385 }
1386
1387 if (nf_ct_is_expired(tmp)) {
1388 nf_ct_gc_expired(tmp);
1389 continue;
1390 }
1391
1392 if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
1393 continue;
1394
1395 net = nf_ct_net(tmp);
1396 if (atomic_read(&net->ct.count) < nf_conntrack_max95)
1397 continue;
1398
1399 /* need to take reference to avoid possible races */
1400 if (!atomic_inc_not_zero(&tmp->ct_general.use))
1401 continue;
1402
1403 if (gc_worker_skip_ct(tmp)) {
1404 nf_ct_put(tmp);
1405 continue;
1406 }
1407
1408 if (gc_worker_can_early_drop(tmp))
1409 nf_ct_kill(tmp);
1410
1411 nf_ct_put(tmp);
1412 }
1413
1414 /* could check get_nulls_value() here and restart if ct
1415 * was moved to another chain. But given gc is best-effort
1416 * we will just continue with next hash slot.
1417 */
1418 rcu_read_unlock();
1419 cond_resched();
1420 i++;
1421
1422 if (time_after(jiffies, end_time) && i < hashsz) {
1423 gc_work->next_bucket = i;
1424 next_run = 0;
1425 break;
1426 }
1427 } while (i < hashsz);
1428
1429 if (gc_work->exiting)
1430 return;
1431
1432 /*
1433 * Eviction will normally happen from the packet path, and not
1434 * from this gc worker.
1435 *
1436 * This worker is only here to reap expired entries when the system went
1437 * idle after a busy period.
1438 */
1439 if (next_run) {
1440 gc_work->early_drop = false;
1441 gc_work->next_bucket = 0;
1442 }
1443 queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
1444 }
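/* Scheduling sketch for the worker above: a single invocation is bounded by
 * GC_SCAN_MAX_DURATION (10 ms). If the budget runs out before the whole
 * table has been scanned, next_run is set to 0 and the work is requeued
 * immediately, resuming at gc_work->next_bucket; once a full pass completes,
 * early_drop and next_bucket are reset and the worker backs off to
 * GC_SCAN_INTERVAL (120 s).
 */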
1445
1446 static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
1447 {
1448 INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker);
1449 gc_work->exiting = false;
1450 }
1451
1452 static struct nf_conn *
1453 __nf_conntrack_alloc(struct net *net,
1454 const struct nf_conntrack_zone *zone,
1455 const struct nf_conntrack_tuple *orig,
1456 const struct nf_conntrack_tuple *repl,
1457 gfp_t gfp, u32 hash)
1458 {
1459 struct nf_conn *ct;
1460
1461 /* We don't want any race condition at early drop stage */
1462 atomic_inc(&net->ct.count);
1463
1464 if (nf_conntrack_max &&
1465 unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
1466 if (!early_drop(net, hash)) {
1467 if (!conntrack_gc_work.early_drop)
1468 conntrack_gc_work.early_drop = true;
1469 atomic_dec(&net->ct.count);
1470 net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
1471 return ERR_PTR(-ENOMEM);
1472 }
1473 }
1474
1475 /*
1476 * Do not use kmem_cache_zalloc(), as this cache uses
1477 * SLAB_TYPESAFE_BY_RCU.
1478 */
1479 ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
1480 if (ct == NULL)
1481 goto out;
1482
1483 spin_lock_init(&ct->lock);
1484 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
1485 ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
1486 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
1487 /* save hash for reusing when confirming */
1488 *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
1489 ct->status = 0;
1490 WRITE_ONCE(ct->timeout, 0);
1491 write_pnet(&ct->ct_net, net);
1492 memset(&ct->__nfct_init_offset, 0,
1493 offsetof(struct nf_conn, proto) -
1494 offsetof(struct nf_conn, __nfct_init_offset));
1495
1496 nf_ct_zone_add(ct, zone);
1497
1498 /* Because we use RCU lookups, we set ct_general.use to zero before
1499 * this is inserted in any list.
1500 */
1501 atomic_set(&ct->ct_general.use, 0);
1502 return ct;
1503 out:
1504 atomic_dec(&net->ct.count);
1505 return ERR_PTR(-ENOMEM);
1506 }
1507
1508 struct nf_conn *nf_conntrack_alloc(struct net *net,
1509 const struct nf_conntrack_zone *zone,
1510 const struct nf_conntrack_tuple *orig,
1511 const struct nf_conntrack_tuple *repl,
1512 gfp_t gfp)
1513 {
1514 return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
1515 }
1516 EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
1517
1518 void nf_conntrack_free(struct nf_conn *ct)
1519 {
1520 struct net *net = nf_ct_net(ct);
1521
1522 /* A freed object has refcnt == 0, that's
1523 * the golden rule for SLAB_TYPESAFE_BY_RCU
1524 */
1525 WARN_ON(atomic_read(&ct->ct_general.use) != 0);
1526
1527 nf_ct_ext_destroy(ct);
1528 kmem_cache_free(nf_conntrack_cachep, ct);
1529 smp_mb__before_atomic();
1530 atomic_dec(&net->ct.count);
1531 }
1532 EXPORT_SYMBOL_GPL(nf_conntrack_free);
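/* Lifetime sketch for the SLAB_TYPESAFE_BY_RCU cache used above: a freed
 * nf_conn keeps refcnt == 0 and may be recycled for a new flow without an
 * intervening RCU grace period. Lockless readers therefore follow the
 * pattern used in __nf_conntrack_find_get():
 *
 *	if (atomic_inc_not_zero(&ct->ct_general.use)) {
 *		if (!nf_ct_key_equal(h, tuple, zone, net))
 *			nf_ct_put(ct);	// recycled under us, drop it
 *	}
 *
 * i.e. take a reference only from a non-zero refcount and re-check the key
 * afterwards.
 */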
1533
1534
1535 /* Allocate a new conntrack: we return -ENOMEM if classification
1536 failed due to stress. Otherwise it really is unclassifiable. */
1537 static noinline struct nf_conntrack_tuple_hash *
1538 init_conntrack(struct net *net, struct nf_conn *tmpl,
1539 const struct nf_conntrack_tuple *tuple,
1540 struct sk_buff *skb,
1541 unsigned int dataoff, u32 hash)
1542 {
1543 struct nf_conn *ct;
1544 struct nf_conn_help *help;
1545 struct nf_conntrack_tuple repl_tuple;
1546 struct nf_conntrack_ecache *ecache;
1547 struct nf_conntrack_expect *exp = NULL;
1548 const struct nf_conntrack_zone *zone;
1549 struct nf_conn_timeout *timeout_ext;
1550 struct nf_conntrack_zone tmp;
1551
1552 if (!nf_ct_invert_tuple(&repl_tuple, tuple)) {
1553 pr_debug("Can't invert tuple.\n");
1554 return NULL;
1555 }
1556
1557 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
1558 ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
1559 hash);
1560 if (IS_ERR(ct))
1561 return (struct nf_conntrack_tuple_hash *)ct;
1562
1563 if (!nf_ct_add_synproxy(ct, tmpl)) {
1564 nf_conntrack_free(ct);
1565 return ERR_PTR(-ENOMEM);
1566 }
1567
1568 timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
1569
1570 if (timeout_ext)
1571 nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
1572 GFP_ATOMIC);
1573
1574 nf_ct_acct_ext_add(ct, GFP_ATOMIC);
1575 nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
1576 nf_ct_labels_ext_add(ct);
1577
1578 ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
1579 nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
1580 ecache ? ecache->expmask : 0,
1581 GFP_ATOMIC);
1582
1583 local_bh_disable();
1584 if (net->ct.expect_count) {
1585 spin_lock(&nf_conntrack_expect_lock);
1586 exp = nf_ct_find_expectation(net, zone, tuple);
1587 if (exp) {
1588 pr_debug("expectation arrives ct=%p exp=%p\n",
1589 ct, exp);
1590 /* Welcome, Mr. Bond. We've been expecting you... */
1591 __set_bit(IPS_EXPECTED_BIT, &ct->status);
1592 /* exp->master safe, refcnt bumped in nf_ct_find_expectation */
1593 ct->master = exp->master;
1594 if (exp->helper) {
1595 help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
1596 if (help)
1597 rcu_assign_pointer(help->helper, exp->helper);
1598 }
1599
1600 #ifdef CONFIG_NF_CONNTRACK_MARK
1601 ct->mark = READ_ONCE(exp->master->mark);
1602 #endif
1603 #ifdef CONFIG_NF_CONNTRACK_SECMARK
1604 ct->secmark = exp->master->secmark;
1605 #endif
1606 NF_CT_STAT_INC(net, expect_new);
1607 }
1608 spin_unlock(&nf_conntrack_expect_lock);
1609 }
1610 if (!exp)
1611 __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
1612
1613 /* Now it is inserted into the unconfirmed list, bump refcount */
1614 nf_conntrack_get(&ct->ct_general);
1615 nf_ct_add_to_unconfirmed_list(ct);
1616
1617 local_bh_enable();
1618
1619 if (exp) {
1620 if (exp->expectfn)
1621 exp->expectfn(ct, exp);
1622 nf_ct_expect_put(exp);
1623 }
1624
1625 return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
1626 }
1627
1628 /* On success, returns 0, sets skb->_nfct | ctinfo */
1629 static int
1630 resolve_normal_ct(struct nf_conn *tmpl,
1631 struct sk_buff *skb,
1632 unsigned int dataoff,
1633 u_int8_t protonum,
1634 const struct nf_hook_state *state)
1635 {
1636 const struct nf_conntrack_zone *zone;
1637 struct nf_conntrack_tuple tuple;
1638 struct nf_conntrack_tuple_hash *h;
1639 enum ip_conntrack_info ctinfo;
1640 struct nf_conntrack_zone tmp;
1641 struct nf_conn *ct;
1642 u32 hash;
1643
1644 if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
1645 dataoff, state->pf, protonum, state->net,
1646 &tuple)) {
1647 pr_debug("Can't get tuple\n");
1648 return 0;
1649 }
1650
1651 /* look for tuple match */
1652 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
1653 hash = hash_conntrack_raw(&tuple, state->net);
1654 h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);
1655 if (!h) {
1656 h = init_conntrack(state->net, tmpl, &tuple,
1657 skb, dataoff, hash);
1658 if (!h)
1659 return 0;
1660 if (IS_ERR(h))
1661 return PTR_ERR(h);
1662 }
1663 ct = nf_ct_tuplehash_to_ctrack(h);
1664
1665 /* It exists; we have (non-exclusive) reference. */
1666 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
1667 ctinfo = IP_CT_ESTABLISHED_REPLY;
1668 } else {
1669 /* Once we've had two way comms, always ESTABLISHED. */
1670 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1671 pr_debug("normal packet for %p\n", ct);
1672 ctinfo = IP_CT_ESTABLISHED;
1673 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
1674 pr_debug("related packet for %p\n", ct);
1675 ctinfo = IP_CT_RELATED;
1676 } else {
1677 pr_debug("new packet for %p\n", ct);
1678 ctinfo = IP_CT_NEW;
1679 }
1680 }
1681 nf_ct_set(skb, ct, ctinfo);
1682 return 0;
1683 }
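/* ctinfo selection above, summarised (reply direction checked first):
 *
 *	direction == IP_CT_DIR_REPLY		-> IP_CT_ESTABLISHED_REPLY
 *	IPS_SEEN_REPLY_BIT set			-> IP_CT_ESTABLISHED
 *	IPS_EXPECTED_BIT set			-> IP_CT_RELATED
 *	otherwise (first packet of a flow)	-> IP_CT_NEW
 */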
1684
1685 /*
1686 * icmp packets need special treatment to handle error messages that are
1687 * related to a connection.
1688 *
1689 * Callers need to check if skb has a conntrack assigned when this
1690 * helper returns; in such case skb belongs to an already known connection.
1691 */
1692 static unsigned int __cold
1693 nf_conntrack_handle_icmp(struct nf_conn *tmpl,
1694 struct sk_buff *skb,
1695 unsigned int dataoff,
1696 u8 protonum,
1697 const struct nf_hook_state *state)
1698 {
1699 int ret;
1700
1701 if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP)
1702 ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state);
1703 #if IS_ENABLED(CONFIG_IPV6)
1704 else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6)
1705 ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state);
1706 #endif
1707 else
1708 return NF_ACCEPT;
1709
1710 if (ret <= 0)
1711 NF_CT_STAT_INC_ATOMIC(state->net, error);
1712
1713 return ret;
1714 }
1715
1716 static int generic_packet(struct nf_conn *ct, struct sk_buff *skb,
1717 enum ip_conntrack_info ctinfo)
1718 {
1719 const unsigned int *timeout = nf_ct_timeout_lookup(ct);
1720
1721 if (!timeout)
1722 timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout;
1723
1724 nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
1725 return NF_ACCEPT;
1726 }
1727
1728 /* Returns verdict for packet, or -1 for invalid. */
1729 static int nf_conntrack_handle_packet(struct nf_conn *ct,
1730 struct sk_buff *skb,
1731 unsigned int dataoff,
1732 enum ip_conntrack_info ctinfo,
1733 const struct nf_hook_state *state)
1734 {
1735 switch (nf_ct_protonum(ct)) {
1736 case IPPROTO_TCP:
1737 return nf_conntrack_tcp_packet(ct, skb, dataoff,
1738 ctinfo, state);
1739 case IPPROTO_UDP:
1740 return nf_conntrack_udp_packet(ct, skb, dataoff,
1741 ctinfo, state);
1742 case IPPROTO_ICMP:
1743 return nf_conntrack_icmp_packet(ct, skb, ctinfo, state);
1744 #if IS_ENABLED(CONFIG_IPV6)
1745 case IPPROTO_ICMPV6:
1746 return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state);
1747 #endif
1748 #ifdef CONFIG_NF_CT_PROTO_UDPLITE
1749 case IPPROTO_UDPLITE:
1750 return nf_conntrack_udplite_packet(ct, skb, dataoff,
1751 ctinfo, state);
1752 #endif
1753 #ifdef CONFIG_NF_CT_PROTO_SCTP
1754 case IPPROTO_SCTP:
1755 return nf_conntrack_sctp_packet(ct, skb, dataoff,
1756 ctinfo, state);
1757 #endif
1758 #ifdef CONFIG_NF_CT_PROTO_DCCP
1759 case IPPROTO_DCCP:
1760 return nf_conntrack_dccp_packet(ct, skb, dataoff,
1761 ctinfo, state);
1762 #endif
1763 #ifdef CONFIG_NF_CT_PROTO_GRE
1764 case IPPROTO_GRE:
1765 return nf_conntrack_gre_packet(ct, skb, dataoff,
1766 ctinfo, state);
1767 #endif
1768 }
1769
1770 return generic_packet(ct, skb, ctinfo);
1771 }
1772
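/* Main conntrack hook entry point: resolve (or create) the conntrack
 * for this skb, run the per-protocol state machine and translate its
 * result into a netfilter verdict. A -NF_REPEAT from the TCP tracker
 * restarts the lookup so a fresh entry can replace a closed/aborted
 * connection.
 */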
1773 unsigned int
1774 nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
1775 {
1776 enum ip_conntrack_info ctinfo;
1777 struct nf_conn *ct, *tmpl;
1778 u_int8_t protonum;
1779 int dataoff, ret;
1780
1781 tmpl = nf_ct_get(skb, &ctinfo);
1782 if (tmpl || ctinfo == IP_CT_UNTRACKED) {
1783 /* Previously seen (loopback or untracked)? Ignore. */
1784 if ((tmpl && !nf_ct_is_template(tmpl)) ||
1785 ctinfo == IP_CT_UNTRACKED)
1786 return NF_ACCEPT;
1787 skb->_nfct = 0;
1788 }
1789
1790 /* rcu_read_lock()ed by nf_hook_thresh */
1791 dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum);
1792 if (dataoff <= 0) {
1793 pr_debug("not prepared to track yet or error occurred\n");
1794 NF_CT_STAT_INC_ATOMIC(state->net, invalid);
1795 ret = NF_ACCEPT;
1796 goto out;
1797 }
1798
1799 if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) {
1800 ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff,
1801 protonum, state);
1802 if (ret <= 0) {
1803 ret = -ret;
1804 goto out;
1805 }
1806 /* ICMP[v6] protocol trackers may assign one conntrack. */
1807 if (skb->_nfct)
1808 goto out;
1809 }
1810 repeat:
1811 ret = resolve_normal_ct(tmpl, skb, dataoff,
1812 protonum, state);
1813 if (ret < 0) {
1814 /* Too stressed to deal. */
1815 NF_CT_STAT_INC_ATOMIC(state->net, drop);
1816 ret = NF_DROP;
1817 goto out;
1818 }
1819
1820 ct = nf_ct_get(skb, &ctinfo);
1821 if (!ct) {
1822 /* Not valid part of a connection */
1823 NF_CT_STAT_INC_ATOMIC(state->net, invalid);
1824 ret = NF_ACCEPT;
1825 goto out;
1826 }
1827
1828 ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state);
1829 if (ret <= 0) {
1830 /* Invalid: inverse of the return code tells
1831 * the netfilter core what to do */
1832 pr_debug("nf_conntrack_in: Can't track with proto module\n");
1833 nf_conntrack_put(&ct->ct_general);
1834 skb->_nfct = 0;
1835 /* Special case: TCP tracker reports an attempt to reopen a
1836 * closed/aborted connection. We have to go back and create a
1837 * fresh conntrack.
1838 */
1839 if (ret == -NF_REPEAT)
1840 goto repeat;
1841
1842 NF_CT_STAT_INC_ATOMIC(state->net, invalid);
1843 if (ret == -NF_DROP)
1844 NF_CT_STAT_INC_ATOMIC(state->net, drop);
1845
1846 ret = -ret;
1847 goto out;
1848 }
1849
1850 if (ctinfo == IP_CT_ESTABLISHED_REPLY &&
1851 !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
1852 nf_conntrack_event_cache(IPCT_REPLY, ct);
1853 out:
1854 if (tmpl)
1855 nf_ct_put(tmpl);
1856
1857 return ret;
1858 }
1859 EXPORT_SYMBOL_GPL(nf_conntrack_in);
1860
1861 /* Alter reply tuple (maybe alter helper). This is for NAT, and is
1862 implicitly racy: see __nf_conntrack_confirm */
1863 void nf_conntrack_alter_reply(struct nf_conn *ct,
1864 const struct nf_conntrack_tuple *newreply)
1865 {
1866 struct nf_conn_help *help = nfct_help(ct);
1867
1868 /* Should be unconfirmed, so not in hash table yet */
1869 WARN_ON(nf_ct_is_confirmed(ct));
1870
1871 pr_debug("Altering reply tuple of %p to ", ct);
1872 nf_ct_dump_tuple(newreply);
1873
1874 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1875 if (ct->master || (help && !hlist_empty(&help->expectations)))
1876 return;
1877
1878 rcu_read_lock();
1879 __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
1880 rcu_read_unlock();
1881 }
1882 EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
1883
1884 /* Refresh the conntrack timeout for this many jiffies and update accounting if do_acct is true */
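/* Unless IPS_FIXED_TIMEOUT is set, ct->timeout becomes "now + extra_jiffies"
 * for confirmed entries; unconfirmed entries only store the relative value
 * until they are confirmed. Callers such as generic_packet() above go
 * through the nf_ct_refresh_acct() wrapper, which passes do_acct == true.
 */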
1885 void __nf_ct_refresh_acct(struct nf_conn *ct,
1886 enum ip_conntrack_info ctinfo,
1887 const struct sk_buff *skb,
1888 u32 extra_jiffies,
1889 bool do_acct)
1890 {
1891 /* Only update if this is not a fixed timeout */
1892 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
1893 goto acct;
1894
1895 /* If not in hash table, timer will not be active yet */
1896 if (nf_ct_is_confirmed(ct))
1897 extra_jiffies += nfct_time_stamp;
1898
1899 if (READ_ONCE(ct->timeout) != extra_jiffies)
1900 WRITE_ONCE(ct->timeout, extra_jiffies);
1901 acct:
1902 if (do_acct)
1903 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
1904 }
1905 EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
1906
1907 bool nf_ct_kill_acct(struct nf_conn *ct,
1908 enum ip_conntrack_info ctinfo,
1909 const struct sk_buff *skb)
1910 {
1911 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
1912
1913 return nf_ct_delete(ct, 0, 0);
1914 }
1915 EXPORT_SYMBOL_GPL(nf_ct_kill_acct);
1916
1917 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1918
1919 #include <linux/netfilter/nfnetlink.h>
1920 #include <linux/netfilter/nfnetlink_conntrack.h>
1921 #include <linux/mutex.h>
1922
1923 /* Generic function for tcp/udp/sctp/dccp and alike. */
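/* The L4 keys live in a union inside the tuple, so ->src.u.tcp.port
 * aliases the UDP/SCTP/DCCP port fields and a single helper can
 * serialize all port-based protocols.
 */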
1924 int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
1925 const struct nf_conntrack_tuple *tuple)
1926 {
1927 if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) ||
1928 nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port))
1929 goto nla_put_failure;
1930 return 0;
1931
1932 nla_put_failure:
1933 return -1;
1934 }
1935 EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);
1936
1937 const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
1938 [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 },
1939 [CTA_PROTO_DST_PORT] = { .type = NLA_U16 },
1940 };
1941 EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);
1942
1943 int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
1944 struct nf_conntrack_tuple *t,
1945 u_int32_t flags)
1946 {
1947 if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) {
1948 if (!tb[CTA_PROTO_SRC_PORT])
1949 return -EINVAL;
1950
1951 t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
1952 }
1953
1954 if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) {
1955 if (!tb[CTA_PROTO_DST_PORT])
1956 return -EINVAL;
1957
1958 t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
1959 }
1960
1961 return 0;
1962 }
1963 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);
1964
1965 unsigned int nf_ct_port_nlattr_tuple_size(void)
1966 {
1967 static unsigned int size __read_mostly;
1968
1969 if (!size)
1970 size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
1971
1972 return size;
1973 }
1974 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
1975 #endif
1976
1977 /* Used by ipt_REJECT and ip6t_REJECT. */
1978 static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
1979 {
1980 struct nf_conn *ct;
1981 enum ip_conntrack_info ctinfo;
1982
1983 /* This ICMP is in reverse direction to the packet which caused it */
1984 ct = nf_ct_get(skb, &ctinfo);
1985 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1986 ctinfo = IP_CT_RELATED_REPLY;
1987 else
1988 ctinfo = IP_CT_RELATED;
1989
1990 /* Attach to new skbuff, and increment count */
1991 nf_ct_set(nskb, ct, ctinfo);
1992 nf_conntrack_get(skb_nfct(nskb));
1993 }
1994
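/* Handle a packet reinjected from nf_queue whose unconfirmed conntrack
 * lost a clash against an entry that got confirmed in the meantime:
 * rebuild the original tuple (undoing any NAT this conntrack would have
 * applied), look up the winning entry, attach it to the skb and redo
 * src/dst NAT mangling according to the old status bits.
 */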
1995 static int __nf_conntrack_update(struct net *net, struct sk_buff *skb,
1996 struct nf_conn *ct,
1997 enum ip_conntrack_info ctinfo)
1998 {
1999 struct nf_conntrack_tuple_hash *h;
2000 struct nf_conntrack_tuple tuple;
2001 struct nf_nat_hook *nat_hook;
2002 unsigned int status;
2003 int dataoff;
2004 u16 l3num;
2005 u8 l4num;
2006
2007 l3num = nf_ct_l3num(ct);
2008
2009 dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num);
2010 if (dataoff <= 0)
2011 return -1;
2012
2013 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
2014 l4num, net, &tuple))
2015 return -1;
2016
2017 if (ct->status & IPS_SRC_NAT) {
2018 memcpy(tuple.src.u3.all,
2019 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,
2020 sizeof(tuple.src.u3.all));
2021 tuple.src.u.all =
2022 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;
2023 }
2024
2025 if (ct->status & IPS_DST_NAT) {
2026 memcpy(tuple.dst.u3.all,
2027 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all,
2028 sizeof(tuple.dst.u3.all));
2029 tuple.dst.u.all =
2030 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all;
2031 }
2032
2033 h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple);
2034 if (!h)
2035 return 0;
2036
2037 /* Store status bits of the conntrack that is clashing to re-do NAT
2038 * mangling according to what has already been done to this packet.
2039 */
2040 status = ct->status;
2041
2042 nf_ct_put(ct);
2043 ct = nf_ct_tuplehash_to_ctrack(h);
2044 nf_ct_set(skb, ct, ctinfo);
2045
2046 nat_hook = rcu_dereference(nf_nat_hook);
2047 if (!nat_hook)
2048 return 0;
2049
2050 if (status & IPS_SRC_NAT &&
2051 nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC,
2052 IP_CT_DIR_ORIGINAL) == NF_DROP)
2053 return -1;
2054
2055 if (status & IPS_DST_NAT &&
2056 nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST,
2057 IP_CT_DIR_ORIGINAL) == NF_DROP)
2058 return -1;
2059
2060 return 0;
2061 }
2062
2063 /* This packet is coming from userspace via nf_queue, complete the packet
2064 * processing after the helper invocation in nf_confirm().
2065 */
2066 static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct,
2067 enum ip_conntrack_info ctinfo)
2068 {
2069 const struct nf_conntrack_helper *helper;
2070 const struct nf_conn_help *help;
2071 int protoff;
2072
2073 help = nfct_help(ct);
2074 if (!help)
2075 return 0;
2076
2077 helper = rcu_dereference(help->helper);
/* the helper can be detached concurrently; bail out if it is gone */
if (!helper)
return 0;
2078 if (!(helper->flags & NF_CT_HELPER_F_USERSPACE))
2079 return 0;
2080
2081 switch (nf_ct_l3num(ct)) {
2082 case NFPROTO_IPV4:
2083 protoff = skb_network_offset(skb) + ip_hdrlen(skb);
2084 break;
2085 #if IS_ENABLED(CONFIG_IPV6)
2086 case NFPROTO_IPV6: {
2087 __be16 frag_off;
2088 u8 pnum;
2089
2090 pnum = ipv6_hdr(skb)->nexthdr;
2091 protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
2092 &frag_off);
2093 if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
2094 return 0;
2095 break;
2096 }
2097 #endif
2098 default:
2099 return 0;
2100 }
2101
2102 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
2103 !nf_is_loopback_packet(skb)) {
2104 if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
2105 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
2106 return -1;
2107 }
2108 }
2109
2110 /* We've seen it coming out the other side: confirm it */
2111 return nf_conntrack_confirm(skb) == NF_DROP ? -1 : 0;
2112 }
2113
2114 static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
2115 {
2116 enum ip_conntrack_info ctinfo;
2117 struct nf_conn *ct;
2118 int err;
2119
2120 ct = nf_ct_get(skb, &ctinfo);
2121 if (!ct)
2122 return 0;
2123
2124 if (!nf_ct_is_confirmed(ct)) {
2125 err = __nf_conntrack_update(net, skb, ct, ctinfo);
2126 if (err < 0)
2127 return err;
2128
2129 ct = nf_ct_get(skb, &ctinfo);
2130 }
2131
2132 return nf_confirm_cthelper(skb, ct, ctinfo);
2133 }
2134
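/* Fill @dst_tuple for @skb: use the conntrack already attached to the
 * skb if there is one, otherwise parse the headers (IPv4 only here) and
 * look the tuple up in the default zone, copying the tuple of the
 * opposite direction of the entry that was found.
 */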
2135 static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
2136 const struct sk_buff *skb)
2137 {
2138 const struct nf_conntrack_tuple *src_tuple;
2139 const struct nf_conntrack_tuple_hash *hash;
2140 struct nf_conntrack_tuple srctuple;
2141 enum ip_conntrack_info ctinfo;
2142 struct nf_conn *ct;
2143
2144 ct = nf_ct_get(skb, &ctinfo);
2145 if (ct) {
2146 src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
2147 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
2148 return true;
2149 }
2150
2151 if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
2152 NFPROTO_IPV4, dev_net(skb->dev),
2153 &srctuple))
2154 return false;
2155
2156 hash = nf_conntrack_find_get(dev_net(skb->dev),
2157 &nf_ct_zone_dflt,
2158 &srctuple);
2159 if (!hash)
2160 return false;
2161
2162 ct = nf_ct_tuplehash_to_ctrack(hash);
2163 src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
2164 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
2165 nf_ct_put(ct);
2166
2167 return true;
2168 }
2169
2170 /* Bring out ya dead! */
2171 static struct nf_conn *
2172 get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
2173 void *data, unsigned int *bucket)
2174 {
2175 struct nf_conntrack_tuple_hash *h;
2176 struct nf_conn *ct;
2177 struct hlist_nulls_node *n;
2178 spinlock_t *lockp;
2179
2180 for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
2181 struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket];
2182
2183 if (hlist_nulls_empty(hslot))
2184 continue;
2185
2186 lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
2187 local_bh_disable();
2188 nf_conntrack_lock(lockp);
2189 hlist_nulls_for_each_entry(h, n, hslot, hnnode) {
2190 if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
2191 continue;
2192 /* All nf_conn objects are added to the hash table twice: once
2193 * for the original direction tuple, once for the reply tuple.
2194 *
2195 * Exception: In the IPS_NAT_CLASH case, only the reply
2196 * tuple is added (the original tuple already existed for
2197 * a different object).
2198 *
2199 * We only need to call the iterator once for each
2200 * conntrack, so we just use the 'reply' direction
2201 * tuple while iterating.
2202 */
2203 ct = nf_ct_tuplehash_to_ctrack(h);
2204 if (iter(ct, data))
2205 goto found;
2206 }
2207 spin_unlock(lockp);
2208 local_bh_enable();
2209 cond_resched();
2210 }
2211
2212 return NULL;
2213 found:
2214 atomic_inc(&ct->ct_general.use);
2215 spin_unlock(lockp);
2216 local_bh_enable();
2217 return ct;
2218 }
2219
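/* Walk the whole hash table and delete every conntrack for which @iter
 * returns true. get_next_corpse() returns each match with an extra
 * reference held, which is dropped again after nf_ct_delete().
 */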
2220 static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
2221 void *data, u32 portid, int report)
2222 {
2223 unsigned int bucket = 0;
2224 struct nf_conn *ct;
2225
2226 might_sleep();
2227
2228 mutex_lock(&nf_conntrack_mutex);
2229 while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
2230 /* Time to push up daisies... */
2231
2232 nf_ct_delete(ct, portid, report);
2233 nf_ct_put(ct);
2234 cond_resched();
2235 }
2236 mutex_unlock(&nf_conntrack_mutex);
2237 }
2238
2239 struct iter_data {
2240 int (*iter)(struct nf_conn *i, void *data);
2241 void *data;
2242 struct net *net;
2243 };
2244
2245 static int iter_net_only(struct nf_conn *i, void *data)
2246 {
2247 struct iter_data *d = data;
2248
2249 if (!net_eq(d->net, nf_ct_net(i)))
2250 return 0;
2251
2252 return d->iter(i, d->data);
2253 }
2254
2255 static void
2256 __nf_ct_unconfirmed_destroy(struct net *net)
2257 {
2258 int cpu;
2259
2260 for_each_possible_cpu(cpu) {
2261 struct nf_conntrack_tuple_hash *h;
2262 struct hlist_nulls_node *n;
2263 struct ct_pcpu *pcpu;
2264
2265 pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
2266
2267 spin_lock_bh(&pcpu->lock);
2268 hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
2269 struct nf_conn *ct;
2270
2271 ct = nf_ct_tuplehash_to_ctrack(h);
2272
2273 /* we cannot call iter() on the unconfirmed list; the
2274 * owning cpu can reallocate ct->ext at any time.
2275 */
2276 set_bit(IPS_DYING_BIT, &ct->status);
2277 }
2278 spin_unlock_bh(&pcpu->lock);
2279 cond_resched();
2280 }
2281 }
2282
2283 void nf_ct_unconfirmed_destroy(struct net *net)
2284 {
2285 might_sleep();
2286
2287 if (atomic_read(&net->ct.count) > 0) {
2288 __nf_ct_unconfirmed_destroy(net);
2289 nf_queue_nf_hook_drop(net);
2290 synchronize_net();
2291 }
2292 }
2293 EXPORT_SYMBOL_GPL(nf_ct_unconfirmed_destroy);
2294
2295 void nf_ct_iterate_cleanup_net(struct net *net,
2296 int (*iter)(struct nf_conn *i, void *data),
2297 void *data, u32 portid, int report)
2298 {
2299 struct iter_data d;
2300
2301 might_sleep();
2302
2303 if (atomic_read(&net->ct.count) == 0)
2304 return;
2305
2306 d.iter = iter;
2307 d.data = data;
2308 d.net = net;
2309
2310 nf_ct_iterate_cleanup(iter_net_only, &d, portid, report);
2311 }
2312 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);
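
/* Hypothetical usage sketch (not part of this file): a caller that wants
 * to flush, say, all UDP entries of a namespace would supply an iter
 * callback that returns non-zero for entries to delete:
 *
 *	static int kill_udp(struct nf_conn *ct, void *data)
 *	{
 *		return nf_ct_protonum(ct) == IPPROTO_UDP;
 *	}
 *
 *	nf_ct_iterate_cleanup_net(net, kill_udp, NULL, 0, 0);
 */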
2313
2314 /**
2315 * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table
2316 * @iter: callback to invoke for each conntrack
2317 * @data: data to pass to @iter
2318 *
2319 * Like nf_ct_iterate_cleanup, but first marks conntracks on the
2320 * unconfirmed list as dying (so they will not be inserted into
2321 * main table).
2322 *
2323 * Can only be called in module exit path.
2324 */
2325 void
2326 nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
2327 {
2328 struct net *net;
2329
2330 down_read(&net_rwsem);
2331 for_each_net(net) {
2332 if (atomic_read(&net->ct.count) == 0)
2333 continue;
2334 __nf_ct_unconfirmed_destroy(net);
2335 nf_queue_nf_hook_drop(net);
2336 }
2337 up_read(&net_rwsem);
2338
2339 /* Need to wait for the netns cleanup worker to finish, if it's
2340 * running -- it might have deleted a net namespace from
2341 * the global list, so our __nf_ct_unconfirmed_destroy() might
2342 * not have affected all namespaces.
2343 */
2344 net_ns_barrier();
2345
2346 /* a conntrack could have been unlinked from the unconfirmed list
2347 * before we grabbed the pcpu lock in __nf_ct_unconfirmed_destroy().
2348 * This makes sure it is inserted into the conntrack table.
2349 */
2350 synchronize_net();
2351
2352 nf_ct_iterate_cleanup(iter, data, 0, 0);
2353 }
2354 EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy);
2355
2356 static int kill_all(struct nf_conn *i, void *data)
2357 {
2358 return net_eq(nf_ct_net(i), data);
2359 }
2360
2361 void nf_conntrack_cleanup_start(void)
2362 {
2363 conntrack_gc_work.exiting = true;
2364 RCU_INIT_POINTER(ip_ct_attach, NULL);
2365 }
2366
2367 void nf_conntrack_cleanup_end(void)
2368 {
2369 RCU_INIT_POINTER(nf_ct_hook, NULL);
2370 cancel_delayed_work_sync(&conntrack_gc_work.dwork);
2371 kvfree(nf_conntrack_hash);
2372
2373 nf_conntrack_proto_fini();
2374 nf_conntrack_seqadj_fini();
2375 nf_conntrack_labels_fini();
2376 nf_conntrack_helper_fini();
2377 nf_conntrack_timeout_fini();
2378 nf_conntrack_ecache_fini();
2379 nf_conntrack_tstamp_fini();
2380 nf_conntrack_acct_fini();
2381 nf_conntrack_expect_fini();
2382
2383 kmem_cache_destroy(nf_conntrack_cachep);
2384 }
2385
2386 /*
2387 * Mishearing the voices in his head, our hero wonders how he's
2388 * supposed to kill the mall.
2389 */
2390 void nf_conntrack_cleanup_net(struct net *net)
2391 {
2392 LIST_HEAD(single);
2393
2394 list_add(&net->exit_list, &single);
2395 nf_conntrack_cleanup_net_list(&single);
2396 }
2397
2398 void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
2399 {
2400 int busy;
2401 struct net *net;
2402
2403 /*
2404 * This makes sure all current packets have passed through
2405 * the netfilter framework. Roll on, two-stage module
2406 * delete...
2407 */
2408 synchronize_net();
2409 i_see_dead_people:
2410 busy = 0;
2411 list_for_each_entry(net, net_exit_list, exit_list) {
2412 nf_ct_iterate_cleanup(kill_all, net, 0, 0);
2413 if (atomic_read(&net->ct.count) != 0)
2414 busy = 1;
2415 }
2416 if (busy) {
2417 schedule();
2418 goto i_see_dead_people;
2419 }
2420
2421 list_for_each_entry(net, net_exit_list, exit_list) {
2422 nf_conntrack_proto_pernet_fini(net);
2423 nf_conntrack_ecache_pernet_fini(net);
2424 nf_conntrack_expect_pernet_fini(net);
2425 free_percpu(net->ct.stat);
2426 free_percpu(net->ct.pcpu_lists);
2427 }
2428 }
2429
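/* Allocate a bucket array for at least *sizep entries, rounded up so the
 * buckets fill whole pages; the actual slot count is written back to
 * *sizep. With @nulls set, each list head gets its bucket index as the
 * nulls marker, letting lockless lookups detect that an entry moved to
 * another chain.
 */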
2430 void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
2431 {
2432 struct hlist_nulls_head *hash;
2433 unsigned int nr_slots, i;
2434
2435 if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
2436 return NULL;
2437
2438 BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
2439 nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
2440
2441 hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL);
2442
2443 if (hash && nulls)
2444 for (i = 0; i < nr_slots; i++)
2445 INIT_HLIST_NULLS_HEAD(&hash[i], i);
2446
2447 return hash;
2448 }
2449 EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
2450
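/* Resize the global table: allocate the new bucket array, rehash every
 * entry into it under nf_conntrack_all_lock() with the generation
 * seqcount write-held, then publish the new table and free the old one
 * after an RCU grace period.
 */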
2451 int nf_conntrack_hash_resize(unsigned int hashsize)
2452 {
2453 int i, bucket;
2454 unsigned int old_size;
2455 struct hlist_nulls_head *hash, *old_hash;
2456 struct nf_conntrack_tuple_hash *h;
2457 struct nf_conn *ct;
2458
2459 if (!hashsize)
2460 return -EINVAL;
2461
2462 hash = nf_ct_alloc_hashtable(&hashsize, 1);
2463 if (!hash)
2464 return -ENOMEM;
2465
2466 mutex_lock(&nf_conntrack_mutex);
2467 old_size = nf_conntrack_htable_size;
2468 if (old_size == hashsize) {
2469 mutex_unlock(&nf_conntrack_mutex);
2470 kvfree(hash);
2471 return 0;
2472 }
2473
2474 local_bh_disable();
2475 nf_conntrack_all_lock();
2476 write_seqcount_begin(&nf_conntrack_generation);
2477
2478 /* Lookups in the old hash might happen in parallel, which means we
2479 * might get false negatives during connection lookup. New connections
2480 * created because of a false negative won't make it into the hash
2481 * though, since that requires taking the locks.
2482 */
2483
2484 for (i = 0; i < nf_conntrack_htable_size; i++) {
2485 while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
2486 h = hlist_nulls_entry(nf_conntrack_hash[i].first,
2487 struct nf_conntrack_tuple_hash, hnnode);
2488 ct = nf_ct_tuplehash_to_ctrack(h);
2489 hlist_nulls_del_rcu(&h->hnnode);
2490 bucket = __hash_conntrack(nf_ct_net(ct),
2491 &h->tuple, hashsize);
2492 hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
2493 }
2494 }
2495 old_size = nf_conntrack_htable_size;
2496 old_hash = nf_conntrack_hash;
2497
2498 nf_conntrack_hash = hash;
2499 nf_conntrack_htable_size = hashsize;
2500
2501 write_seqcount_end(&nf_conntrack_generation);
2502 nf_conntrack_all_unlock();
2503 local_bh_enable();
2504
2505 mutex_unlock(&nf_conntrack_mutex);
2506
2507 synchronize_net();
2508 kvfree(old_hash);
2509 return 0;
2510 }
2511
2512 int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
2513 {
2514 unsigned int hashsize;
2515 int rc;
2516
2517 if (current->nsproxy->net_ns != &init_net)
2518 return -EOPNOTSUPP;
2519
2520 /* On boot, we can set this without any fancy locking. */
2521 if (!nf_conntrack_hash)
2522 return param_set_uint(val, kp);
2523
2524 rc = kstrtouint(val, 0, &hashsize);
2525 if (rc)
2526 return rc;
2527
2528 return nf_conntrack_hash_resize(hashsize);
2529 }
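
/* The module_param_call() wiring is outside this excerpt, but this setter
 * is what typically backs the nf_conntrack "hashsize" parameter, so on
 * most setups the table can be resized at runtime from the init netns:
 *
 *	echo 262144 > /sys/module/nf_conntrack/parameters/hashsize
 */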
2530
2531 static __always_inline unsigned int total_extension_size(void)
2532 {
2533 /* remember to add new extensions below */
2534 BUILD_BUG_ON(NF_CT_EXT_NUM > 9);
2535
2536 return sizeof(struct nf_ct_ext) +
2537 sizeof(struct nf_conn_help)
2538 #if IS_ENABLED(CONFIG_NF_NAT)
2539 + sizeof(struct nf_conn_nat)
2540 #endif
2541 + sizeof(struct nf_conn_seqadj)
2542 + sizeof(struct nf_conn_acct)
2543 #ifdef CONFIG_NF_CONNTRACK_EVENTS
2544 + sizeof(struct nf_conntrack_ecache)
2545 #endif
2546 #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
2547 + sizeof(struct nf_conn_tstamp)
2548 #endif
2549 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
2550 + sizeof(struct nf_conn_timeout)
2551 #endif
2552 #ifdef CONFIG_NF_CONNTRACK_LABELS
2553 + sizeof(struct nf_conn_labels)
2554 #endif
2555 #if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
2556 + sizeof(struct nf_conn_synproxy)
2557 #endif
2558 ;
2559 }
2560
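/* One-time global initialization: size the hash table from available
 * memory (unless a size was given), allocate the bucket array and the
 * nf_conn slab cache, bring up all extension subsystems and kick off the
 * periodic garbage-collection work.
 */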
2561 int nf_conntrack_init_start(void)
2562 {
2563 unsigned long nr_pages = totalram_pages();
2564 int max_factor = 8;
2565 int ret = -ENOMEM;
2566 int i;
2567
2568 /* struct nf_ct_ext uses u8 to store offsets/size */
2569 BUILD_BUG_ON(total_extension_size() > 255u);
2570
2571 seqcount_spinlock_init(&nf_conntrack_generation,
2572 &nf_conntrack_locks_all_lock);
2573
2574 for (i = 0; i < CONNTRACK_LOCKS; i++)
2575 spin_lock_init(&nf_conntrack_locks[i]);
2576
2577 if (!nf_conntrack_htable_size) {
2578 /* Idea from tcp.c: use 1/16384 of memory.
2579 * On i386: 32MB machine has 512 buckets.
2580 * >= 1GB machines have 16384 buckets.
2581 * >= 4GB machines have 65536 buckets.
2582 */
2583 nf_conntrack_htable_size
2584 = (((nr_pages << PAGE_SHIFT) / 16384)
2585 / sizeof(struct hlist_head));
2586 if (nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
2587 nf_conntrack_htable_size = 65536;
2588 else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
2589 nf_conntrack_htable_size = 16384;
2590 if (nf_conntrack_htable_size < 32)
2591 nf_conntrack_htable_size = 32;
2592
2593 /* Use a max. factor of four by default to get the same max as
2594 * with the old struct list_heads. When a table size is given
2595 * we use the old value of 8 to avoid reducing the max.
2596 * entries. */
2597 max_factor = 4;
2598 }
2599
2600 nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
2601 if (!nf_conntrack_hash)
2602 return -ENOMEM;
2603
2604 nf_conntrack_max = max_factor * nf_conntrack_htable_size;
2605
2606 nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
2607 sizeof(struct nf_conn),
2608 NFCT_INFOMASK + 1,
2609 SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
2610 if (!nf_conntrack_cachep)
2611 goto err_cachep;
2612
2613 ret = nf_conntrack_expect_init();
2614 if (ret < 0)
2615 goto err_expect;
2616
2617 ret = nf_conntrack_acct_init();
2618 if (ret < 0)
2619 goto err_acct;
2620
2621 ret = nf_conntrack_tstamp_init();
2622 if (ret < 0)
2623 goto err_tstamp;
2624
2625 ret = nf_conntrack_ecache_init();
2626 if (ret < 0)
2627 goto err_ecache;
2628
2629 ret = nf_conntrack_timeout_init();
2630 if (ret < 0)
2631 goto err_timeout;
2632
2633 ret = nf_conntrack_helper_init();
2634 if (ret < 0)
2635 goto err_helper;
2636
2637 ret = nf_conntrack_labels_init();
2638 if (ret < 0)
2639 goto err_labels;
2640
2641 ret = nf_conntrack_seqadj_init();
2642 if (ret < 0)
2643 goto err_seqadj;
2644
2645 ret = nf_conntrack_proto_init();
2646 if (ret < 0)
2647 goto err_proto;
2648
2649 conntrack_gc_work_init(&conntrack_gc_work);
2650 queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);
2651
2652 return 0;
2653
2654 err_proto:
2655 nf_conntrack_seqadj_fini();
2656 err_seqadj:
2657 nf_conntrack_labels_fini();
2658 err_labels:
2659 nf_conntrack_helper_fini();
2660 err_helper:
2661 nf_conntrack_timeout_fini();
2662 err_timeout:
2663 nf_conntrack_ecache_fini();
2664 err_ecache:
2665 nf_conntrack_tstamp_fini();
2666 err_tstamp:
2667 nf_conntrack_acct_fini();
2668 err_acct:
2669 nf_conntrack_expect_fini();
2670 err_expect:
2671 kmem_cache_destroy(nf_conntrack_cachep);
2672 err_cachep:
2673 kvfree(nf_conntrack_hash);
2674 return ret;
2675 }
2676
2677 static struct nf_ct_hook nf_conntrack_hook = {
2678 .update = nf_conntrack_update,
2679 .destroy = destroy_conntrack,
2680 .get_tuple_skb = nf_conntrack_get_tuple_skb,
2681 };
2682
2683 void nf_conntrack_init_end(void)
2684 {
2685 /* For use by REJECT target */
2686 RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
2687 RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook);
2688 }
2689
2690 /*
2691 * We need to use special "null" values, not used in hash table
2692 */
2693 #define UNCONFIRMED_NULLS_VAL ((1<<30)+0)
2694 #define DYING_NULLS_VAL ((1<<30)+1)
2695
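/* Per-netns setup: the percpu unconfirmed/dying lists (tagged with the
 * special nulls values above), percpu statistics, and the per-netns state
 * of the expectation and other extension subsystems.
 */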
2696 int nf_conntrack_init_net(struct net *net)
2697 {
2698 int ret = -ENOMEM;
2699 int cpu;
2700
2701 BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
2702 BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS);
2703 atomic_set(&net->ct.count, 0);
2704
2705 net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
2706 if (!net->ct.pcpu_lists)
2707 goto err_stat;
2708
2709 for_each_possible_cpu(cpu) {
2710 struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
2711
2712 spin_lock_init(&pcpu->lock);
2713 INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
2714 INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
2715 }
2716
2717 net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
2718 if (!net->ct.stat)
2719 goto err_pcpu_lists;
2720
2721 ret = nf_conntrack_expect_pernet_init(net);
2722 if (ret < 0)
2723 goto err_expect;
2724
2725 nf_conntrack_acct_pernet_init(net);
2726 nf_conntrack_tstamp_pernet_init(net);
2727 nf_conntrack_ecache_pernet_init(net);
2728 nf_conntrack_helper_pernet_init(net);
2729 nf_conntrack_proto_pernet_init(net);
2730
2731 return 0;
2732
2733 err_expect:
2734 free_percpu(net->ct.stat);
2735 err_pcpu_lists:
2736 free_percpu(net->ct.pcpu_lists);
2737 err_stat:
2738 return ret;
2739 }
2740