1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19 /*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/timewait_sock.h>
70 #include <net/xfrm.h>
71 #include <net/secure_seq.h>
72 #include <net/busy_poll.h>
73
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 #include <linux/inetdevice.h>
80 #include <linux/btf_ids.h>
81
82 #include <crypto/hash.h>
83 #include <linux/scatterlist.h>
84
85 #include <trace/events/tcp.h>
86 #include <trace/hooks/net.h>
87
88 #ifdef CONFIG_TCP_MD5SIG
89 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
90 __be32 daddr, __be32 saddr, const struct tcphdr *th);
91 #endif
92
93 struct inet_hashinfo tcp_hashinfo;
94 EXPORT_SYMBOL(tcp_hashinfo);
95
96 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
97
98 static DEFINE_MUTEX(tcp_exit_batch_mutex);
99
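/* Derive the initial sequence number for a new connection from the incoming
 * packet's address/port 4-tuple. secure_tcp_seq() mixes these with a
 * boot-time secret (a keyed hash), so the ISN should be hard to predict for
 * an off-path attacker; tcp_v4_init_ts_off() below does the same for the
 * per-connection timestamp offset.
 */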
100 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
101 {
102 return secure_tcp_seq(ip_hdr(skb)->daddr,
103 ip_hdr(skb)->saddr,
104 tcp_hdr(skb)->dest,
105 tcp_hdr(skb)->source);
106 }
107
108 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
109 {
110 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
111 }
112
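/* Decide whether a connecting socket may reuse a 4-tuple currently held by a
 * TIME-WAIT socket. Roughly: the TIME-WAIT entry must carry a recent
 * timestamp, and the net.ipv4.tcp_tw_reuse sysctl must permit reuse
 * (1 = any connection, 2 = loopback traffic only); PAWS timestamps then keep
 * the old and new incarnations apart, as the comment inside explains.
 */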
113 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
114 {
115 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
116 const struct inet_timewait_sock *tw = inet_twsk(sktw);
117 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
118 struct tcp_sock *tp = tcp_sk(sk);
119
120 if (reuse == 2) {
121 /* Still does not detect *everything* that goes through
122 * lo, since we require a loopback src or dst address
123 * or direct binding to 'lo' interface.
124 */
125 bool loopback = false;
126 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
127 loopback = true;
128 #if IS_ENABLED(CONFIG_IPV6)
129 if (tw->tw_family == AF_INET6) {
130 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
131 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
132 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
133 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
134 loopback = true;
135 } else
136 #endif
137 {
138 if (ipv4_is_loopback(tw->tw_daddr) ||
139 ipv4_is_loopback(tw->tw_rcv_saddr))
140 loopback = true;
141 }
142 if (!loopback)
143 reuse = 0;
144 }
145
146 /* With PAWS, it is safe from the viewpoint
147 of data integrity. Even without PAWS it is safe provided sequence
148 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
149
150 Actually, the idea is close to VJ's: only the timestamp cache is
151 held not per host but per port pair, and the TW bucket is used as the
152 state holder.
153
154 If the TW bucket has already been destroyed we fall back to VJ's scheme
155 and use the initial timestamp retrieved from the peer table.
156 */
157 if (tcptw->tw_ts_recent_stamp &&
158 (!twp || (reuse && time_after32(ktime_get_seconds(),
159 tcptw->tw_ts_recent_stamp)))) {
160 /* inet_twsk_hashdance() sets sk_refcnt after putting twsk
161 * and releasing the bucket lock.
162 */
163 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
164 return 0;
165
166 /* In case of repair and re-using TIME-WAIT sockets we still
167 * want to be sure that it is safe as above but honor the
168 * sequence numbers and time stamps set as part of the repair
169 * process.
170 *
171 * Without this check re-using a TIME-WAIT socket with TCP
172 * repair would accumulate a -1 on the repair assigned
173 * sequence number. The first time it is reused the sequence
174 * is -1, the second time -2, etc. This fixes that issue
175 * without appearing to create any others.
176 */
177 if (likely(!tp->repair)) {
178 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
179
180 if (!seq)
181 seq = 1;
182 WRITE_ONCE(tp->write_seq, seq);
183 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
184 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
185 }
186
187 return 1;
188 }
189
190 return 0;
191 }
192 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
193
194 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
195 int addr_len)
196 {
197 /* This check is replicated from tcp_v4_connect() and intended to
198 * prevent BPF program called below from accessing bytes that are out
199 * of the bound specified by user in addr_len.
200 */
201 if (addr_len < sizeof(struct sockaddr_in))
202 return -EINVAL;
203
204 sock_owned_by_me(sk);
205
206 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
207 }
208
209 /* This will initiate an outgoing connection. */
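/* In outline: validate the destination, resolve a route with
 * ip_route_connect(), pick a source address and port (inet_bhash2_update_saddr()
 * / inet_hash_connect()), seed write_seq and the timestamp offset from the
 * secure helpers, and finally emit the SYN via tcp_connect() (or defer it for
 * TCP Fast Open). Any failure rolls the socket back to TCP_CLOSE.
 */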
210 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
211 {
212 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
213 struct inet_timewait_death_row *tcp_death_row;
214 struct inet_sock *inet = inet_sk(sk);
215 struct tcp_sock *tp = tcp_sk(sk);
216 struct ip_options_rcu *inet_opt;
217 struct net *net = sock_net(sk);
218 __be16 orig_sport, orig_dport;
219 __be32 daddr, nexthop;
220 struct flowi4 *fl4;
221 struct rtable *rt;
222 int err;
223
224 if (addr_len < sizeof(struct sockaddr_in))
225 return -EINVAL;
226
227 if (usin->sin_family != AF_INET)
228 return -EAFNOSUPPORT;
229
230 trace_android_vh_tcp_v4_connect(sk, uaddr);
231
232 nexthop = daddr = usin->sin_addr.s_addr;
233 inet_opt = rcu_dereference_protected(inet->inet_opt,
234 lockdep_sock_is_held(sk));
235 if (inet_opt && inet_opt->opt.srr) {
236 if (!daddr)
237 return -EINVAL;
238 nexthop = inet_opt->opt.faddr;
239 }
240
241 orig_sport = inet->inet_sport;
242 orig_dport = usin->sin_port;
243 fl4 = &inet->cork.fl.u.ip4;
244 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
245 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
246 orig_dport, sk);
247 if (IS_ERR(rt)) {
248 err = PTR_ERR(rt);
249 if (err == -ENETUNREACH)
250 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
251 return err;
252 }
253
254 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
255 ip_rt_put(rt);
256 return -ENETUNREACH;
257 }
258
259 if (!inet_opt || !inet_opt->opt.srr)
260 daddr = fl4->daddr;
261
262 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
263
264 if (!inet->inet_saddr) {
265 err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
266 if (err) {
267 ip_rt_put(rt);
268 return err;
269 }
270 } else {
271 sk_rcv_saddr_set(sk, inet->inet_saddr);
272 }
273
274 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
275 /* Reset inherited state */
276 tp->rx_opt.ts_recent = 0;
277 tp->rx_opt.ts_recent_stamp = 0;
278 if (likely(!tp->repair))
279 WRITE_ONCE(tp->write_seq, 0);
280 }
281
282 inet->inet_dport = usin->sin_port;
283 sk_daddr_set(sk, daddr);
284
285 inet_csk(sk)->icsk_ext_hdr_len = 0;
286 if (inet_opt)
287 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
288
289 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
290
291 /* Socket identity is still unknown (sport may be zero).
292 * However we set the state to SYN-SENT and, without releasing the socket
293 * lock, select a source port, enter ourselves into the hash tables and
294 * complete initialization after this.
295 */
296 tcp_set_state(sk, TCP_SYN_SENT);
297 err = inet_hash_connect(tcp_death_row, sk);
298 if (err)
299 goto failure;
300
301 sk_set_txhash(sk);
302
303 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
304 inet->inet_sport, inet->inet_dport, sk);
305 if (IS_ERR(rt)) {
306 err = PTR_ERR(rt);
307 rt = NULL;
308 goto failure;
309 }
310 /* OK, now commit destination to socket. */
311 sk->sk_gso_type = SKB_GSO_TCPV4;
312 sk_setup_caps(sk, &rt->dst);
313 rt = NULL;
314
315 if (likely(!tp->repair)) {
316 if (!tp->write_seq)
317 WRITE_ONCE(tp->write_seq,
318 secure_tcp_seq(inet->inet_saddr,
319 inet->inet_daddr,
320 inet->inet_sport,
321 usin->sin_port));
322 WRITE_ONCE(tp->tsoffset,
323 secure_tcp_ts_off(net, inet->inet_saddr,
324 inet->inet_daddr));
325 }
326
327 atomic_set(&inet->inet_id, get_random_u16());
328
329 if (tcp_fastopen_defer_connect(sk, &err))
330 return err;
331 if (err)
332 goto failure;
333
334 err = tcp_connect(sk);
335
336 if (err)
337 goto failure;
338
339 return 0;
340
341 failure:
342 /*
343 * This unhashes the socket and releases the local port,
344 * if necessary.
345 */
346 tcp_set_state(sk, TCP_CLOSE);
347 inet_bhash2_reset_saddr(sk);
348 ip_rt_put(rt);
349 sk->sk_route_caps = 0;
350 inet->inet_dport = 0;
351 return err;
352 }
353 EXPORT_SYMBOL(tcp_v4_connect);
354
355 /*
356 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
357 * It can be called through tcp_release_cb() if the socket was owned by the
358 * user at the time tcp_v4_err() was called to handle the ICMP message.
359 */
360 void tcp_v4_mtu_reduced(struct sock *sk)
361 {
362 struct inet_sock *inet = inet_sk(sk);
363 struct dst_entry *dst;
364 u32 mtu;
365
366 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
367 return;
368 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
369 dst = inet_csk_update_pmtu(sk, mtu);
370 if (!dst)
371 return;
372
373 /* Something is about to go wrong... Remember the soft error
374 * in case this connection is not able to recover.
375 */
376 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
377 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
378
379 mtu = dst_mtu(dst);
380
381 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
382 ip_sk_accept_pmtu(sk) &&
383 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
384 tcp_sync_mss(sk, mtu);
385
386 /* Resend the TCP packet because it's
387 * clear that the old packet has been
388 * dropped. This is the new "fast" path mtu
389 * discovery.
390 */
391 tcp_simple_retransmit(sk);
392 } /* else let the usual retransmit timer handle it */
393 }
394 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
395
396 static void do_redirect(struct sk_buff *skb, struct sock *sk)
397 {
398 struct dst_entry *dst = __sk_dst_check(sk, 0);
399
400 if (dst)
401 dst->ops->redirect(dst, sk, skb);
402 }
403
404
405 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
406 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
407 {
408 struct request_sock *req = inet_reqsk(sk);
409 struct net *net = sock_net(sk);
410
411 /* ICMPs are not backlogged, hence we cannot get
412 * an established socket here.
413 */
414 if (seq != tcp_rsk(req)->snt_isn) {
415 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
416 } else if (abort) {
417 /*
418 * Still in SYN_RECV, just remove it silently.
419 * There is no good way to pass the error to the newly
420 * created socket, and POSIX does not want network
421 * errors returned from accept().
422 */
423 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
424 tcp_listendrop(req->rsk_listener);
425 }
426 reqsk_put(req);
427 }
428 EXPORT_SYMBOL(tcp_req_err);
429
430 /* TCP-LD (RFC 6069) logic */
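/* If an ICMP unreachable refers to the oldest unacknowledged segment, RFC 6069
 * treats the preceding RTO as a sign of a (possibly transient) routing problem
 * rather than congestion: undo one step of exponential backoff, recompute the
 * RTO, and either re-arm the retransmit timer or retransmit immediately.
 */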
431 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
432 {
433 struct inet_connection_sock *icsk = inet_csk(sk);
434 struct tcp_sock *tp = tcp_sk(sk);
435 struct sk_buff *skb;
436 s32 remaining;
437 u32 delta_us;
438
439 if (sock_owned_by_user(sk))
440 return;
441
442 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
443 !icsk->icsk_backoff)
444 return;
445
446 skb = tcp_rtx_queue_head(sk);
447 if (WARN_ON_ONCE(!skb))
448 return;
449
450 icsk->icsk_backoff--;
451 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
452 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
453
454 tcp_mstamp_refresh(tp);
455 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
456 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
457
458 if (remaining > 0) {
459 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
460 remaining, TCP_RTO_MAX);
461 } else {
462 /* RTO revert clocked out retransmission.
463 * Will retransmit now.
464 */
465 tcp_retransmit_timer(sk);
466 }
467 }
468 EXPORT_SYMBOL(tcp_ld_RTO_revert);
469
470 /*
471 * This routine is called by the ICMP module when it gets some
472 * sort of error condition. If err < 0 then the socket should
473 * be closed and the error returned to the user. If err > 0
474 * it's just the icmp type << 8 | icmp code. After adjustment the
475 * header points to the first 8 bytes of the tcp header. We need
476 * to find the appropriate port.
477 *
478 * The locking strategy used here is very "optimistic". When
479 * someone else accesses the socket the ICMP is just dropped,
480 * and for some paths there is no check at all.
481 * A more general error queue for queueing errors for later handling
482 * would probably be better.
483 *
484 */
485
486 int tcp_v4_err(struct sk_buff *skb, u32 info)
487 {
488 const struct iphdr *iph = (const struct iphdr *)skb->data;
489 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
490 struct tcp_sock *tp;
491 const int type = icmp_hdr(skb)->type;
492 const int code = icmp_hdr(skb)->code;
493 struct sock *sk;
494 struct request_sock *fastopen;
495 u32 seq, snd_una;
496 int err;
497 struct net *net = dev_net(skb->dev);
498
499 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
500 iph->daddr, th->dest, iph->saddr,
501 ntohs(th->source), inet_iif(skb), 0);
502 if (!sk) {
503 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
504 return -ENOENT;
505 }
506 if (sk->sk_state == TCP_TIME_WAIT) {
507 inet_twsk_put(inet_twsk(sk));
508 return 0;
509 }
510 seq = ntohl(th->seq);
511 if (sk->sk_state == TCP_NEW_SYN_RECV) {
512 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
513 type == ICMP_TIME_EXCEEDED ||
514 (type == ICMP_DEST_UNREACH &&
515 (code == ICMP_NET_UNREACH ||
516 code == ICMP_HOST_UNREACH)));
517 return 0;
518 }
519
520 bh_lock_sock(sk);
521 /* If too many ICMPs get dropped on busy
522 * servers this needs to be solved differently.
523 * We do take care of the PMTU discovery (RFC1191) special case:
524 * we can receive locally generated ICMP messages while the socket is held.
525 */
526 if (sock_owned_by_user(sk)) {
527 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
528 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
529 }
530 if (sk->sk_state == TCP_CLOSE)
531 goto out;
532
533 if (static_branch_unlikely(&ip4_min_ttl)) {
534 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
535 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
536 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
537 goto out;
538 }
539 }
540
541 tp = tcp_sk(sk);
542 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
543 fastopen = rcu_dereference(tp->fastopen_rsk);
544 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
545 if (sk->sk_state != TCP_LISTEN &&
546 !between(seq, snd_una, tp->snd_nxt)) {
547 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
548 goto out;
549 }
550
551 switch (type) {
552 case ICMP_REDIRECT:
553 if (!sock_owned_by_user(sk))
554 do_redirect(skb, sk);
555 goto out;
556 case ICMP_SOURCE_QUENCH:
557 /* Just silently ignore these. */
558 goto out;
559 case ICMP_PARAMETERPROB:
560 err = EPROTO;
561 break;
562 case ICMP_DEST_UNREACH:
563 if (code > NR_ICMP_UNREACH)
564 goto out;
565
566 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
567 /* We are not interested in TCP_LISTEN and open_requests
568 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
569 * they should go through unfragmented).
570 */
571 if (sk->sk_state == TCP_LISTEN)
572 goto out;
573
574 WRITE_ONCE(tp->mtu_info, info);
575 if (!sock_owned_by_user(sk)) {
576 tcp_v4_mtu_reduced(sk);
577 } else {
578 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
579 sock_hold(sk);
580 }
581 goto out;
582 }
583
584 err = icmp_err_convert[code].errno;
585 /* check if this ICMP message allows revert of backoff.
586 * (see RFC 6069)
587 */
588 if (!fastopen &&
589 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
590 tcp_ld_RTO_revert(sk, seq);
591 break;
592 case ICMP_TIME_EXCEEDED:
593 err = EHOSTUNREACH;
594 break;
595 default:
596 goto out;
597 }
598
599 switch (sk->sk_state) {
600 case TCP_SYN_SENT:
601 case TCP_SYN_RECV:
602 /* Only in fast or simultaneous open. If a fast open socket is
603 * already accepted it is treated as a connected one below.
604 */
605 if (fastopen && !fastopen->sk)
606 break;
607
608 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
609
610 if (!sock_owned_by_user(sk))
611 tcp_done_with_error(sk, err);
612 else
613 WRITE_ONCE(sk->sk_err_soft, err);
614 goto out;
615 }
616
617 /* If we've already connected we will keep trying
618 * until we time out, or the user gives up.
619 *
620 * RFC 1122 4.2.3.9 allows us to consider as hard errors
621 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
622 * but it is obsoleted by pmtu discovery).
623 *
624 * Note that in the modern internet, where routing is unreliable
625 * and broken firewalls sit in every dark corner sending random
626 * errors ordered by their masters, even these two messages finally lose
627 * their original sense (even Linux sends invalid PORT_UNREACHs).
628 *
629 * Now we are in compliance with RFCs.
630 * --ANK (980905)
631 */
632
633 if (!sock_owned_by_user(sk) &&
634 inet_test_bit(RECVERR, sk)) {
635 WRITE_ONCE(sk->sk_err, err);
636 sk_error_report(sk);
637 } else { /* Only an error on timeout */
638 WRITE_ONCE(sk->sk_err_soft, err);
639 }
640
641 out:
642 bh_unlock_sock(sk);
643 sock_put(sk);
644 return 0;
645 }
646
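/* Prepare an outgoing skb for checksum offload: seed th->check with the
 * pseudo-header sum and record, via csum_start/csum_offset, where the NIC
 * (or the software fallback) must complete the checksum.
 */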
647 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
648 {
649 struct tcphdr *th = tcp_hdr(skb);
650
651 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
652 skb->csum_start = skb_transport_header(skb) - skb->head;
653 skb->csum_offset = offsetof(struct tcphdr, check);
654 }
655
656 /* This routine computes an IPv4 TCP checksum. */
657 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
658 {
659 const struct inet_sock *inet = inet_sk(sk);
660
661 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
662 }
663 EXPORT_SYMBOL(tcp_v4_send_check);
664
665 /*
666 * This routine will send an RST to the other tcp.
667 *
668 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
669 * for the reset?
670 * Answer: if a packet caused the RST, it is not for a socket
671 * existing in our system; if it is matched to a socket,
672 * it is just a duplicate segment or a bug in the other side's TCP.
673 * So we build the reply based only on the parameters
674 * that arrived with the segment.
675 * Exception: precedence violation. We do not implement it in any case.
676 */
677
678 #ifdef CONFIG_TCP_MD5SIG
679 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
680 #else
681 #define OPTION_BYTES sizeof(__be32)
682 #endif
683
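/* Build and send an RST in reply to a segment that matched no (usable)
 * socket. Sequence numbers follow the RFC 793 reset rules: if the offending
 * segment carried an ACK, the RST reuses its ack_seq as its own seq;
 * otherwise the RST ACKs everything the segment occupied. An MD5 signature
 * or an MPTCP reset option may be appended, and the reply is sent from the
 * per-cpu control socket.
 */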
684 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
685 {
686 const struct tcphdr *th = tcp_hdr(skb);
687 struct {
688 struct tcphdr th;
689 __be32 opt[OPTION_BYTES / sizeof(__be32)];
690 } rep;
691 struct ip_reply_arg arg;
692 #ifdef CONFIG_TCP_MD5SIG
693 struct tcp_md5sig_key *key = NULL;
694 const __u8 *hash_location = NULL;
695 unsigned char newhash[16];
696 int genhash;
697 struct sock *sk1 = NULL;
698 #endif
699 u64 transmit_time = 0;
700 struct sock *ctl_sk;
701 struct net *net;
702 u32 txhash = 0;
703
704 /* Never send a reset in response to a reset. */
705 if (th->rst)
706 return;
707
708 /* If sk is not NULL, it means we did a successful lookup and the incoming
709 * route had to be correct. prequeue might have dropped our dst.
710 */
711 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
712 return;
713
714 /* Swap the send and the receive. */
715 memset(&rep, 0, sizeof(rep));
716 rep.th.dest = th->source;
717 rep.th.source = th->dest;
718 rep.th.doff = sizeof(struct tcphdr) / 4;
719 rep.th.rst = 1;
720
721 if (th->ack) {
722 rep.th.seq = th->ack_seq;
723 } else {
724 rep.th.ack = 1;
725 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
726 skb->len - (th->doff << 2));
727 }
728
729 memset(&arg, 0, sizeof(arg));
730 arg.iov[0].iov_base = (unsigned char *)&rep;
731 arg.iov[0].iov_len = sizeof(rep.th);
732
733 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
734 #ifdef CONFIG_TCP_MD5SIG
735 rcu_read_lock();
736 hash_location = tcp_parse_md5sig_option(th);
737 if (sk && sk_fullsock(sk)) {
738 const union tcp_md5_addr *addr;
739 int l3index;
740
741 /* sdif set, means packet ingressed via a device
742 * in an L3 domain and inet_iif is set to it.
743 */
744 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
745 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
746 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
747 } else if (hash_location) {
748 const union tcp_md5_addr *addr;
749 int sdif = tcp_v4_sdif(skb);
750 int dif = inet_iif(skb);
751 int l3index;
752
753 /*
754 * The active side is lost. Try to find the listening socket through
755 * the source port, and then find the md5 key through the listening socket.
756 * We are not loosening security here:
757 * the incoming packet is checked against the md5 hash of the found key,
758 * and no RST is generated if the md5 hash doesn't match.
759 */
760 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
761 NULL, 0, ip_hdr(skb)->saddr,
762 th->source, ip_hdr(skb)->daddr,
763 ntohs(th->source), dif, sdif);
764 /* don't send rst if it can't find key */
765 if (!sk1)
766 goto out;
767
768 /* sdif set, means packet ingressed via a device
769 * in an L3 domain and dif is set to it.
770 */
771 l3index = sdif ? dif : 0;
772 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
773 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
774 if (!key)
775 goto out;
776
777
778 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
779 if (genhash || memcmp(hash_location, newhash, 16) != 0)
780 goto out;
781
782 }
783
784 if (key) {
785 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
786 (TCPOPT_NOP << 16) |
787 (TCPOPT_MD5SIG << 8) |
788 TCPOLEN_MD5SIG);
789 /* Update length and the length the header thinks exists */
790 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
791 rep.th.doff = arg.iov[0].iov_len / 4;
792
793 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
794 key, ip_hdr(skb)->saddr,
795 ip_hdr(skb)->daddr, &rep.th);
796 }
797 #endif
798 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
799 if (rep.opt[0] == 0) {
800 __be32 mrst = mptcp_reset_option(skb);
801
802 if (mrst) {
803 rep.opt[0] = mrst;
804 arg.iov[0].iov_len += sizeof(mrst);
805 rep.th.doff = arg.iov[0].iov_len / 4;
806 }
807 }
808
809 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
810 ip_hdr(skb)->saddr, /* XXX */
811 arg.iov[0].iov_len, IPPROTO_TCP, 0);
812 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
813 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
814
815 /* When the socket is gone, all binding information is lost and
816 * routing might fail in this case. No choice here: if we choose to force
817 * the input interface, we will misroute in case of an asymmetric route.
818 */
819 if (sk) {
820 arg.bound_dev_if = sk->sk_bound_dev_if;
821 if (sk_fullsock(sk))
822 trace_tcp_send_reset(sk, skb);
823 }
824
825 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
826 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
827
828 arg.tos = ip_hdr(skb)->tos;
829 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
830 local_bh_disable();
831 ctl_sk = this_cpu_read(ipv4_tcp_sk);
832 sock_net_set(ctl_sk, net);
833 if (sk) {
834 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
835 inet_twsk(sk)->tw_mark : sk->sk_mark;
836 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
837 inet_twsk(sk)->tw_priority : sk->sk_priority;
838 transmit_time = tcp_transmit_time(sk);
839 xfrm_sk_clone_policy(ctl_sk, sk);
840 txhash = (sk->sk_state == TCP_TIME_WAIT) ?
841 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
842 } else {
843 ctl_sk->sk_mark = 0;
844 ctl_sk->sk_priority = 0;
845 }
846 ip_send_unicast_reply(ctl_sk,
847 skb, &TCP_SKB_CB(skb)->header.h4.opt,
848 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
849 &arg, arg.iov[0].iov_len,
850 transmit_time, txhash);
851
852 xfrm_sk_free_policy(ctl_sk);
853 sock_net_set(ctl_sk, &init_net);
854 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
855 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
856 local_bh_enable();
857
858 #ifdef CONFIG_TCP_MD5SIG
859 out:
860 rcu_read_unlock();
861 #endif
862 }
863
864 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
865 outside of socket context, is certainly ugly. What can I do?
866 */
867
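/* Build a bare ACK, optionally carrying timestamp and MD5 options, and send
 * it from the per-cpu control socket. This serves TIME-WAIT sockets and
 * request sockets, which have no full socket context of their own.
 */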
868 static void tcp_v4_send_ack(const struct sock *sk,
869 struct sk_buff *skb, u32 seq, u32 ack,
870 u32 win, u32 tsval, u32 tsecr, int oif,
871 struct tcp_md5sig_key *key,
872 int reply_flags, u8 tos, u32 txhash)
873 {
874 const struct tcphdr *th = tcp_hdr(skb);
875 struct {
876 struct tcphdr th;
877 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
878 #ifdef CONFIG_TCP_MD5SIG
879 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
880 #endif
881 ];
882 } rep;
883 struct net *net = sock_net(sk);
884 struct ip_reply_arg arg;
885 struct sock *ctl_sk;
886 u64 transmit_time;
887
888 memset(&rep.th, 0, sizeof(struct tcphdr));
889 memset(&arg, 0, sizeof(arg));
890
891 arg.iov[0].iov_base = (unsigned char *)&rep;
892 arg.iov[0].iov_len = sizeof(rep.th);
893 if (tsecr) {
894 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
895 (TCPOPT_TIMESTAMP << 8) |
896 TCPOLEN_TIMESTAMP);
897 rep.opt[1] = htonl(tsval);
898 rep.opt[2] = htonl(tsecr);
899 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
900 }
901
902 /* Swap the send and the receive. */
903 rep.th.dest = th->source;
904 rep.th.source = th->dest;
905 rep.th.doff = arg.iov[0].iov_len / 4;
906 rep.th.seq = htonl(seq);
907 rep.th.ack_seq = htonl(ack);
908 rep.th.ack = 1;
909 rep.th.window = htons(win);
910
911 #ifdef CONFIG_TCP_MD5SIG
912 if (key) {
913 int offset = (tsecr) ? 3 : 0;
914
915 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
916 (TCPOPT_NOP << 16) |
917 (TCPOPT_MD5SIG << 8) |
918 TCPOLEN_MD5SIG);
919 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
920 rep.th.doff = arg.iov[0].iov_len/4;
921
922 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
923 key, ip_hdr(skb)->saddr,
924 ip_hdr(skb)->daddr, &rep.th);
925 }
926 #endif
927 arg.flags = reply_flags;
928 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
929 ip_hdr(skb)->saddr, /* XXX */
930 arg.iov[0].iov_len, IPPROTO_TCP, 0);
931 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
932 if (oif)
933 arg.bound_dev_if = oif;
934 arg.tos = tos;
935 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
936 local_bh_disable();
937 ctl_sk = this_cpu_read(ipv4_tcp_sk);
938 sock_net_set(ctl_sk, net);
939 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
940 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
941 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
942 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
943 transmit_time = tcp_transmit_time(sk);
944 ip_send_unicast_reply(ctl_sk,
945 skb, &TCP_SKB_CB(skb)->header.h4.opt,
946 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
947 &arg, arg.iov[0].iov_len,
948 transmit_time, txhash);
949
950 sock_net_set(ctl_sk, &init_net);
951 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
952 local_bh_enable();
953 }
954
955 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
956 {
957 struct inet_timewait_sock *tw = inet_twsk(sk);
958 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
959
960 tcp_v4_send_ack(sk, skb,
961 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
962 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
963 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
964 tcptw->tw_ts_recent,
965 tw->tw_bound_dev_if,
966 tcp_twsk_md5_key(tcptw),
967 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
968 tw->tw_tos,
969 tw->tw_txhash
970 );
971
972 inet_twsk_put(tw);
973 }
974
975 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
976 struct request_sock *req)
977 {
978 const union tcp_md5_addr *addr;
979 int l3index;
980
981 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
982 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
983 */
984 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
985 tcp_sk(sk)->snd_nxt;
986
987 /* RFC 7323 2.3
988 * The window field (SEG.WND) of every outgoing segment, with the
989 * exception of <SYN> segments, MUST be right-shifted by
990 * Rcv.Wind.Shift bits:
991 */
992 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
993 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
994 tcp_v4_send_ack(sk, skb, seq,
995 tcp_rsk(req)->rcv_nxt,
996 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
997 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
998 READ_ONCE(req->ts_recent),
999 0,
1000 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
1001 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1002 ip_hdr(skb)->tos,
1003 READ_ONCE(tcp_rsk(req)->txhash));
1004 }
1005
1006 /*
1007 * Send a SYN-ACK after having received a SYN.
1008 * This still operates on a request_sock only, not on a big
1009 * socket.
1010 */
1011 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1012 struct flowi *fl,
1013 struct request_sock *req,
1014 struct tcp_fastopen_cookie *foc,
1015 enum tcp_synack_type synack_type,
1016 struct sk_buff *syn_skb)
1017 {
1018 const struct inet_request_sock *ireq = inet_rsk(req);
1019 struct flowi4 fl4;
1020 int err = -1;
1021 struct sk_buff *skb;
1022 u8 tos;
1023
1024 /* First, grab a route. */
1025 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1026 return -1;
1027
1028 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1029
1030 if (skb) {
1031 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1032
1033 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1034 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1035 (inet_sk(sk)->tos & INET_ECN_MASK) :
1036 inet_sk(sk)->tos;
1037
1038 if (!INET_ECN_is_capable(tos) &&
1039 tcp_bpf_ca_needs_ecn((struct sock *)req))
1040 tos |= INET_ECN_ECT_0;
1041
1042 rcu_read_lock();
1043 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1044 ireq->ir_rmt_addr,
1045 rcu_dereference(ireq->ireq_opt),
1046 tos);
1047 rcu_read_unlock();
1048 err = net_xmit_eval(err);
1049 }
1050
1051 return err;
1052 }
1053
1054 /*
1055 * IPv4 request_sock destructor.
1056 */
1057 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1058 {
1059 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1060 }
1061
1062 #ifdef CONFIG_TCP_MD5SIG
1063 /*
1064 * RFC2385 MD5 checksumming requires a mapping of
1065 * IP address->MD5 Key.
1066 * We need to maintain these in the sk structure.
1067 */
1068
1069 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1070 EXPORT_SYMBOL(tcp_md5_needed);
1071
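/* Ranking rule for the lookup below: a key bound to an L3 master device
 * (non-zero l3index) always beats an unbound key, and otherwise the key with
 * the longer prefix wins.
 */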
1072 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1073 {
1074 if (!old)
1075 return true;
1076
1077 /* l3index always overrides non-l3index */
1078 if (old->l3index && new->l3index == 0)
1079 return false;
1080 if (old->l3index == 0 && new->l3index)
1081 return true;
1082
1083 return old->prefixlen < new->prefixlen;
1084 }
1085
1086 /* Find the Key structure for an address. */
1087 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1088 const union tcp_md5_addr *addr,
1089 int family)
1090 {
1091 const struct tcp_sock *tp = tcp_sk(sk);
1092 struct tcp_md5sig_key *key;
1093 const struct tcp_md5sig_info *md5sig;
1094 __be32 mask;
1095 struct tcp_md5sig_key *best_match = NULL;
1096 bool match;
1097
1098 /* caller either holds rcu_read_lock() or socket lock */
1099 md5sig = rcu_dereference_check(tp->md5sig_info,
1100 lockdep_sock_is_held(sk));
1101 if (!md5sig)
1102 return NULL;
1103
1104 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1105 lockdep_sock_is_held(sk)) {
1106 if (key->family != family)
1107 continue;
1108 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1109 continue;
1110 if (family == AF_INET) {
1111 mask = inet_make_mask(key->prefixlen);
1112 match = (key->addr.a4.s_addr & mask) ==
1113 (addr->a4.s_addr & mask);
1114 #if IS_ENABLED(CONFIG_IPV6)
1115 } else if (family == AF_INET6) {
1116 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1117 key->prefixlen);
1118 #endif
1119 } else {
1120 match = false;
1121 }
1122
1123 if (match && better_md5_match(best_match, key))
1124 best_match = key;
1125 }
1126 return best_match;
1127 }
1128 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1129
1130 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1131 const union tcp_md5_addr *addr,
1132 int family, u8 prefixlen,
1133 int l3index, u8 flags)
1134 {
1135 const struct tcp_sock *tp = tcp_sk(sk);
1136 struct tcp_md5sig_key *key;
1137 unsigned int size = sizeof(struct in_addr);
1138 const struct tcp_md5sig_info *md5sig;
1139
1140 /* caller either holds rcu_read_lock() or socket lock */
1141 md5sig = rcu_dereference_check(tp->md5sig_info,
1142 lockdep_sock_is_held(sk));
1143 if (!md5sig)
1144 return NULL;
1145 #if IS_ENABLED(CONFIG_IPV6)
1146 if (family == AF_INET6)
1147 size = sizeof(struct in6_addr);
1148 #endif
1149 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1150 lockdep_sock_is_held(sk)) {
1151 if (key->family != family)
1152 continue;
1153 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1154 continue;
1155 if (key->l3index != l3index)
1156 continue;
1157 if (!memcmp(&key->addr, addr, size) &&
1158 key->prefixlen == prefixlen)
1159 return key;
1160 }
1161 return NULL;
1162 }
1163
1164 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1165 const struct sock *addr_sk)
1166 {
1167 const union tcp_md5_addr *addr;
1168 int l3index;
1169
1170 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1171 addr_sk->sk_bound_dev_if);
1172 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1173 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1174 }
1175 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1176
1177 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1178 {
1179 struct tcp_sock *tp = tcp_sk(sk);
1180 struct tcp_md5sig_info *md5sig;
1181
1182 md5sig = kmalloc(sizeof(*md5sig), gfp);
1183 if (!md5sig)
1184 return -ENOMEM;
1185
1186 sk_gso_disable(sk);
1187 INIT_HLIST_HEAD(&md5sig->head);
1188 rcu_assign_pointer(tp->md5sig_info, md5sig);
1189 return 0;
1190 }
1191
1192 /* This can be called on a newly created socket, from other files */
1193 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1194 int family, u8 prefixlen, int l3index, u8 flags,
1195 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1196 {
1197 /* Add Key to the list */
1198 struct tcp_md5sig_key *key;
1199 struct tcp_sock *tp = tcp_sk(sk);
1200 struct tcp_md5sig_info *md5sig;
1201
1202 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1203 if (key) {
1204 /* Pre-existing entry - just update that one.
1205 * Note that the key might be used concurrently.
1206 * data_race() is telling KCSAN that we do not care about
1207 * key mismatches, since changing the MD5 key on live flows
1208 * can lead to packet drops.
1209 */
1210 data_race(memcpy(key->key, newkey, newkeylen));
1211
1212 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1213 * Also note that a reader could catch new key->keylen value
1214 * but old key->key[], this is the reason we use __GFP_ZERO
1215 * at sock_kmalloc() time below these lines.
1216 */
1217 WRITE_ONCE(key->keylen, newkeylen);
1218
1219 return 0;
1220 }
1221
1222 md5sig = rcu_dereference_protected(tp->md5sig_info,
1223 lockdep_sock_is_held(sk));
1224
1225 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1226 if (!key)
1227 return -ENOMEM;
1228 if (!tcp_alloc_md5sig_pool()) {
1229 sock_kfree_s(sk, key, sizeof(*key));
1230 return -ENOMEM;
1231 }
1232
1233 memcpy(key->key, newkey, newkeylen);
1234 key->keylen = newkeylen;
1235 key->family = family;
1236 key->prefixlen = prefixlen;
1237 key->l3index = l3index;
1238 key->flags = flags;
1239 memcpy(&key->addr, addr,
1240 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1241 sizeof(struct in_addr));
1242 hlist_add_head_rcu(&key->node, &md5sig->head);
1243 return 0;
1244 }
1245
1246 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1247 int family, u8 prefixlen, int l3index, u8 flags,
1248 const u8 *newkey, u8 newkeylen)
1249 {
1250 struct tcp_sock *tp = tcp_sk(sk);
1251
1252 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1253 if (tcp_md5sig_info_add(sk, GFP_KERNEL))
1254 return -ENOMEM;
1255
1256 if (!static_branch_inc(&tcp_md5_needed.key)) {
1257 struct tcp_md5sig_info *md5sig;
1258
1259 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1260 rcu_assign_pointer(tp->md5sig_info, NULL);
1261 kfree_rcu(md5sig, rcu);
1262 return -EUSERS;
1263 }
1264 }
1265
1266 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1267 newkey, newkeylen, GFP_KERNEL);
1268 }
1269 EXPORT_SYMBOL(tcp_md5_do_add);
1270
1271 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1272 int family, u8 prefixlen, int l3index,
1273 struct tcp_md5sig_key *key)
1274 {
1275 struct tcp_sock *tp = tcp_sk(sk);
1276
1277 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1278 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
1279 return -ENOMEM;
1280
1281 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1282 struct tcp_md5sig_info *md5sig;
1283
1284 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1285 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1286 rcu_assign_pointer(tp->md5sig_info, NULL);
1287 kfree_rcu(md5sig, rcu);
1288 return -EUSERS;
1289 }
1290 }
1291
1292 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1293 key->flags, key->key, key->keylen,
1294 sk_gfp_mask(sk, GFP_ATOMIC));
1295 }
1296 EXPORT_SYMBOL(tcp_md5_key_copy);
1297
1298 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1299 u8 prefixlen, int l3index, u8 flags)
1300 {
1301 struct tcp_md5sig_key *key;
1302
1303 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1304 if (!key)
1305 return -ENOENT;
1306 hlist_del_rcu(&key->node);
1307 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1308 kfree_rcu(key, rcu);
1309 return 0;
1310 }
1311 EXPORT_SYMBOL(tcp_md5_do_del);
1312
1313 static void tcp_clear_md5_list(struct sock *sk)
1314 {
1315 struct tcp_sock *tp = tcp_sk(sk);
1316 struct tcp_md5sig_key *key;
1317 struct hlist_node *n;
1318 struct tcp_md5sig_info *md5sig;
1319
1320 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1321
1322 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1323 hlist_del_rcu(&key->node);
1324 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1325 kfree_rcu(key, rcu);
1326 }
1327 }
1328
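/* setsockopt() handler for TCP_MD5SIG / TCP_MD5SIG_EXT. A minimal userspace
 * sketch of installing a key for a peer (the address, secret and socket fd
 * below are placeholders):
 *
 *	struct tcp_md5sig md5 = {};
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	md5.tcpm_keylen = strlen("secret");
 *	memcpy(md5.tcpm_key, "secret", md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing tcpm_keylen == 0 deletes the matching key instead of adding one.
 */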
1329 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1330 sockptr_t optval, int optlen)
1331 {
1332 struct tcp_md5sig cmd;
1333 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1334 const union tcp_md5_addr *addr;
1335 u8 prefixlen = 32;
1336 int l3index = 0;
1337 u8 flags;
1338
1339 if (optlen < sizeof(cmd))
1340 return -EINVAL;
1341
1342 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1343 return -EFAULT;
1344
1345 if (sin->sin_family != AF_INET)
1346 return -EINVAL;
1347
1348 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1349
1350 if (optname == TCP_MD5SIG_EXT &&
1351 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1352 prefixlen = cmd.tcpm_prefixlen;
1353 if (prefixlen > 32)
1354 return -EINVAL;
1355 }
1356
1357 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1358 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1359 struct net_device *dev;
1360
1361 rcu_read_lock();
1362 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1363 if (dev && netif_is_l3_master(dev))
1364 l3index = dev->ifindex;
1365
1366 rcu_read_unlock();
1367
1368 /* ok to reference set/not set outside of rcu;
1369 * right now device MUST be an L3 master
1370 */
1371 if (!dev || !l3index)
1372 return -EINVAL;
1373 }
1374
1375 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1376
1377 if (!cmd.tcpm_keylen)
1378 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1379
1380 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1381 return -EINVAL;
1382
1383 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1384 cmd.tcpm_key, cmd.tcpm_keylen);
1385 }
1386
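/* RFC 2385 signs, in order: the IPv4 pseudo-header, the TCP header without
 * options and with its checksum zeroed, the segment payload, and finally the
 * key. The helper below feeds the first two pieces into the ahash request;
 * the callers add the payload (when hashing a full skb) and the key before
 * finalizing.
 */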
1387 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1388 __be32 daddr, __be32 saddr,
1389 const struct tcphdr *th, int nbytes)
1390 {
1391 struct tcp4_pseudohdr *bp;
1392 struct scatterlist sg;
1393 struct tcphdr *_th;
1394
1395 bp = hp->scratch;
1396 bp->saddr = saddr;
1397 bp->daddr = daddr;
1398 bp->pad = 0;
1399 bp->protocol = IPPROTO_TCP;
1400 bp->len = cpu_to_be16(nbytes);
1401
1402 _th = (struct tcphdr *)(bp + 1);
1403 memcpy(_th, th, sizeof(*th));
1404 _th->check = 0;
1405
1406 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1407 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1408 sizeof(*bp) + sizeof(*th));
1409 return crypto_ahash_update(hp->md5_req);
1410 }
1411
1412 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1413 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1414 {
1415 struct tcp_md5sig_pool *hp;
1416 struct ahash_request *req;
1417
1418 hp = tcp_get_md5sig_pool();
1419 if (!hp)
1420 goto clear_hash_noput;
1421 req = hp->md5_req;
1422
1423 if (crypto_ahash_init(req))
1424 goto clear_hash;
1425 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1426 goto clear_hash;
1427 if (tcp_md5_hash_key(hp, key))
1428 goto clear_hash;
1429 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1430 if (crypto_ahash_final(req))
1431 goto clear_hash;
1432
1433 tcp_put_md5sig_pool();
1434 return 0;
1435
1436 clear_hash:
1437 tcp_put_md5sig_pool();
1438 clear_hash_noput:
1439 memset(md5_hash, 0, 16);
1440 return 1;
1441 }
1442
1443 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1444 const struct sock *sk,
1445 const struct sk_buff *skb)
1446 {
1447 struct tcp_md5sig_pool *hp;
1448 struct ahash_request *req;
1449 const struct tcphdr *th = tcp_hdr(skb);
1450 __be32 saddr, daddr;
1451
1452 if (sk) { /* valid for establish/request sockets */
1453 saddr = sk->sk_rcv_saddr;
1454 daddr = sk->sk_daddr;
1455 } else {
1456 const struct iphdr *iph = ip_hdr(skb);
1457 saddr = iph->saddr;
1458 daddr = iph->daddr;
1459 }
1460
1461 hp = tcp_get_md5sig_pool();
1462 if (!hp)
1463 goto clear_hash_noput;
1464 req = hp->md5_req;
1465
1466 if (crypto_ahash_init(req))
1467 goto clear_hash;
1468
1469 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1470 goto clear_hash;
1471 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1472 goto clear_hash;
1473 if (tcp_md5_hash_key(hp, key))
1474 goto clear_hash;
1475 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1476 if (crypto_ahash_final(req))
1477 goto clear_hash;
1478
1479 tcp_put_md5sig_pool();
1480 return 0;
1481
1482 clear_hash:
1483 tcp_put_md5sig_pool();
1484 clear_hash_noput:
1485 memset(md5_hash, 0, 16);
1486 return 1;
1487 }
1488 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1489
1490 #endif
1491
1492 static void tcp_v4_init_req(struct request_sock *req,
1493 const struct sock *sk_listener,
1494 struct sk_buff *skb)
1495 {
1496 struct inet_request_sock *ireq = inet_rsk(req);
1497 struct net *net = sock_net(sk_listener);
1498
1499 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1500 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1501 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1502 }
1503
1504 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1505 struct sk_buff *skb,
1506 struct flowi *fl,
1507 struct request_sock *req)
1508 {
1509 tcp_v4_init_req(req, sk, skb);
1510
1511 if (security_inet_conn_request(sk, skb, req))
1512 return NULL;
1513
1514 return inet_csk_route_req(sk, &fl->u.ip4, req);
1515 }
1516
1517 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1518 .family = PF_INET,
1519 .obj_size = sizeof(struct tcp_request_sock),
1520 .rtx_syn_ack = tcp_rtx_synack,
1521 .send_ack = tcp_v4_reqsk_send_ack,
1522 .destructor = tcp_v4_reqsk_destructor,
1523 .send_reset = tcp_v4_send_reset,
1524 .syn_ack_timeout = tcp_syn_ack_timeout,
1525 };
1526
1527 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1528 .mss_clamp = TCP_MSS_DEFAULT,
1529 #ifdef CONFIG_TCP_MD5SIG
1530 .req_md5_lookup = tcp_v4_md5_lookup,
1531 .calc_md5_hash = tcp_v4_md5_hash_skb,
1532 #endif
1533 #ifdef CONFIG_SYN_COOKIES
1534 .cookie_init_seq = cookie_v4_init_sequence,
1535 #endif
1536 .route_req = tcp_v4_route_req,
1537 .init_seq = tcp_v4_init_seq,
1538 .init_ts_off = tcp_v4_init_ts_off,
1539 .send_synack = tcp_v4_send_synack,
1540 };
1541
1542 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1543 {
1544 /* Never answer SYNs sent to broadcast or multicast addresses */
1545 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1546 goto drop;
1547
1548 return tcp_conn_request(&tcp_request_sock_ops,
1549 &tcp_request_sock_ipv4_ops, sk, skb);
1550
1551 drop:
1552 tcp_listendrop(sk);
1553 return 0;
1554 }
1555 EXPORT_SYMBOL(tcp_v4_conn_request);
1556
1557
1558 /*
1559 * The three-way handshake has completed - we got a valid ACK -
1560 * now create the new socket.
1561 */
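/* The child is cloned from the listener via tcp_create_openreq_child(),
 * inherits addressing and IP options from the request, gets its own route
 * and any matching MD5 key, and is then inserted into the established hash;
 * on failure it is torn down and the listener's drop counters are bumped.
 */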
1562 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1563 struct request_sock *req,
1564 struct dst_entry *dst,
1565 struct request_sock *req_unhash,
1566 bool *own_req)
1567 {
1568 struct inet_request_sock *ireq;
1569 bool found_dup_sk = false;
1570 struct inet_sock *newinet;
1571 struct tcp_sock *newtp;
1572 struct sock *newsk;
1573 #ifdef CONFIG_TCP_MD5SIG
1574 const union tcp_md5_addr *addr;
1575 struct tcp_md5sig_key *key;
1576 int l3index;
1577 #endif
1578 struct ip_options_rcu *inet_opt;
1579
1580 if (sk_acceptq_is_full(sk))
1581 goto exit_overflow;
1582
1583 newsk = tcp_create_openreq_child(sk, req, skb);
1584 if (!newsk)
1585 goto exit_nonewsk;
1586
1587 newsk->sk_gso_type = SKB_GSO_TCPV4;
1588 inet_sk_rx_dst_set(newsk, skb);
1589
1590 newtp = tcp_sk(newsk);
1591 newinet = inet_sk(newsk);
1592 ireq = inet_rsk(req);
1593 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1594 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1595 newsk->sk_bound_dev_if = ireq->ir_iif;
1596 newinet->inet_saddr = ireq->ir_loc_addr;
1597 inet_opt = rcu_dereference(ireq->ireq_opt);
1598 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1599 newinet->mc_index = inet_iif(skb);
1600 newinet->mc_ttl = ip_hdr(skb)->ttl;
1601 newinet->rcv_tos = ip_hdr(skb)->tos;
1602 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1603 if (inet_opt)
1604 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1605 atomic_set(&newinet->inet_id, get_random_u16());
1606
1607 /* Set ToS of the new socket based upon the value of incoming SYN.
1608 * ECT bits are set later in tcp_init_transfer().
1609 */
1610 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1611 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1612
1613 if (!dst) {
1614 dst = inet_csk_route_child_sock(sk, newsk, req);
1615 if (!dst)
1616 goto put_and_exit;
1617 } else {
1618 /* syncookie case : see end of cookie_v4_check() */
1619 }
1620 sk_setup_caps(newsk, dst);
1621
1622 tcp_ca_openreq_child(newsk, dst);
1623
1624 tcp_sync_mss(newsk, dst_mtu(dst));
1625 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1626
1627 tcp_initialize_rcv_mss(newsk);
1628
1629 #ifdef CONFIG_TCP_MD5SIG
1630 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1631 /* Copy over the MD5 key from the original socket */
1632 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1633 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1634 if (key) {
1635 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1636 goto put_and_exit;
1637 sk_gso_disable(newsk);
1638 }
1639 #endif
1640
1641 if (__inet_inherit_port(sk, newsk) < 0)
1642 goto put_and_exit;
1643 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1644 &found_dup_sk);
1645 if (likely(*own_req)) {
1646 tcp_move_syn(newtp, req);
1647 ireq->ireq_opt = NULL;
1648 } else {
1649 newinet->inet_opt = NULL;
1650
1651 if (!req_unhash && found_dup_sk) {
1652 /* This code path should only be executed in the
1653 * syncookie case.
1654 */
1655 bh_unlock_sock(newsk);
1656 sock_put(newsk);
1657 newsk = NULL;
1658 }
1659 }
1660 return newsk;
1661
1662 exit_overflow:
1663 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1664 exit_nonewsk:
1665 dst_release(dst);
1666 exit:
1667 tcp_listendrop(sk);
1668 return NULL;
1669 put_and_exit:
1670 newinet->inet_opt = NULL;
1671 inet_csk_prepare_forced_close(newsk);
1672 tcp_done(newsk);
1673 goto exit;
1674 }
1675 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1676
1677 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1678 {
1679 #ifdef CONFIG_SYN_COOKIES
1680 const struct tcphdr *th = tcp_hdr(skb);
1681
1682 if (!th->syn)
1683 sk = cookie_v4_check(sk, skb);
1684 #endif
1685 return sk;
1686 }
1687
1688 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1689 struct tcphdr *th, u32 *cookie)
1690 {
1691 u16 mss = 0;
1692 #ifdef CONFIG_SYN_COOKIES
1693 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1694 &tcp_request_sock_ipv4_ops, sk, th);
1695 if (mss) {
1696 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1697 tcp_synq_overflow(sk);
1698 }
1699 #endif
1700 return mss;
1701 }
1702
1703 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1704 u32));
1705 /* The socket must have its spinlock held when we get
1706 * here, unless it is a TCP_LISTEN socket.
1707 *
1708 * We have a potential double-lock case here, so even when
1709 * doing backlog processing we use the BH locking scheme.
1710 * This is because we cannot sleep with the original spinlock
1711 * held.
1712 */
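/* Dispatch for a segment whose owning socket has already been looked up:
 * ESTABLISHED traffic takes the fast path into tcp_rcv_established(), LISTEN
 * sockets may route the skb through the syncookie check, and everything else
 * goes through tcp_rcv_state_process(); a failure there sends a reset back.
 */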
1713 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1714 {
1715 enum skb_drop_reason reason;
1716 struct sock *rsk;
1717
1718 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1719 struct dst_entry *dst;
1720
1721 dst = rcu_dereference_protected(sk->sk_rx_dst,
1722 lockdep_sock_is_held(sk));
1723
1724 sock_rps_save_rxhash(sk, skb);
1725 sk_mark_napi_id(sk, skb);
1726 if (dst) {
1727 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1728 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1729 dst, 0)) {
1730 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1731 dst_release(dst);
1732 }
1733 }
1734 tcp_rcv_established(sk, skb);
1735 return 0;
1736 }
1737
1738 reason = SKB_DROP_REASON_NOT_SPECIFIED;
1739 if (tcp_checksum_complete(skb))
1740 goto csum_err;
1741
1742 if (sk->sk_state == TCP_LISTEN) {
1743 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1744
1745 if (!nsk)
1746 goto discard;
1747 if (nsk != sk) {
1748 if (tcp_child_process(sk, nsk, skb)) {
1749 rsk = nsk;
1750 goto reset;
1751 }
1752 return 0;
1753 }
1754 } else
1755 sock_rps_save_rxhash(sk, skb);
1756
1757 if (tcp_rcv_state_process(sk, skb)) {
1758 rsk = sk;
1759 goto reset;
1760 }
1761 return 0;
1762
1763 reset:
1764 tcp_v4_send_reset(rsk, skb);
1765 discard:
1766 kfree_skb_reason(skb, reason);
1767 /* Be careful here. If this function gets more complicated and
1768 * gcc suffers from register pressure on the x86, sk (in %ebx)
1769 * might be destroyed here. This current version compiles correctly,
1770 * but you have been warned.
1771 */
1772 return 0;
1773
1774 csum_err:
1775 reason = SKB_DROP_REASON_TCP_CSUM;
1776 trace_tcp_bad_csum(skb);
1777 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1778 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1779 goto discard;
1780 }
1781 EXPORT_SYMBOL(tcp_v4_do_rcv);
1782
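/* Early demux, invoked from the IP receive path before routing: if the
 * segment belongs to an established socket, attach that socket and its
 * cached dst to the skb so the later receive path can skip a second socket
 * lookup and a route lookup.
 */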
1783 int tcp_v4_early_demux(struct sk_buff *skb)
1784 {
1785 struct net *net = dev_net(skb->dev);
1786 const struct iphdr *iph;
1787 const struct tcphdr *th;
1788 struct sock *sk;
1789
1790 if (skb->pkt_type != PACKET_HOST)
1791 return 0;
1792
1793 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1794 return 0;
1795
1796 iph = ip_hdr(skb);
1797 th = tcp_hdr(skb);
1798
1799 if (th->doff < sizeof(struct tcphdr) / 4)
1800 return 0;
1801
1802 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1803 iph->saddr, th->source,
1804 iph->daddr, ntohs(th->dest),
1805 skb->skb_iif, inet_sdif(skb));
1806 if (sk) {
1807 skb->sk = sk;
1808 skb->destructor = sock_edemux;
1809 if (sk_fullsock(sk)) {
1810 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1811
1812 if (dst)
1813 dst = dst_check(dst, 0);
1814 if (dst &&
1815 sk->sk_rx_dst_ifindex == skb->skb_iif)
1816 skb_dst_set_noref(skb, dst);
1817 }
1818 }
1819 return 0;
1820 }
1821
1822 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1823 enum skb_drop_reason *reason)
1824 {
1825 u32 tail_gso_size, tail_gso_segs;
1826 struct skb_shared_info *shinfo;
1827 const struct tcphdr *th;
1828 struct tcphdr *thtail;
1829 struct sk_buff *tail;
1830 unsigned int hdrlen;
1831 bool fragstolen;
1832 u32 gso_segs;
1833 u32 gso_size;
1834 u64 limit;
1835 int delta;
1836
1837 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1838 * we can fix skb->truesize to its real value to avoid future drops.
1839 * This is valid because skb is not yet charged to the socket.
1840 * It has been noticed that pure SACK packets were sometimes dropped
1841 * (when cooked by drivers without the copybreak feature).
1842 */
1843 skb_condense(skb);
1844
1845 skb_dst_drop(skb);
1846
1847 if (unlikely(tcp_checksum_complete(skb))) {
1848 bh_unlock_sock(sk);
1849 trace_tcp_bad_csum(skb);
1850 *reason = SKB_DROP_REASON_TCP_CSUM;
1851 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1852 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1853 return true;
1854 }
1855
1856 /* Attempt coalescing to last skb in backlog, even if we are
1857 * above the limits.
1858 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1859 */
1860 th = (const struct tcphdr *)skb->data;
1861 hdrlen = th->doff * 4;
1862
1863 tail = sk->sk_backlog.tail;
1864 if (!tail)
1865 goto no_coalesce;
1866 thtail = (struct tcphdr *)tail->data;
1867
1868 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1869 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1870 ((TCP_SKB_CB(tail)->tcp_flags |
1871 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1872 !((TCP_SKB_CB(tail)->tcp_flags &
1873 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1874 ((TCP_SKB_CB(tail)->tcp_flags ^
1875 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1876 #ifdef CONFIG_TLS_DEVICE
1877 tail->decrypted != skb->decrypted ||
1878 #endif
1879 !mptcp_skb_can_collapse(tail, skb) ||
1880 thtail->doff != th->doff ||
1881 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1882 goto no_coalesce;
1883
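/* Worked example (editorial illustration): two back-to-back segments of the
 * same flow, say seq 1000..2448 immediately followed by seq 2448..3896, both
 * pure ACKs with the same TOS byte, no SYN/RST/URG, matching ECE/CWR bits and
 * byte-identical option blocks (e.g. a GRO burst carrying the same TSval),
 * pass every test above and get merged into one backlog skb. A segment that
 * leaves a sequence hole, carries RST, or changes the option layout takes the
 * no_coalesce path instead.
 */
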
1884 __skb_pull(skb, hdrlen);
1885
1886 shinfo = skb_shinfo(skb);
1887 gso_size = shinfo->gso_size ?: skb->len;
1888 gso_segs = shinfo->gso_segs ?: 1;
1889
1890 shinfo = skb_shinfo(tail);
1891 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1892 tail_gso_segs = shinfo->gso_segs ?: 1;
1893
1894 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1895 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1896
1897 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1898 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1899 thtail->window = th->window;
1900 }
1901
1902 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1903 * thtail->fin, so that the fast path in tcp_rcv_established()
1904 * is not entered if we append a packet with a FIN.
1905 * SYN, RST, URG are not present.
1906 * ACK is set on both packets.
1907 * PSH : the TCP stack does not really care,
1908 * at least for 'GRO' packets.
1909 */
1910 thtail->fin |= th->fin;
1911 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1912
1913 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1914 TCP_SKB_CB(tail)->has_rxtstamp = true;
1915 tail->tstamp = skb->tstamp;
1916 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1917 }
1918
1919 /* Not as strict as GRO. We only need to carry the max mss value */
1920 shinfo->gso_size = max(gso_size, tail_gso_size);
1921 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
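/* Worked example (editorial illustration): if the tail already carried two
 * 1448-byte segments (gso_size 1448, gso_segs 2) and the appended skb holds
 * one more 1448-byte segment, the merged skb reports gso_size 1448 and
 * gso_segs 3 -- coarse, but sufficient for the per-segment accounting done
 * on the receive path (e.g. tcp_segs_in()).
 */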
1922
1923 sk->sk_backlog.len += delta;
1924 __NET_INC_STATS(sock_net(sk),
1925 LINUX_MIB_TCPBACKLOGCOALESCE);
1926 kfree_skb_partial(skb, fragstolen);
1927 return false;
1928 }
1929 __skb_push(skb, hdrlen);
1930
1931 no_coalesce:
1932 /* sk->sk_backlog.len is reset only at the end of __release_sock().
1933 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
1934 * sk_rcvbuf in normal conditions.
1935 */
1936 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
1937
1938 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
1939
1940 /* Only socket owner can try to collapse/prune rx queues
1941 * to reduce memory overhead, so add a little headroom here.
1942 * Only a few socket backlogs are likely to be non-empty at any given time.
1943 */
1944 limit += 64 * 1024;
1945
1946 limit = min_t(u64, limit, UINT_MAX);
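/* Worked example (editorial illustration): with a hypothetical sk_rcvbuf of
 * 131072 bytes and sk_sndbuf of 16384 bytes, the cap works out to
 *
 *	limit = 2 * 131072 + 16384 / 2 + 65536 = 335872 bytes
 *
 * i.e. roughly "twice the receive budget plus some headroom", clamped to
 * UINT_MAX because sk_add_backlog() compares against an unsigned int.
 */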
1947
1948 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1949 bh_unlock_sock(sk);
1950 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1951 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1952 return true;
1953 }
1954 return false;
1955 }
1956 EXPORT_SYMBOL(tcp_add_backlog);
1957
1958 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1959 {
1960 struct tcphdr *th = (struct tcphdr *)skb->data;
1961
1962 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1963 }
1964 EXPORT_SYMBOL(tcp_filter);
1965
1966 static void tcp_v4_restore_cb(struct sk_buff *skb)
1967 {
1968 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1969 sizeof(struct inet_skb_parm));
1970 }
1971
1972 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1973 const struct tcphdr *th)
1974 {
1975 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1976 * barrier() makes sure the compiler won't play fool^Waliasing games.
1977 */
1978 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1979 sizeof(struct inet_skb_parm));
1980 barrier();
1981
1982 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1983 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1984 skb->len - th->doff * 4);
1985 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1986 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1987 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1988 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1989 TCP_SKB_CB(skb)->sacked = 0;
1990 TCP_SKB_CB(skb)->has_rxtstamp =
1991 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1992 }
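/* Worked example (editorial illustration): a segment with seq 1000, 500 bytes
 * of payload and FIN set is recorded as seq = 1000 and
 * end_seq = 1000 + 0 (syn) + 1 (fin) + 500 = 1501, reflecting that SYN and
 * FIN each occupy one unit of sequence space on top of the payload.
 */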
1993
1994 /*
1995 * From tcp_input.c
1996 */
1997
1998 int tcp_v4_rcv(struct sk_buff *skb)
1999 {
2000 struct net *net = dev_net(skb->dev);
2001 enum skb_drop_reason drop_reason;
2002 int sdif = inet_sdif(skb);
2003 int dif = inet_iif(skb);
2004 const struct iphdr *iph;
2005 const struct tcphdr *th;
2006 bool refcounted;
2007 struct sock *sk;
2008 int ret;
2009
2010 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2011 if (skb->pkt_type != PACKET_HOST)
2012 goto discard_it;
2013
2014 /* Count it even if it's bad */
2015 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
2016
2017 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2018 goto discard_it;
2019
2020 th = (const struct tcphdr *)skb->data;
2021
2022 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2023 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2024 goto bad_packet;
2025 }
2026 if (!pskb_may_pull(skb, th->doff * 4))
2027 goto discard_it;
2028
2029 /* An explanation is required here, I think.
2030 * Packet length and doff are validated by header prediction,
2031 * provided the case of th->doff == 0 is eliminated.
2032 * So, we defer the checks. */
2033
2034 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2035 goto csum_error;
2036
2037 th = (const struct tcphdr *)skb->data;
2038 iph = ip_hdr(skb);
2039 lookup:
2040 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2041 skb, __tcp_hdrlen(th), th->source,
2042 th->dest, sdif, &refcounted);
2043 if (!sk)
2044 goto no_tcp_socket;
2045
2046 process:
2047 if (sk->sk_state == TCP_TIME_WAIT)
2048 goto do_time_wait;
2049
2050 if (sk->sk_state == TCP_NEW_SYN_RECV) {
2051 struct request_sock *req = inet_reqsk(sk);
2052 bool req_stolen = false;
2053 struct sock *nsk;
2054
2055 sk = req->rsk_listener;
2056 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2057 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2058 else
2059 drop_reason = tcp_inbound_md5_hash(sk, skb,
2060 &iph->saddr, &iph->daddr,
2061 AF_INET, dif, sdif);
2062 if (unlikely(drop_reason)) {
2063 sk_drops_add(sk, skb);
2064 reqsk_put(req);
2065 goto discard_it;
2066 }
2067 if (tcp_checksum_complete(skb)) {
2068 reqsk_put(req);
2069 goto csum_error;
2070 }
2071 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2072 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2073 if (!nsk) {
2074 inet_csk_reqsk_queue_drop_and_put(sk, req);
2075 goto lookup;
2076 }
2077 sk = nsk;
2078 /* reuseport_migrate_sock() has already held one sk_refcnt
2079 * before returning.
2080 */
2081 } else {
2082 /* We own a reference on the listener, increase it again
2083 * as we might lose it too soon.
2084 */
2085 sock_hold(sk);
2086 }
2087 refcounted = true;
2088 nsk = NULL;
2089 if (!tcp_filter(sk, skb)) {
2090 th = (const struct tcphdr *)skb->data;
2091 iph = ip_hdr(skb);
2092 tcp_v4_fill_cb(skb, iph, th);
2093 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2094 } else {
2095 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2096 }
2097 if (!nsk) {
2098 reqsk_put(req);
2099 if (req_stolen) {
2100 /* Another cpu got exclusive access to req
2101 * and created a full blown socket.
2102 * Try to feed this packet to this socket
2103 * instead of discarding it.
2104 */
2105 tcp_v4_restore_cb(skb);
2106 sock_put(sk);
2107 goto lookup;
2108 }
2109 goto discard_and_relse;
2110 }
2111 nf_reset_ct(skb);
2112 if (nsk == sk) {
2113 reqsk_put(req);
2114 tcp_v4_restore_cb(skb);
2115 } else if (tcp_child_process(sk, nsk, skb)) {
2116 tcp_v4_send_reset(nsk, skb);
2117 goto discard_and_relse;
2118 } else {
2119 sock_put(sk);
2120 return 0;
2121 }
2122 }
2123
2124 if (static_branch_unlikely(&ip4_min_ttl)) {
2125 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2126 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2127 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2128 drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2129 goto discard_and_relse;
2130 }
2131 }
2132
2133 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2134 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2135 goto discard_and_relse;
2136 }
2137
2138 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2139 &iph->daddr, AF_INET, dif, sdif);
2140 if (drop_reason)
2141 goto discard_and_relse;
2142
2143 nf_reset_ct(skb);
2144
2145 if (tcp_filter(sk, skb)) {
2146 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2147 goto discard_and_relse;
2148 }
2149 th = (const struct tcphdr *)skb->data;
2150 iph = ip_hdr(skb);
2151 tcp_v4_fill_cb(skb, iph, th);
2152
2153 skb->dev = NULL;
2154
2155 if (sk->sk_state == TCP_LISTEN) {
2156 ret = tcp_v4_do_rcv(sk, skb);
2157 goto put_and_return;
2158 }
2159
2160 sk_incoming_cpu_update(sk);
2161
2162 bh_lock_sock_nested(sk);
2163 tcp_segs_in(tcp_sk(sk), skb);
2164 ret = 0;
2165 if (!sock_owned_by_user(sk)) {
2166 ret = tcp_v4_do_rcv(sk, skb);
2167 } else {
2168 if (tcp_add_backlog(sk, skb, &drop_reason))
2169 goto discard_and_relse;
2170 }
2171 bh_unlock_sock(sk);
2172
2173 put_and_return:
2174 if (refcounted)
2175 sock_put(sk);
2176
2177 return ret;
2178
2179 no_tcp_socket:
2180 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2181 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2182 goto discard_it;
2183
2184 tcp_v4_fill_cb(skb, iph, th);
2185
2186 if (tcp_checksum_complete(skb)) {
2187 csum_error:
2188 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2189 trace_tcp_bad_csum(skb);
2190 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2191 bad_packet:
2192 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2193 } else {
2194 tcp_v4_send_reset(NULL, skb);
2195 }
2196
2197 discard_it:
2198 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2199 /* Discard frame. */
2200 kfree_skb_reason(skb, drop_reason);
2201 return 0;
2202
2203 discard_and_relse:
2204 sk_drops_add(sk, skb);
2205 if (refcounted)
2206 sock_put(sk);
2207 goto discard_it;
2208
2209 do_time_wait:
2210 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2211 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2212 inet_twsk_put(inet_twsk(sk));
2213 goto discard_it;
2214 }
2215
2216 tcp_v4_fill_cb(skb, iph, th);
2217
2218 if (tcp_checksum_complete(skb)) {
2219 inet_twsk_put(inet_twsk(sk));
2220 goto csum_error;
2221 }
2222 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2223 case TCP_TW_SYN: {
2224 struct sock *sk2 = inet_lookup_listener(net,
2225 net->ipv4.tcp_death_row.hashinfo,
2226 skb, __tcp_hdrlen(th),
2227 iph->saddr, th->source,
2228 iph->daddr, th->dest,
2229 inet_iif(skb),
2230 sdif);
2231 if (sk2) {
2232 inet_twsk_deschedule_put(inet_twsk(sk));
2233 sk = sk2;
2234 tcp_v4_restore_cb(skb);
2235 refcounted = false;
2236 goto process;
2237 }
2238 }
2239 /* to ACK */
2240 fallthrough;
2241 case TCP_TW_ACK:
2242 tcp_v4_timewait_ack(sk, skb);
2243 break;
2244 case TCP_TW_RST:
2245 tcp_v4_send_reset(sk, skb);
2246 inet_twsk_deschedule_put(inet_twsk(sk));
2247 goto discard_it;
2248 case TCP_TW_SUCCESS:;
2249 }
2250 goto discard_it;
2251 }
2252
2253 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2254 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2255 .twsk_unique = tcp_twsk_unique,
2256 .twsk_destructor= tcp_twsk_destructor,
2257 };
2258
2259 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2260 {
2261 struct dst_entry *dst = skb_dst(skb);
2262
2263 if (dst && dst_hold_safe(dst)) {
2264 rcu_assign_pointer(sk->sk_rx_dst, dst);
2265 sk->sk_rx_dst_ifindex = skb->skb_iif;
2266 }
2267 }
2268 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2269
2270 const struct inet_connection_sock_af_ops ipv4_specific = {
2271 .queue_xmit = ip_queue_xmit,
2272 .send_check = tcp_v4_send_check,
2273 .rebuild_header = inet_sk_rebuild_header,
2274 .sk_rx_dst_set = inet_sk_rx_dst_set,
2275 .conn_request = tcp_v4_conn_request,
2276 .syn_recv_sock = tcp_v4_syn_recv_sock,
2277 .net_header_len = sizeof(struct iphdr),
2278 .setsockopt = ip_setsockopt,
2279 .getsockopt = ip_getsockopt,
2280 .addr2sockaddr = inet_csk_addr2sockaddr,
2281 .sockaddr_len = sizeof(struct sockaddr_in),
2282 .mtu_reduced = tcp_v4_mtu_reduced,
2283 };
2284 EXPORT_SYMBOL(ipv4_specific);
2285
2286 #ifdef CONFIG_TCP_MD5SIG
2287 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2288 .md5_lookup = tcp_v4_md5_lookup,
2289 .calc_md5_hash = tcp_v4_md5_hash_skb,
2290 .md5_parse = tcp_v4_parse_md5_keys,
2291 };
2292 #endif
2293
2294 /* NOTE: A lot of things are set to zero explicitly by the call to
2295 * sk_alloc(), so they need not be done here.
2296 */
2297 static int tcp_v4_init_sock(struct sock *sk)
2298 {
2299 struct inet_connection_sock *icsk = inet_csk(sk);
2300
2301 tcp_init_sock(sk);
2302
2303 icsk->icsk_af_ops = &ipv4_specific;
2304
2305 #ifdef CONFIG_TCP_MD5SIG
2306 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2307 #endif
2308
2309 return 0;
2310 }
2311
2312 void tcp_v4_destroy_sock(struct sock *sk)
2313 {
2314 struct tcp_sock *tp = tcp_sk(sk);
2315
2316 trace_tcp_destroy_sock(sk);
2317
2318 tcp_clear_xmit_timers(sk);
2319
2320 tcp_cleanup_congestion_control(sk);
2321
2322 tcp_cleanup_ulp(sk);
2323
2324 /* Clean up the write buffer. */
2325 tcp_write_queue_purge(sk);
2326
2327 /* Check if we want to disable active TFO */
2328 tcp_fastopen_active_disable_ofo_check(sk);
2329
2330 /* Cleans up our, hopefully empty, out_of_order_queue. */
2331 skb_rbtree_purge(&tp->out_of_order_queue);
2332
2333 #ifdef CONFIG_TCP_MD5SIG
2334 /* Clean up the MD5 key list, if any */
2335 if (tp->md5sig_info) {
2336 tcp_clear_md5_list(sk);
2337 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2338 tp->md5sig_info = NULL;
2339 static_branch_slow_dec_deferred(&tcp_md5_needed);
2340 }
2341 #endif
2342
2343 /* Clean up a referenced TCP bind bucket. */
2344 if (inet_csk(sk)->icsk_bind_hash)
2345 inet_put_port(sk);
2346
2347 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2348
2349 /* If the socket was aborted during the connect operation */
2350 tcp_free_fastopen_req(tp);
2351 tcp_fastopen_destroy_cipher(sk);
2352 tcp_saved_syn_free(tp);
2353
2354 sk_sockets_allocated_dec(sk);
2355 }
2356 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2357
2358 #ifdef CONFIG_PROC_FS
2359 /* Proc filesystem TCP sock list dumping. */
2360
2361 static unsigned short seq_file_family(const struct seq_file *seq);
2362
2363 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2364 {
2365 unsigned short family = seq_file_family(seq);
2366
2367 /* AF_UNSPEC is used as a match all */
2368 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2369 net_eq(sock_net(sk), seq_file_net(seq)));
2370 }
2371
2372 /* Find a non empty bucket (starting from st->bucket)
2373 * and return the first sk from it.
2374 */
2375 static void *listening_get_first(struct seq_file *seq)
2376 {
2377 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2378 struct tcp_iter_state *st = seq->private;
2379
2380 st->offset = 0;
2381 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2382 struct inet_listen_hashbucket *ilb2;
2383 struct hlist_nulls_node *node;
2384 struct sock *sk;
2385
2386 ilb2 = &hinfo->lhash2[st->bucket];
2387 if (hlist_nulls_empty(&ilb2->nulls_head))
2388 continue;
2389
2390 spin_lock(&ilb2->lock);
2391 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2392 if (seq_sk_match(seq, sk))
2393 return sk;
2394 }
2395 spin_unlock(&ilb2->lock);
2396 }
2397
2398 return NULL;
2399 }
2400
2401 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2402 * If "cur" is the last one in the st->bucket,
2403 * call listening_get_first() to return the first sk of the next
2404 * non empty bucket.
2405 */
2406 static void *listening_get_next(struct seq_file *seq, void *cur)
2407 {
2408 struct tcp_iter_state *st = seq->private;
2409 struct inet_listen_hashbucket *ilb2;
2410 struct hlist_nulls_node *node;
2411 struct inet_hashinfo *hinfo;
2412 struct sock *sk = cur;
2413
2414 ++st->num;
2415 ++st->offset;
2416
2417 sk = sk_nulls_next(sk);
2418 sk_nulls_for_each_from(sk, node) {
2419 if (seq_sk_match(seq, sk))
2420 return sk;
2421 }
2422
2423 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2424 ilb2 = &hinfo->lhash2[st->bucket];
2425 spin_unlock(&ilb2->lock);
2426 ++st->bucket;
2427 return listening_get_first(seq);
2428 }
2429
2430 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2431 {
2432 struct tcp_iter_state *st = seq->private;
2433 void *rc;
2434
2435 st->bucket = 0;
2436 st->offset = 0;
2437 rc = listening_get_first(seq);
2438
2439 while (rc && *pos) {
2440 rc = listening_get_next(seq, rc);
2441 --*pos;
2442 }
2443 return rc;
2444 }
2445
2446 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2447 const struct tcp_iter_state *st)
2448 {
2449 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2450 }
2451
2452 /*
2453 * Get first established socket starting from bucket given in st->bucket.
2454 * If st->bucket is zero, the very first socket in the hash is returned.
2455 */
2456 static void *established_get_first(struct seq_file *seq)
2457 {
2458 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2459 struct tcp_iter_state *st = seq->private;
2460
2461 st->offset = 0;
2462 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2463 struct sock *sk;
2464 struct hlist_nulls_node *node;
2465 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2466
2467 cond_resched();
2468
2469 /* Lockless fast path for the common case of empty buckets */
2470 if (empty_bucket(hinfo, st))
2471 continue;
2472
2473 spin_lock_bh(lock);
2474 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2475 if (seq_sk_match(seq, sk))
2476 return sk;
2477 }
2478 spin_unlock_bh(lock);
2479 }
2480
2481 return NULL;
2482 }
2483
2484 static void *established_get_next(struct seq_file *seq, void *cur)
2485 {
2486 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2487 struct tcp_iter_state *st = seq->private;
2488 struct hlist_nulls_node *node;
2489 struct sock *sk = cur;
2490
2491 ++st->num;
2492 ++st->offset;
2493
2494 sk = sk_nulls_next(sk);
2495
2496 sk_nulls_for_each_from(sk, node) {
2497 if (seq_sk_match(seq, sk))
2498 return sk;
2499 }
2500
2501 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2502 ++st->bucket;
2503 return established_get_first(seq);
2504 }
2505
2506 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2507 {
2508 struct tcp_iter_state *st = seq->private;
2509 void *rc;
2510
2511 st->bucket = 0;
2512 rc = established_get_first(seq);
2513
2514 while (rc && pos) {
2515 rc = established_get_next(seq, rc);
2516 --pos;
2517 }
2518 return rc;
2519 }
2520
2521 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2522 {
2523 void *rc;
2524 struct tcp_iter_state *st = seq->private;
2525
2526 st->state = TCP_SEQ_STATE_LISTENING;
2527 rc = listening_get_idx(seq, &pos);
2528
2529 if (!rc) {
2530 st->state = TCP_SEQ_STATE_ESTABLISHED;
2531 rc = established_get_idx(seq, pos);
2532 }
2533
2534 return rc;
2535 }
2536
2537 static void *tcp_seek_last_pos(struct seq_file *seq)
2538 {
2539 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2540 struct tcp_iter_state *st = seq->private;
2541 int bucket = st->bucket;
2542 int offset = st->offset;
2543 int orig_num = st->num;
2544 void *rc = NULL;
2545
2546 switch (st->state) {
2547 case TCP_SEQ_STATE_LISTENING:
2548 if (st->bucket > hinfo->lhash2_mask)
2549 break;
2550 rc = listening_get_first(seq);
2551 while (offset-- && rc && bucket == st->bucket)
2552 rc = listening_get_next(seq, rc);
2553 if (rc)
2554 break;
2555 st->bucket = 0;
2556 st->state = TCP_SEQ_STATE_ESTABLISHED;
2557 fallthrough;
2558 case TCP_SEQ_STATE_ESTABLISHED:
2559 if (st->bucket > hinfo->ehash_mask)
2560 break;
2561 rc = established_get_first(seq);
2562 while (offset-- && rc && bucket == st->bucket)
2563 rc = established_get_next(seq, rc);
2564 }
2565
2566 st->num = orig_num;
2567
2568 return rc;
2569 }
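/* Worked example (editorial illustration): if the previous read stopped at
 * bucket 5, offset 3 in the LISTENING state, the next read re-enters bucket 5
 * via listening_get_first() and replays three listening_get_next() calls to
 * land on the 4th socket. If the bucket shrank meanwhile, the replay stops as
 * soon as it crosses into another bucket and resumes from there; only when
 * nothing is found does tcp_seq_start() fall back to a full walk.
 */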
2570
2571 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2572 {
2573 struct tcp_iter_state *st = seq->private;
2574 void *rc;
2575
2576 if (*pos && *pos == st->last_pos) {
2577 rc = tcp_seek_last_pos(seq);
2578 if (rc)
2579 goto out;
2580 }
2581
2582 st->state = TCP_SEQ_STATE_LISTENING;
2583 st->num = 0;
2584 st->bucket = 0;
2585 st->offset = 0;
2586 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2587
2588 out:
2589 st->last_pos = *pos;
2590 return rc;
2591 }
2592 EXPORT_SYMBOL(tcp_seq_start);
2593
2594 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2595 {
2596 struct tcp_iter_state *st = seq->private;
2597 void *rc = NULL;
2598
2599 if (v == SEQ_START_TOKEN) {
2600 rc = tcp_get_idx(seq, 0);
2601 goto out;
2602 }
2603
2604 switch (st->state) {
2605 case TCP_SEQ_STATE_LISTENING:
2606 rc = listening_get_next(seq, v);
2607 if (!rc) {
2608 st->state = TCP_SEQ_STATE_ESTABLISHED;
2609 st->bucket = 0;
2610 st->offset = 0;
2611 rc = established_get_first(seq);
2612 }
2613 break;
2614 case TCP_SEQ_STATE_ESTABLISHED:
2615 rc = established_get_next(seq, v);
2616 break;
2617 }
2618 out:
2619 ++*pos;
2620 st->last_pos = *pos;
2621 return rc;
2622 }
2623 EXPORT_SYMBOL(tcp_seq_next);
2624
2625 void tcp_seq_stop(struct seq_file *seq, void *v)
2626 {
2627 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2628 struct tcp_iter_state *st = seq->private;
2629
2630 switch (st->state) {
2631 case TCP_SEQ_STATE_LISTENING:
2632 if (v != SEQ_START_TOKEN)
2633 spin_unlock(&hinfo->lhash2[st->bucket].lock);
2634 break;
2635 case TCP_SEQ_STATE_ESTABLISHED:
2636 if (v)
2637 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2638 break;
2639 }
2640 }
2641 EXPORT_SYMBOL(tcp_seq_stop);
2642
2643 static void get_openreq4(const struct request_sock *req,
2644 struct seq_file *f, int i)
2645 {
2646 const struct inet_request_sock *ireq = inet_rsk(req);
2647 long delta = req->rsk_timer.expires - jiffies;
2648
2649 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2650 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2651 i,
2652 ireq->ir_loc_addr,
2653 ireq->ir_num,
2654 ireq->ir_rmt_addr,
2655 ntohs(ireq->ir_rmt_port),
2656 TCP_SYN_RECV,
2657 0, 0, /* could print option size, but that is af dependent. */
2658 1, /* timers active (only the expire timer) */
2659 jiffies_delta_to_clock_t(delta),
2660 req->num_timeout,
2661 from_kuid_munged(seq_user_ns(f),
2662 sock_i_uid(req->rsk_listener)),
2663 0, /* non standard timer */
2664 0, /* open_requests have no inode */
2665 0,
2666 req);
2667 }
2668
2669 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2670 {
2671 int timer_active;
2672 unsigned long timer_expires;
2673 const struct tcp_sock *tp = tcp_sk(sk);
2674 const struct inet_connection_sock *icsk = inet_csk(sk);
2675 const struct inet_sock *inet = inet_sk(sk);
2676 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2677 __be32 dest = inet->inet_daddr;
2678 __be32 src = inet->inet_rcv_saddr;
2679 __u16 destp = ntohs(inet->inet_dport);
2680 __u16 srcp = ntohs(inet->inet_sport);
2681 int rx_queue;
2682 int state;
2683
2684 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2685 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2686 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2687 timer_active = 1;
2688 timer_expires = icsk->icsk_timeout;
2689 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2690 timer_active = 4;
2691 timer_expires = icsk->icsk_timeout;
2692 } else if (timer_pending(&sk->sk_timer)) {
2693 timer_active = 2;
2694 timer_expires = sk->sk_timer.expires;
2695 } else {
2696 timer_active = 0;
2697 timer_expires = jiffies;
2698 }
2699
2700 state = inet_sk_state_load(sk);
2701 if (state == TCP_LISTEN)
2702 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2703 else
2704 /* Because we don't lock the socket,
2705 * we might find a transient negative value.
2706 */
2707 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2708 READ_ONCE(tp->copied_seq), 0);
2709
2710 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2711 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2712 i, src, srcp, dest, destp, state,
2713 READ_ONCE(tp->write_seq) - tp->snd_una,
2714 rx_queue,
2715 timer_active,
2716 jiffies_delta_to_clock_t(timer_expires - jiffies),
2717 icsk->icsk_retransmits,
2718 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2719 icsk->icsk_probes_out,
2720 sock_i_ino(sk),
2721 refcount_read(&sk->sk_refcnt), sk,
2722 jiffies_to_clock_t(icsk->icsk_rto),
2723 jiffies_to_clock_t(icsk->icsk_ack.ato),
2724 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2725 tcp_snd_cwnd(tp),
2726 state == TCP_LISTEN ?
2727 fastopenq->max_qlen :
2728 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2729 }
2730
2731 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2732 struct seq_file *f, int i)
2733 {
2734 long delta = tw->tw_timer.expires - jiffies;
2735 __be32 dest, src;
2736 __u16 destp, srcp;
2737
2738 dest = tw->tw_daddr;
2739 src = tw->tw_rcv_saddr;
2740 destp = ntohs(tw->tw_dport);
2741 srcp = ntohs(tw->tw_sport);
2742
2743 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2744 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2745 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2746 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2747 refcount_read(&tw->tw_refcnt), tw);
2748 }
2749
2750 #define TMPSZ 150
2751
2752 static int tcp4_seq_show(struct seq_file *seq, void *v)
2753 {
2754 struct tcp_iter_state *st;
2755 struct sock *sk = v;
2756
2757 seq_setwidth(seq, TMPSZ - 1);
2758 if (v == SEQ_START_TOKEN) {
2759 seq_puts(seq, " sl local_address rem_address st tx_queue "
2760 "rx_queue tr tm->when retrnsmt uid timeout "
2761 "inode");
2762 goto out;
2763 }
2764 st = seq->private;
2765
2766 if (sk->sk_state == TCP_TIME_WAIT)
2767 get_timewait4_sock(v, seq, st->num);
2768 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2769 get_openreq4(v, seq, st->num);
2770 else
2771 get_tcp4_sock(v, seq, st->num);
2772 out:
2773 seq_pad(seq, '\n');
2774 return 0;
2775 }
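/* For reference (editorial illustration, field values are made up): a socket
 * listening on 127.0.0.1:3306 comes out of the format above roughly as
 *
 *	0: 0100007F:0CEA 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000 0 18747 1 0000000000000000 100 0 0 10 0
 *
 * Addresses are the raw __be32 printed with %08X (so 127.0.0.1 appears as
 * 0100007F on little-endian), ports are hex (0CEA == 3306), and 0A is the
 * TCP_LISTEN state; the remaining columns follow the header emitted by
 * tcp4_seq_show() plus the extra per-socket fields of get_tcp4_sock().
 */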
2776
2777 #ifdef CONFIG_BPF_SYSCALL
2778 struct bpf_tcp_iter_state {
2779 struct tcp_iter_state state;
2780 unsigned int cur_sk;
2781 unsigned int end_sk;
2782 unsigned int max_sk;
2783 struct sock **batch;
2784 bool st_bucket_done;
2785 };
2786
2787 struct bpf_iter__tcp {
2788 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2789 __bpf_md_ptr(struct sock_common *, sk_common);
2790 uid_t uid __aligned(8);
2791 };
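/* Illustrative sketch (editorial addition, not part of the original source):
 * a minimal BPF "iter/tcp" program consuming the context above. The program
 * name, output format and the use of libbpf's BPF_SEQ_PRINTF() are
 * assumptions for illustration only.
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "%08x:%u uid %u\n",
 *			       skc->skc_rcv_saddr, skc->skc_num, ctx->uid);
 *		return 0;
 *	}
 *
 * Such an object would typically be pinned with "bpftool iter pin" and read
 * back like a regular file, which is what drives the seq_ops defined below.
 */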
2792
2793 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2794 struct sock_common *sk_common, uid_t uid)
2795 {
2796 struct bpf_iter__tcp ctx;
2797
2798 meta->seq_num--; /* skip SEQ_START_TOKEN */
2799 ctx.meta = meta;
2800 ctx.sk_common = sk_common;
2801 ctx.uid = uid;
2802 return bpf_iter_run_prog(prog, &ctx);
2803 }
2804
2805 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2806 {
2807 while (iter->cur_sk < iter->end_sk)
2808 sock_gen_put(iter->batch[iter->cur_sk++]);
2809 }
2810
2811 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2812 unsigned int new_batch_sz)
2813 {
2814 struct sock **new_batch;
2815
2816 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2817 GFP_USER | __GFP_NOWARN);
2818 if (!new_batch)
2819 return -ENOMEM;
2820
2821 bpf_iter_tcp_put_batch(iter);
2822 kvfree(iter->batch);
2823 iter->batch = new_batch;
2824 iter->max_sk = new_batch_sz;
2825
2826 return 0;
2827 }
2828
2829 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2830 struct sock *start_sk)
2831 {
2832 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2833 struct bpf_tcp_iter_state *iter = seq->private;
2834 struct tcp_iter_state *st = &iter->state;
2835 struct hlist_nulls_node *node;
2836 unsigned int expected = 1;
2837 struct sock *sk;
2838
2839 sock_hold(start_sk);
2840 iter->batch[iter->end_sk++] = start_sk;
2841
2842 sk = sk_nulls_next(start_sk);
2843 sk_nulls_for_each_from(sk, node) {
2844 if (seq_sk_match(seq, sk)) {
2845 if (iter->end_sk < iter->max_sk) {
2846 sock_hold(sk);
2847 iter->batch[iter->end_sk++] = sk;
2848 }
2849 expected++;
2850 }
2851 }
2852 spin_unlock(&hinfo->lhash2[st->bucket].lock);
2853
2854 return expected;
2855 }
2856
2857 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2858 struct sock *start_sk)
2859 {
2860 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2861 struct bpf_tcp_iter_state *iter = seq->private;
2862 struct tcp_iter_state *st = &iter->state;
2863 struct hlist_nulls_node *node;
2864 unsigned int expected = 1;
2865 struct sock *sk;
2866
2867 sock_hold(start_sk);
2868 iter->batch[iter->end_sk++] = start_sk;
2869
2870 sk = sk_nulls_next(start_sk);
2871 sk_nulls_for_each_from(sk, node) {
2872 if (seq_sk_match(seq, sk)) {
2873 if (iter->end_sk < iter->max_sk) {
2874 sock_hold(sk);
2875 iter->batch[iter->end_sk++] = sk;
2876 }
2877 expected++;
2878 }
2879 }
2880 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2881
2882 return expected;
2883 }
2884
2885 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2886 {
2887 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2888 struct bpf_tcp_iter_state *iter = seq->private;
2889 struct tcp_iter_state *st = &iter->state;
2890 unsigned int expected;
2891 bool resized = false;
2892 struct sock *sk;
2893
2894 /* The st->bucket is done. Directly advance to the next
2895 * bucket instead of having tcp_seek_last_pos() skip sockets
2896 * one by one in the current bucket, only to find out that
2897 * it has to advance to the next bucket anyway.
2898 */
2899 if (iter->st_bucket_done) {
2900 st->offset = 0;
2901 st->bucket++;
2902 if (st->state == TCP_SEQ_STATE_LISTENING &&
2903 st->bucket > hinfo->lhash2_mask) {
2904 st->state = TCP_SEQ_STATE_ESTABLISHED;
2905 st->bucket = 0;
2906 }
2907 }
2908
2909 again:
2910 /* Get a new batch */
2911 iter->cur_sk = 0;
2912 iter->end_sk = 0;
2913 iter->st_bucket_done = false;
2914
2915 sk = tcp_seek_last_pos(seq);
2916 if (!sk)
2917 return NULL; /* Done */
2918
2919 if (st->state == TCP_SEQ_STATE_LISTENING)
2920 expected = bpf_iter_tcp_listening_batch(seq, sk);
2921 else
2922 expected = bpf_iter_tcp_established_batch(seq, sk);
2923
2924 if (iter->end_sk == expected) {
2925 iter->st_bucket_done = true;
2926 return sk;
2927 }
2928
2929 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2930 resized = true;
2931 goto again;
2932 }
2933
2934 return sk;
2935 }
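/* Worked example (editorial illustration): with the initial batch size of 16
 * (INIT_BATCH_SZ below), a bucket holding 24 matching sockets leaves
 * end_sk == 16 while expected == 24, so the batch is grown to 24 * 3 / 2 = 36
 * and the bucket is walked again. If the bucket grew once more in between,
 * the second (post-resize) shortfall is tolerated and the partial batch is
 * returned as-is.
 */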
2936
2937 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2938 {
2939 /* bpf iter does not support lseek, so it always
2940 * continues from where it was stop()-ped.
2941 */
2942 if (*pos)
2943 return bpf_iter_tcp_batch(seq);
2944
2945 return SEQ_START_TOKEN;
2946 }
2947
2948 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2949 {
2950 struct bpf_tcp_iter_state *iter = seq->private;
2951 struct tcp_iter_state *st = &iter->state;
2952 struct sock *sk;
2953
2954 /* Whenever seq_next() is called, the sk at iter->cur_sk has
2955 * been shown by seq_show(), so advance to the next sk in
2956 * the batch.
2957 */
2958 if (iter->cur_sk < iter->end_sk) {
2959 /* Keeping st->num consistent in tcp_iter_state.
2960 * bpf_iter_tcp does not use st->num.
2961 * meta.seq_num is used instead.
2962 */
2963 st->num++;
2964 /* Move st->offset to the next sk in the bucket such that
2965 * the future start() will resume at st->offset in
2966 * st->bucket. See tcp_seek_last_pos().
2967 */
2968 st->offset++;
2969 sock_gen_put(iter->batch[iter->cur_sk++]);
2970 }
2971
2972 if (iter->cur_sk < iter->end_sk)
2973 sk = iter->batch[iter->cur_sk];
2974 else
2975 sk = bpf_iter_tcp_batch(seq);
2976
2977 ++*pos;
2978 /* Keeping st->last_pos consistent in tcp_iter_state.
2979 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2980 */
2981 st->last_pos = *pos;
2982 return sk;
2983 }
2984
2985 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2986 {
2987 struct bpf_iter_meta meta;
2988 struct bpf_prog *prog;
2989 struct sock *sk = v;
2990 uid_t uid;
2991 int ret;
2992
2993 if (v == SEQ_START_TOKEN)
2994 return 0;
2995
2996 if (sk_fullsock(sk))
2997 lock_sock(sk);
2998
2999 if (unlikely(sk_unhashed(sk))) {
3000 ret = SEQ_SKIP;
3001 goto unlock;
3002 }
3003
3004 if (sk->sk_state == TCP_TIME_WAIT) {
3005 uid = 0;
3006 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3007 const struct request_sock *req = v;
3008
3009 uid = from_kuid_munged(seq_user_ns(seq),
3010 sock_i_uid(req->rsk_listener));
3011 } else {
3012 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3013 }
3014
3015 meta.seq = seq;
3016 prog = bpf_iter_get_info(&meta, false);
3017 ret = tcp_prog_seq_show(prog, &meta, v, uid);
3018
3019 unlock:
3020 if (sk_fullsock(sk))
3021 release_sock(sk);
3022 return ret;
3023
3024 }
3025
3026 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3027 {
3028 struct bpf_tcp_iter_state *iter = seq->private;
3029 struct bpf_iter_meta meta;
3030 struct bpf_prog *prog;
3031
3032 if (!v) {
3033 meta.seq = seq;
3034 prog = bpf_iter_get_info(&meta, true);
3035 if (prog)
3036 (void)tcp_prog_seq_show(prog, &meta, v, 0);
3037 }
3038
3039 if (iter->cur_sk < iter->end_sk) {
3040 bpf_iter_tcp_put_batch(iter);
3041 iter->st_bucket_done = false;
3042 }
3043 }
3044
3045 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3046 .show = bpf_iter_tcp_seq_show,
3047 .start = bpf_iter_tcp_seq_start,
3048 .next = bpf_iter_tcp_seq_next,
3049 .stop = bpf_iter_tcp_seq_stop,
3050 };
3051 #endif
3052 static unsigned short seq_file_family(const struct seq_file *seq)
3053 {
3054 const struct tcp_seq_afinfo *afinfo;
3055
3056 #ifdef CONFIG_BPF_SYSCALL
3057 /* Iterated from bpf_iter. Let the bpf prog filter instead. */
3058 if (seq->op == &bpf_iter_tcp_seq_ops)
3059 return AF_UNSPEC;
3060 #endif
3061
3062 /* Iterated from proc fs */
3063 afinfo = pde_data(file_inode(seq->file));
3064 return afinfo->family;
3065 }
3066
3067 static const struct seq_operations tcp4_seq_ops = {
3068 .show = tcp4_seq_show,
3069 .start = tcp_seq_start,
3070 .next = tcp_seq_next,
3071 .stop = tcp_seq_stop,
3072 };
3073
3074 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3075 .family = AF_INET,
3076 };
3077
3078 static int __net_init tcp4_proc_init_net(struct net *net)
3079 {
3080 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3081 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3082 return -ENOMEM;
3083 return 0;
3084 }
3085
3086 static void __net_exit tcp4_proc_exit_net(struct net *net)
3087 {
3088 remove_proc_entry("tcp", net->proc_net);
3089 }
3090
3091 static struct pernet_operations tcp4_net_ops = {
3092 .init = tcp4_proc_init_net,
3093 .exit = tcp4_proc_exit_net,
3094 };
3095
3096 int __init tcp4_proc_init(void)
3097 {
3098 return register_pernet_subsys(&tcp4_net_ops);
3099 }
3100
3101 void tcp4_proc_exit(void)
3102 {
3103 unregister_pernet_subsys(&tcp4_net_ops);
3104 }
3105 #endif /* CONFIG_PROC_FS */
3106
3107 /* @wake is one when sk_stream_write_space() calls us.
3108 * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3109 * This mimics the strategy used in sock_def_write_space().
3110 */
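/* Worked example (editorial illustration): with a hypothetical
 * tcp_notsent_lowat of 131072 bytes, a plain poll (wake == 0) reports the
 * socket writable while less than 131072 bytes are queued but unsent,
 * whereas the wakeup path (wake == 1) only signals EPOLLOUT once the unsent
 * backlog drops below 65536 bytes, i.e. half the limit.
 */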
3111 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3112 {
3113 const struct tcp_sock *tp = tcp_sk(sk);
3114 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3115 READ_ONCE(tp->snd_nxt);
3116
3117 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3118 }
3119 EXPORT_SYMBOL(tcp_stream_memory_free);
3120
3121 struct proto tcp_prot = {
3122 .name = "TCP",
3123 .owner = THIS_MODULE,
3124 .close = tcp_close,
3125 .pre_connect = tcp_v4_pre_connect,
3126 .connect = tcp_v4_connect,
3127 .disconnect = tcp_disconnect,
3128 .accept = inet_csk_accept,
3129 .ioctl = tcp_ioctl,
3130 .init = tcp_v4_init_sock,
3131 .destroy = tcp_v4_destroy_sock,
3132 .shutdown = tcp_shutdown,
3133 .setsockopt = tcp_setsockopt,
3134 .getsockopt = tcp_getsockopt,
3135 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3136 .keepalive = tcp_set_keepalive,
3137 .recvmsg = tcp_recvmsg,
3138 .sendmsg = tcp_sendmsg,
3139 .splice_eof = tcp_splice_eof,
3140 .backlog_rcv = tcp_v4_do_rcv,
3141 .release_cb = tcp_release_cb,
3142 .hash = inet_hash,
3143 .unhash = inet_unhash,
3144 .get_port = inet_csk_get_port,
3145 .put_port = inet_put_port,
3146 #ifdef CONFIG_BPF_SYSCALL
3147 .psock_update_sk_prot = tcp_bpf_update_proto,
3148 #endif
3149 .enter_memory_pressure = tcp_enter_memory_pressure,
3150 .leave_memory_pressure = tcp_leave_memory_pressure,
3151 .stream_memory_free = tcp_stream_memory_free,
3152 .sockets_allocated = &tcp_sockets_allocated,
3153 .orphan_count = &tcp_orphan_count,
3154
3155 .memory_allocated = &tcp_memory_allocated,
3156 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
3157
3158 .memory_pressure = &tcp_memory_pressure,
3159 .sysctl_mem = sysctl_tcp_mem,
3160 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3161 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3162 .max_header = MAX_TCP_HEADER,
3163 .obj_size = sizeof(struct tcp_sock),
3164 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3165 .twsk_prot = &tcp_timewait_sock_ops,
3166 .rsk_prot = &tcp_request_sock_ops,
3167 .h.hashinfo = NULL,
3168 .no_autobind = true,
3169 .diag_destroy = tcp_abort,
3170 };
3171 EXPORT_SYMBOL(tcp_prot);
3172
3173 static void __net_exit tcp_sk_exit(struct net *net)
3174 {
3175 if (net->ipv4.tcp_congestion_control)
3176 bpf_module_put(net->ipv4.tcp_congestion_control,
3177 net->ipv4.tcp_congestion_control->owner);
3178 }
3179
3180 static void __net_init tcp_set_hashinfo(struct net *net)
3181 {
3182 struct inet_hashinfo *hinfo;
3183 unsigned int ehash_entries;
3184 struct net *old_net;
3185
3186 if (net_eq(net, &init_net))
3187 goto fallback;
3188
3189 old_net = current->nsproxy->net_ns;
3190 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3191 if (!ehash_entries)
3192 goto fallback;
3193
3194 ehash_entries = roundup_pow_of_two(ehash_entries);
3195 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3196 if (!hinfo) {
3197 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3198 "for a netns, fallback to the global one\n",
3199 ehash_entries);
3200 fallback:
3201 hinfo = &tcp_hashinfo;
3202 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3203 }
3204
3205 net->ipv4.tcp_death_row.hashinfo = hinfo;
3206 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3207 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3208 }
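/* Usage sketch (editorial illustration; assumes the sysctl exposing
 * sysctl_tcp_child_ehash_entries is named tcp_child_ehash_entries):
 *
 *	sysctl -w net.ipv4.tcp_child_ehash_entries=4096
 *	ip netns add child	# child netns gets its own 4096-entry ehash
 *
 * With that value the child also ends up with max_tw_buckets = 2048 and
 * max_syn_backlog = max(128, 4096 / 128) = 128 per the calculation above.
 */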
3209
3210 static int __net_init tcp_sk_init(struct net *net)
3211 {
3212 net->ipv4.sysctl_tcp_ecn = 2;
3213 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3214
3215 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3216 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3217 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3218 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3219 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3220
3221 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3222 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3223 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3224
3225 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3226 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3227 net->ipv4.sysctl_tcp_syncookies = 1;
3228 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3229 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3230 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3231 net->ipv4.sysctl_tcp_orphan_retries = 0;
3232 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3233 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3234 net->ipv4.sysctl_tcp_tw_reuse = 2;
3235 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3236
3237 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3238 tcp_set_hashinfo(net);
3239
3240 net->ipv4.sysctl_tcp_sack = 1;
3241 net->ipv4.sysctl_tcp_window_scaling = 1;
3242 net->ipv4.sysctl_tcp_timestamps = 1;
3243 net->ipv4.sysctl_tcp_early_retrans = 3;
3244 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3245 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3246 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3247 net->ipv4.sysctl_tcp_max_reordering = 300;
3248 net->ipv4.sysctl_tcp_dsack = 1;
3249 net->ipv4.sysctl_tcp_app_win = 31;
3250 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3251 net->ipv4.sysctl_tcp_frto = 2;
3252 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3253 /* This limits the percentage of the congestion window which we
3254 * will allow a single TSO frame to consume. Building TSO frames
3255 * which are too large can cause TCP streams to be bursty.
3256 */
3257 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3258 /* Default TSQ limit of 16 TSO segments */
3259 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3260
3261 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3262 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3263
3264 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3265 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
3266 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3267 net->ipv4.sysctl_tcp_autocorking = 1;
3268 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3269 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3270 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3271 if (net != &init_net) {
3272 memcpy(net->ipv4.sysctl_tcp_rmem,
3273 init_net.ipv4.sysctl_tcp_rmem,
3274 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3275 memcpy(net->ipv4.sysctl_tcp_wmem,
3276 init_net.ipv4.sysctl_tcp_wmem,
3277 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3278 }
3279 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3280 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3281 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3282 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3283 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3284 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3285
3286 /* Set default values for PLB */
3287 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3288 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3289 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3290 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3291 /* Default congestion threshold for PLB to mark a round is 50% */
3292 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3293
3294 /* Reno is always built in */
3295 if (!net_eq(net, &init_net) &&
3296 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3297 init_net.ipv4.tcp_congestion_control->owner))
3298 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3299 else
3300 net->ipv4.tcp_congestion_control = &tcp_reno;
3301
3302 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3303 net->ipv4.sysctl_tcp_shrink_window = 0;
3304
3305 return 0;
3306 }
3307
3308 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3309 {
3310 struct net *net;
3311
3312 /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
3313 * and failed setup_net error unwinding path are serialized.
3314 *
3315 * Because tcp_twsk_purge() handles twsk in any dead netns, not just those
3316 * in net_exit_list, the thread that dismantles a particular twsk must
3317 * do so without another thread progressing to refcount_dec_and_test() of
3318 * tcp_death_row.tw_refcount.
3319 */
3320 mutex_lock(&tcp_exit_batch_mutex);
3321
3322 tcp_twsk_purge(net_exit_list);
3323
3324 list_for_each_entry(net, net_exit_list, exit_list) {
3325 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3326 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3327 tcp_fastopen_ctx_destroy(net);
3328 }
3329
3330 mutex_unlock(&tcp_exit_batch_mutex);
3331 }
3332
3333 static struct pernet_operations __net_initdata tcp_sk_ops = {
3334 .init = tcp_sk_init,
3335 .exit = tcp_sk_exit,
3336 .exit_batch = tcp_sk_exit_batch,
3337 };
3338
3339 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3340 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3341 struct sock_common *sk_common, uid_t uid)
3342
3343 #define INIT_BATCH_SZ 16
3344
3345 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3346 {
3347 struct bpf_tcp_iter_state *iter = priv_data;
3348 int err;
3349
3350 err = bpf_iter_init_seq_net(priv_data, aux);
3351 if (err)
3352 return err;
3353
3354 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3355 if (err) {
3356 bpf_iter_fini_seq_net(priv_data);
3357 return err;
3358 }
3359
3360 return 0;
3361 }
3362
3363 static void bpf_iter_fini_tcp(void *priv_data)
3364 {
3365 struct bpf_tcp_iter_state *iter = priv_data;
3366
3367 bpf_iter_fini_seq_net(priv_data);
3368 kvfree(iter->batch);
3369 }
3370
3371 static const struct bpf_iter_seq_info tcp_seq_info = {
3372 .seq_ops = &bpf_iter_tcp_seq_ops,
3373 .init_seq_private = bpf_iter_init_tcp,
3374 .fini_seq_private = bpf_iter_fini_tcp,
3375 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3376 };
3377
3378 static const struct bpf_func_proto *
3379 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3380 const struct bpf_prog *prog)
3381 {
3382 switch (func_id) {
3383 case BPF_FUNC_setsockopt:
3384 return &bpf_sk_setsockopt_proto;
3385 case BPF_FUNC_getsockopt:
3386 return &bpf_sk_getsockopt_proto;
3387 default:
3388 return NULL;
3389 }
3390 }
3391
3392 static struct bpf_iter_reg tcp_reg_info = {
3393 .target = "tcp",
3394 .ctx_arg_info_size = 1,
3395 .ctx_arg_info = {
3396 { offsetof(struct bpf_iter__tcp, sk_common),
3397 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3398 },
3399 .get_func_proto = bpf_iter_tcp_get_func_proto,
3400 .seq_info = &tcp_seq_info,
3401 };
3402
3403 static void __init bpf_iter_register(void)
3404 {
3405 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3406 if (bpf_iter_reg_target(&tcp_reg_info))
3407 pr_warn("Warning: could not register bpf iterator tcp\n");
3408 }
3409
3410 #endif
3411
3412 void __init tcp_v4_init(void)
3413 {
3414 int cpu, res;
3415
3416 for_each_possible_cpu(cpu) {
3417 struct sock *sk;
3418
3419 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3420 IPPROTO_TCP, &init_net);
3421 if (res)
3422 panic("Failed to create the TCP control socket.\n");
3423 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3424
3425 /* Please enforce IP_DF and IPID==0 for RST and
3426 * ACK sent in SYN-RECV and TIME-WAIT state.
3427 */
3428 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3429
3430 per_cpu(ipv4_tcp_sk, cpu) = sk;
3431 }
3432 if (register_pernet_subsys(&tcp_sk_ops))
3433 panic("Failed to create the TCP control socket.\n");
3434
3435 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3436 bpf_iter_register();
3437 #endif
3438 }
3439