1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19 /*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/timewait_sock.h>
70 #include <net/xfrm.h>
71 #include <net/secure_seq.h>
72 #include <net/busy_poll.h>
73 #include <net/rstreason.h>
74
75 #include <linux/inet.h>
76 #include <linux/ipv6.h>
77 #include <linux/stddef.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80 #include <linux/inetdevice.h>
81 #include <linux/btf_ids.h>
82 #include <linux/skbuff_ref.h>
83
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86
87 #include <trace/events/tcp.h>
88 #include <trace/hooks/net.h>
89
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92 __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
97
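/* Per-CPU control socket, used below by tcp_v4_send_reset() and
 * tcp_v4_send_ack() to emit replies that are not tied to a full socket.
 */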
98 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
99 .bh_lock = INIT_LOCAL_LOCK(bh_lock),
100 };
101
102 static DEFINE_MUTEX(tcp_exit_batch_mutex);
103
104 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
105 {
106 return secure_tcp_seq(ip_hdr(skb)->daddr,
107 ip_hdr(skb)->saddr,
108 tcp_hdr(skb)->dest,
109 tcp_hdr(skb)->source);
110 }
111
112 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
113 {
114 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
115 }
116
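/* Decide whether a TIME-WAIT socket occupying the 4-tuple wanted for a new
 * connection can be safely reused: returns 1 if the caller may take over the
 * port pair, 0 otherwise.
 */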
117 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
118 {
119 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
120 const struct inet_timewait_sock *tw = inet_twsk(sktw);
121 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
122 struct tcp_sock *tp = tcp_sk(sk);
123 int ts_recent_stamp;
124
125 if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
126 reuse = 0;
127
128 if (reuse == 2) {
129 /* Still does not detect *everything* that goes through
130 * lo, since we require a loopback src or dst address
131 * or direct binding to 'lo' interface.
132 */
133 bool loopback = false;
134 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
135 loopback = true;
136 #if IS_ENABLED(CONFIG_IPV6)
137 if (tw->tw_family == AF_INET6) {
138 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
139 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
140 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
141 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
142 loopback = true;
143 } else
144 #endif
145 {
146 if (ipv4_is_loopback(tw->tw_daddr) ||
147 ipv4_is_loopback(tw->tw_rcv_saddr))
148 loopback = true;
149 }
150 if (!loopback)
151 reuse = 0;
152 }
153
154 /* With PAWS, it is safe from the viewpoint
155 of data integrity. Even without PAWS it is safe provided sequence
156 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
157
158 Actually, the idea is close to VJ's, only the timestamp cache is
159 held not per host but per port pair, and the TW bucket is used as the
160 state holder.
161
162 If TW bucket has been already destroyed we fall back to VJ's scheme
163 and use initial timestamp retrieved from peer table.
164 */
165 ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
166 if (ts_recent_stamp &&
167 (!twp || (reuse && time_after32(ktime_get_seconds(),
168 ts_recent_stamp)))) {
169 /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
170 * and releasing the bucket lock.
171 */
172 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
173 return 0;
174
175 /* In case of repair and re-using TIME-WAIT sockets we still
176 * want to be sure that it is safe as above but honor the
177 * sequence numbers and time stamps set as part of the repair
178 * process.
179 *
180 * Without this check re-using a TIME-WAIT socket with TCP
181 * repair would accumulate a -1 on the repair assigned
182 * sequence number. The first time it is reused the sequence
183 * is -1, the second time -2, etc. This fixes that issue
184 * without appearing to create any others.
185 */
186 if (likely(!tp->repair)) {
187 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
188
189 if (!seq)
190 seq = 1;
191 WRITE_ONCE(tp->write_seq, seq);
192 tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
193 tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
194 }
195
196 return 1;
197 }
198
199 return 0;
200 }
201 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
202
203 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
204 int addr_len)
205 {
206 /* This check is replicated from tcp_v4_connect() and intended to
207 * prevent BPF program called below from accessing bytes that are out
208 * of the bound specified by user in addr_len.
209 */
210 if (addr_len < sizeof(struct sockaddr_in))
211 return -EINVAL;
212
213 sock_owned_by_me(sk);
214
215 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
216 }
217
218 /* This will initiate an outgoing connection. */
219 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
220 {
221 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
222 struct inet_timewait_death_row *tcp_death_row;
223 struct inet_sock *inet = inet_sk(sk);
224 struct tcp_sock *tp = tcp_sk(sk);
225 struct ip_options_rcu *inet_opt;
226 struct net *net = sock_net(sk);
227 __be16 orig_sport, orig_dport;
228 __be32 daddr, nexthop;
229 struct flowi4 *fl4;
230 struct rtable *rt;
231 int err;
232
233 if (addr_len < sizeof(struct sockaddr_in))
234 return -EINVAL;
235
236 if (usin->sin_family != AF_INET)
237 return -EAFNOSUPPORT;
238
239 trace_android_vh_tcp_v4_connect(sk, uaddr);
240
241 nexthop = daddr = usin->sin_addr.s_addr;
242 inet_opt = rcu_dereference_protected(inet->inet_opt,
243 lockdep_sock_is_held(sk));
244 if (inet_opt && inet_opt->opt.srr) {
245 if (!daddr)
246 return -EINVAL;
247 nexthop = inet_opt->opt.faddr;
248 }
249
250 orig_sport = inet->inet_sport;
251 orig_dport = usin->sin_port;
252 fl4 = &inet->cork.fl.u.ip4;
253 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
254 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
255 orig_dport, sk);
256 if (IS_ERR(rt)) {
257 err = PTR_ERR(rt);
258 if (err == -ENETUNREACH)
259 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
260 return err;
261 }
262
263 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
264 ip_rt_put(rt);
265 return -ENETUNREACH;
266 }
267
268 if (!inet_opt || !inet_opt->opt.srr)
269 daddr = fl4->daddr;
270
271 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
272
273 if (!inet->inet_saddr) {
274 err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
275 if (err) {
276 ip_rt_put(rt);
277 return err;
278 }
279 } else {
280 sk_rcv_saddr_set(sk, inet->inet_saddr);
281 }
282
283 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
284 /* Reset inherited state */
285 tp->rx_opt.ts_recent = 0;
286 tp->rx_opt.ts_recent_stamp = 0;
287 if (likely(!tp->repair))
288 WRITE_ONCE(tp->write_seq, 0);
289 }
290
291 inet->inet_dport = usin->sin_port;
292 sk_daddr_set(sk, daddr);
293
294 inet_csk(sk)->icsk_ext_hdr_len = 0;
295 if (inet_opt)
296 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
297
298 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
299
300 /* Socket identity is still unknown (sport may be zero).
301 * However we set state to SYN-SENT and, without releasing the socket
302 * lock, select a source port, enter ourselves into the hash tables and
303 * complete initialization after this.
304 */
305 tcp_set_state(sk, TCP_SYN_SENT);
306 err = inet_hash_connect(tcp_death_row, sk);
307 if (err)
308 goto failure;
309
310 sk_set_txhash(sk);
311
312 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
313 inet->inet_sport, inet->inet_dport, sk);
314 if (IS_ERR(rt)) {
315 err = PTR_ERR(rt);
316 rt = NULL;
317 goto failure;
318 }
319 tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
320 /* OK, now commit destination to socket. */
321 sk->sk_gso_type = SKB_GSO_TCPV4;
322 sk_setup_caps(sk, &rt->dst);
323 rt = NULL;
324
325 if (likely(!tp->repair)) {
326 if (!tp->write_seq)
327 WRITE_ONCE(tp->write_seq,
328 secure_tcp_seq(inet->inet_saddr,
329 inet->inet_daddr,
330 inet->inet_sport,
331 usin->sin_port));
332 WRITE_ONCE(tp->tsoffset,
333 secure_tcp_ts_off(net, inet->inet_saddr,
334 inet->inet_daddr));
335 }
336
337 atomic_set(&inet->inet_id, get_random_u16());
338
339 if (tcp_fastopen_defer_connect(sk, &err))
340 return err;
341 if (err)
342 goto failure;
343
344 err = tcp_connect(sk);
345
346 if (err)
347 goto failure;
348
349 return 0;
350
351 failure:
352 /*
353 * This unhashes the socket and releases the local port,
354 * if necessary.
355 */
356 tcp_set_state(sk, TCP_CLOSE);
357 inet_bhash2_reset_saddr(sk);
358 ip_rt_put(rt);
359 sk->sk_route_caps = 0;
360 inet->inet_dport = 0;
361 return err;
362 }
363 EXPORT_SYMBOL(tcp_v4_connect);
364
365 /*
366 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
367 * It can be called through tcp_release_cb() if socket was owned by user
368 * at the time tcp_v4_err() was called to handle ICMP message.
369 */
370 void tcp_v4_mtu_reduced(struct sock *sk)
371 {
372 struct inet_sock *inet = inet_sk(sk);
373 struct dst_entry *dst;
374 u32 mtu;
375
376 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
377 return;
378 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
379 dst = inet_csk_update_pmtu(sk, mtu);
380 if (!dst)
381 return;
382
383 /* Something is about to go wrong... Remember the soft error
384 * in case this connection will not be able to recover.
385 */
386 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
387 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
388
389 mtu = dst_mtu(dst);
390
391 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
392 ip_sk_accept_pmtu(sk) &&
393 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
394 tcp_sync_mss(sk, mtu);
395
396 /* Resend the TCP packet because it's
397 * clear that the old packet has been
398 * dropped. This is the new "fast" path mtu
399 * discovery.
400 */
401 tcp_simple_retransmit(sk);
402 } /* else let the usual retransmit timer handle it */
403 }
404 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
405
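/* ICMP redirect: hand the cached route to its ->redirect() handler so the
 * next hop for this socket can be updated.
 */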
406 static void do_redirect(struct sk_buff *skb, struct sock *sk)
407 {
408 struct dst_entry *dst = __sk_dst_check(sk, 0);
409
410 if (dst)
411 dst->ops->redirect(dst, sk, skb);
412 }
413
414
415 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
416 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
417 {
418 struct request_sock *req = inet_reqsk(sk);
419 struct net *net = sock_net(sk);
420
421 /* ICMPs are not backlogged, hence we cannot get
422 * an established socket here.
423 */
424 if (seq != tcp_rsk(req)->snt_isn) {
425 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
426 } else if (abort) {
427 /*
428 * Still in SYN_RECV, just remove it silently.
429 * There is no good way to pass the error to the newly
430 * created socket, and POSIX does not want network
431 * errors returned from accept().
432 */
433 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
434 tcp_listendrop(req->rsk_listener);
435 }
436 reqsk_put(req);
437 }
438 EXPORT_SYMBOL(tcp_req_err);
439
440 /* TCP-LD (RFC 6069) logic */
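/* On an ICMP error matching the oldest unacknowledged segment, undo one RTO
 * backoff step and re-arm (or immediately fire) the retransmit timer.
 */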
441 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
442 {
443 struct inet_connection_sock *icsk = inet_csk(sk);
444 struct tcp_sock *tp = tcp_sk(sk);
445 struct sk_buff *skb;
446 s32 remaining;
447 u32 delta_us;
448
449 if (sock_owned_by_user(sk))
450 return;
451
452 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
453 !icsk->icsk_backoff)
454 return;
455
456 skb = tcp_rtx_queue_head(sk);
457 if (WARN_ON_ONCE(!skb))
458 return;
459
460 icsk->icsk_backoff--;
461 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
462 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
463
464 tcp_mstamp_refresh(tp);
465 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
466 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
467
468 if (remaining > 0) {
469 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
470 remaining, TCP_RTO_MAX);
471 } else {
472 /* RTO revert clocked out retransmission.
473 * Will retransmit now.
474 */
475 tcp_retransmit_timer(sk);
476 }
477 }
478 EXPORT_SYMBOL(tcp_ld_RTO_revert);
479
480 /*
481 * This routine is called by the ICMP module when it gets some
482 * sort of error condition. If err < 0 then the socket should
483 * be closed and the error returned to the user. If err > 0
484 * it's just the icmp type << 8 | icmp code. After adjustment
485 * header points to the first 8 bytes of the tcp header. We need
486 * to find the appropriate port.
487 *
488 * The locking strategy used here is very "optimistic". When
489 * someone else accesses the socket the ICMP is just dropped
490 * and for some paths there is no check at all.
491 * A more general error queue to queue errors for later handling
492 * is probably better.
493 *
494 */
495
496 int tcp_v4_err(struct sk_buff *skb, u32 info)
497 {
498 const struct iphdr *iph = (const struct iphdr *)skb->data;
499 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
500 struct tcp_sock *tp;
501 const int type = icmp_hdr(skb)->type;
502 const int code = icmp_hdr(skb)->code;
503 struct sock *sk;
504 struct request_sock *fastopen;
505 u32 seq, snd_una;
506 int err;
507 struct net *net = dev_net(skb->dev);
508
509 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
510 iph->daddr, th->dest, iph->saddr,
511 ntohs(th->source), inet_iif(skb), 0);
512 if (!sk) {
513 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
514 return -ENOENT;
515 }
516 if (sk->sk_state == TCP_TIME_WAIT) {
517 /* To increase the counter of ignored icmps for TCP-AO */
518 tcp_ao_ignore_icmp(sk, AF_INET, type, code);
519 inet_twsk_put(inet_twsk(sk));
520 return 0;
521 }
522 seq = ntohl(th->seq);
523 if (sk->sk_state == TCP_NEW_SYN_RECV) {
524 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
525 type == ICMP_TIME_EXCEEDED ||
526 (type == ICMP_DEST_UNREACH &&
527 (code == ICMP_NET_UNREACH ||
528 code == ICMP_HOST_UNREACH)));
529 return 0;
530 }
531
532 if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
533 sock_put(sk);
534 return 0;
535 }
536
537 bh_lock_sock(sk);
538 /* If too many ICMPs get dropped on busy
539 * servers this needs to be solved differently.
540 * We do take care of PMTU discovery (RFC1191) special case :
541 * we can receive locally generated ICMP messages while socket is held.
542 */
543 if (sock_owned_by_user(sk)) {
544 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
545 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
546 }
547 if (sk->sk_state == TCP_CLOSE)
548 goto out;
549
550 if (static_branch_unlikely(&ip4_min_ttl)) {
551 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
552 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
553 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
554 goto out;
555 }
556 }
557
558 tp = tcp_sk(sk);
559 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
560 fastopen = rcu_dereference(tp->fastopen_rsk);
561 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
562 if (sk->sk_state != TCP_LISTEN &&
563 !between(seq, snd_una, tp->snd_nxt)) {
564 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
565 goto out;
566 }
567
568 switch (type) {
569 case ICMP_REDIRECT:
570 if (!sock_owned_by_user(sk))
571 do_redirect(skb, sk);
572 goto out;
573 case ICMP_SOURCE_QUENCH:
574 /* Just silently ignore these. */
575 goto out;
576 case ICMP_PARAMETERPROB:
577 err = EPROTO;
578 break;
579 case ICMP_DEST_UNREACH:
580 if (code > NR_ICMP_UNREACH)
581 goto out;
582
583 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
584 /* We are not interested in TCP_LISTEN and open_requests
585 * (SYN-ACKs sent out by Linux are always < 576 bytes so
586 * they should go through unfragmented).
587 */
588 if (sk->sk_state == TCP_LISTEN)
589 goto out;
590
591 WRITE_ONCE(tp->mtu_info, info);
592 if (!sock_owned_by_user(sk)) {
593 tcp_v4_mtu_reduced(sk);
594 } else {
595 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
596 sock_hold(sk);
597 }
598 goto out;
599 }
600
601 err = icmp_err_convert[code].errno;
602 /* check if this ICMP message allows revert of backoff.
603 * (see RFC 6069)
604 */
605 if (!fastopen &&
606 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
607 tcp_ld_RTO_revert(sk, seq);
608 break;
609 case ICMP_TIME_EXCEEDED:
610 err = EHOSTUNREACH;
611 break;
612 default:
613 goto out;
614 }
615
616 switch (sk->sk_state) {
617 case TCP_SYN_SENT:
618 case TCP_SYN_RECV:
619 /* Only in fast or simultaneous open. If a fast open socket is
620 * already accepted it is treated as a connected one below.
621 */
622 if (fastopen && !fastopen->sk)
623 break;
624
625 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
626
627 if (!sock_owned_by_user(sk))
628 tcp_done_with_error(sk, err);
629 else
630 WRITE_ONCE(sk->sk_err_soft, err);
631 goto out;
632 }
633
634 /* If we've already connected we will keep trying
635 * until we time out, or the user gives up.
636 *
637 * rfc1122 4.2.3.9 allows to consider as hard errors
638 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
639 * but it is obsoleted by pmtu discovery).
640 *
641 * Note that in the modern internet, where routing is unreliable
642 * and broken firewalls sit in each dark corner sending random
643 * errors ordered by their masters, even these two messages finally lose
644 * their original sense (even Linux sends invalid PORT_UNREACHs).
645 *
646 * Now we are in compliance with RFCs.
647 * --ANK (980905)
648 */
649
650 if (!sock_owned_by_user(sk) &&
651 inet_test_bit(RECVERR, sk)) {
652 WRITE_ONCE(sk->sk_err, err);
653 sk_error_report(sk);
654 } else { /* Only an error on timeout */
655 WRITE_ONCE(sk->sk_err_soft, err);
656 }
657
658 out:
659 bh_unlock_sock(sk);
660 sock_put(sk);
661 return 0;
662 }
663
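/* Prime the TCP checksum for offload: store the pseudo-header sum in
 * th->check and point csum_start/csum_offset at the TCP checksum field so
 * the device (or software fallback) can finish the job.
 */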
664 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
665 {
666 struct tcphdr *th = tcp_hdr(skb);
667
668 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
669 skb->csum_start = skb_transport_header(skb) - skb->head;
670 skb->csum_offset = offsetof(struct tcphdr, check);
671 }
672
673 /* This routine computes an IPv4 TCP checksum. */
674 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
675 {
676 const struct inet_sock *inet = inet_sk(sk);
677
678 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
679 }
680 EXPORT_SYMBOL(tcp_v4_send_check);
681
682 #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32))
683
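/* Sign an outgoing RST with TCP-AO when the offending segment carried an AO
 * option. Returns true if the RST must be dropped instead (no usable key or
 * hashing failed), false once the AO option has been appended and signed.
 */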
684 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
685 const struct tcp_ao_hdr *aoh,
686 struct ip_reply_arg *arg, struct tcphdr *reply,
687 __be32 reply_options[REPLY_OPTIONS_LEN])
688 {
689 #ifdef CONFIG_TCP_AO
690 int sdif = tcp_v4_sdif(skb);
691 int dif = inet_iif(skb);
692 int l3index = sdif ? dif : 0;
693 bool allocated_traffic_key;
694 struct tcp_ao_key *key;
695 char *traffic_key;
696 bool drop = true;
697 u32 ao_sne = 0;
698 u8 keyid;
699
700 rcu_read_lock();
701 if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
702 &key, &traffic_key, &allocated_traffic_key,
703 &keyid, &ao_sne))
704 goto out;
705
706 reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
707 (aoh->rnext_keyid << 8) | keyid);
708 arg->iov[0].iov_len += tcp_ao_len_aligned(key);
709 reply->doff = arg->iov[0].iov_len / 4;
710
711 if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
712 key, traffic_key,
713 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
714 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
715 reply, ao_sne))
716 goto out;
717 drop = false;
718 out:
719 rcu_read_unlock();
720 if (allocated_traffic_key)
721 kfree(traffic_key);
722 return drop;
723 #else
724 return true;
725 #endif
726 }
727
728 /*
729 * This routine will send an RST to the other tcp.
730 *
731 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
732 * for reset.
733 * Answer: if a packet caused RST, it is not for a socket
734 * existing in our system, if it is matched to a socket,
735 * it is just duplicate segment or bug in other side's TCP.
736 * So that we build reply only basing on parameters
737 * arrived with segment.
738 * Exception: precedence violation. We do not implement it in any case.
739 */
740
741 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
742 enum sk_rst_reason reason)
743 {
744 const struct tcphdr *th = tcp_hdr(skb);
745 struct {
746 struct tcphdr th;
747 __be32 opt[REPLY_OPTIONS_LEN];
748 } rep;
749 const __u8 *md5_hash_location = NULL;
750 const struct tcp_ao_hdr *aoh;
751 struct ip_reply_arg arg;
752 #ifdef CONFIG_TCP_MD5SIG
753 struct tcp_md5sig_key *key = NULL;
754 unsigned char newhash[16];
755 struct sock *sk1 = NULL;
756 int genhash;
757 #endif
758 u64 transmit_time = 0;
759 struct sock *ctl_sk;
760 struct net *net;
761 u32 txhash = 0;
762
763 /* Never send a reset in response to a reset. */
764 if (th->rst)
765 return;
766
767 /* If sk not NULL, it means we did a successful lookup and incoming
768 * route had to be correct. prequeue might have dropped our dst.
769 */
770 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
771 return;
772
773 /* Swap the send and the receive. */
774 memset(&rep, 0, sizeof(rep));
775 rep.th.dest = th->source;
776 rep.th.source = th->dest;
777 rep.th.doff = sizeof(struct tcphdr) / 4;
778 rep.th.rst = 1;
779
780 if (th->ack) {
781 rep.th.seq = th->ack_seq;
782 } else {
783 rep.th.ack = 1;
784 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
785 skb->len - (th->doff << 2));
786 }
787
788 memset(&arg, 0, sizeof(arg));
789 arg.iov[0].iov_base = (unsigned char *)&rep;
790 arg.iov[0].iov_len = sizeof(rep.th);
791
792 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
793
794 /* Invalid TCP option size or twice included auth */
795 if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
796 return;
797
798 if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
799 return;
800
801 #ifdef CONFIG_TCP_MD5SIG
802 rcu_read_lock();
803 if (sk && sk_fullsock(sk)) {
804 const union tcp_md5_addr *addr;
805 int l3index;
806
807 /* sdif set, means packet ingressed via a device
808 * in an L3 domain and inet_iif is set to it.
809 */
810 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
811 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
812 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
813 } else if (md5_hash_location) {
814 const union tcp_md5_addr *addr;
815 int sdif = tcp_v4_sdif(skb);
816 int dif = inet_iif(skb);
817 int l3index;
818
819 /*
820 * The active side is lost. Try to find the listening socket through
821 * the source port, and then find the md5 key through the listening socket.
822 * We do not lose security here:
823 * the incoming packet is checked against the md5 hash of the found key,
824 * and no RST is generated if the md5 hash doesn't match.
825 */
826 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
827 NULL, 0, ip_hdr(skb)->saddr,
828 th->source, ip_hdr(skb)->daddr,
829 ntohs(th->source), dif, sdif);
830 /* don't send rst if it can't find key */
831 if (!sk1)
832 goto out;
833
834 /* sdif set, means packet ingressed via a device
835 * in an L3 domain and dif is set to it.
836 */
837 l3index = sdif ? dif : 0;
838 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
839 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
840 if (!key)
841 goto out;
842
843
844 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
845 if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
846 goto out;
847
848 }
849
850 if (key) {
851 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
852 (TCPOPT_NOP << 16) |
853 (TCPOPT_MD5SIG << 8) |
854 TCPOLEN_MD5SIG);
855 /* Update length and the length the header thinks exists */
856 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
857 rep.th.doff = arg.iov[0].iov_len / 4;
858
859 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
860 key, ip_hdr(skb)->saddr,
861 ip_hdr(skb)->daddr, &rep.th);
862 }
863 #endif
864 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
865 if (rep.opt[0] == 0) {
866 __be32 mrst = mptcp_reset_option(skb);
867
868 if (mrst) {
869 rep.opt[0] = mrst;
870 arg.iov[0].iov_len += sizeof(mrst);
871 rep.th.doff = arg.iov[0].iov_len / 4;
872 }
873 }
874
875 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
876 ip_hdr(skb)->saddr, /* XXX */
877 arg.iov[0].iov_len, IPPROTO_TCP, 0);
878 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
879 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
880
881 /* When socket is gone, all binding information is lost.
882 * Routing might fail in this case. No choice here: if we choose to force
883 * the input interface, we will misroute in case of an asymmetric route.
884 */
885 if (sk)
886 arg.bound_dev_if = sk->sk_bound_dev_if;
887
888 trace_tcp_send_reset(sk, skb, reason);
889
890 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
891 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
892
893 arg.tos = ip_hdr(skb)->tos;
894 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
895 local_bh_disable();
896 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
897 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
898
899 sock_net_set(ctl_sk, net);
900 if (sk) {
901 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
902 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
903 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
904 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
905 transmit_time = tcp_transmit_time(sk);
906 xfrm_sk_clone_policy(ctl_sk, sk);
907 txhash = (sk->sk_state == TCP_TIME_WAIT) ?
908 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
909 } else {
910 ctl_sk->sk_mark = 0;
911 ctl_sk->sk_priority = 0;
912 }
913 ip_send_unicast_reply(ctl_sk,
914 skb, &TCP_SKB_CB(skb)->header.h4.opt,
915 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
916 &arg, arg.iov[0].iov_len,
917 transmit_time, txhash);
918
919 xfrm_sk_free_policy(ctl_sk);
920 sock_net_set(ctl_sk, &init_net);
921 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
922 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
923 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
924 local_bh_enable();
925
926 #ifdef CONFIG_TCP_MD5SIG
927 out:
928 rcu_read_unlock();
929 #endif
930 }
931
932 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
933 outside socket context, is ugly, certainly. What can I do?
934 */
935
936 static void tcp_v4_send_ack(const struct sock *sk,
937 struct sk_buff *skb, u32 seq, u32 ack,
938 u32 win, u32 tsval, u32 tsecr, int oif,
939 struct tcp_key *key,
940 int reply_flags, u8 tos, u32 txhash)
941 {
942 const struct tcphdr *th = tcp_hdr(skb);
943 struct {
944 struct tcphdr th;
945 __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
946 } rep;
947 struct net *net = sock_net(sk);
948 struct ip_reply_arg arg;
949 struct sock *ctl_sk;
950 u64 transmit_time;
951
952 memset(&rep.th, 0, sizeof(struct tcphdr));
953 memset(&arg, 0, sizeof(arg));
954
955 arg.iov[0].iov_base = (unsigned char *)&rep;
956 arg.iov[0].iov_len = sizeof(rep.th);
957 if (tsecr) {
958 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
959 (TCPOPT_TIMESTAMP << 8) |
960 TCPOLEN_TIMESTAMP);
961 rep.opt[1] = htonl(tsval);
962 rep.opt[2] = htonl(tsecr);
963 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
964 }
965
966 /* Swap the send and the receive. */
967 rep.th.dest = th->source;
968 rep.th.source = th->dest;
969 rep.th.doff = arg.iov[0].iov_len / 4;
970 rep.th.seq = htonl(seq);
971 rep.th.ack_seq = htonl(ack);
972 rep.th.ack = 1;
973 rep.th.window = htons(win);
974
975 #ifdef CONFIG_TCP_MD5SIG
976 if (tcp_key_is_md5(key)) {
977 int offset = (tsecr) ? 3 : 0;
978
979 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
980 (TCPOPT_NOP << 16) |
981 (TCPOPT_MD5SIG << 8) |
982 TCPOLEN_MD5SIG);
983 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
984 rep.th.doff = arg.iov[0].iov_len/4;
985
986 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
987 key->md5_key, ip_hdr(skb)->saddr,
988 ip_hdr(skb)->daddr, &rep.th);
989 }
990 #endif
991 #ifdef CONFIG_TCP_AO
992 if (tcp_key_is_ao(key)) {
993 int offset = (tsecr) ? 3 : 0;
994
995 rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
996 (tcp_ao_len(key->ao_key) << 16) |
997 (key->ao_key->sndid << 8) |
998 key->rcv_next);
999 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
1000 rep.th.doff = arg.iov[0].iov_len / 4;
1001
1002 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
1003 key->ao_key, key->traffic_key,
1004 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
1005 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
1006 &rep.th, key->sne);
1007 }
1008 #endif
1009 arg.flags = reply_flags;
1010 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
1011 ip_hdr(skb)->saddr, /* XXX */
1012 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1013 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1014 if (oif)
1015 arg.bound_dev_if = oif;
1016 arg.tos = tos;
1017 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1018 local_bh_disable();
1019 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
1020 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1021 sock_net_set(ctl_sk, net);
1022 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1023 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1024 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1025 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1026 transmit_time = tcp_transmit_time(sk);
1027 ip_send_unicast_reply(ctl_sk,
1028 skb, &TCP_SKB_CB(skb)->header.h4.opt,
1029 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1030 &arg, arg.iov[0].iov_len,
1031 transmit_time, txhash);
1032
1033 sock_net_set(ctl_sk, &init_net);
1034 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1035 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1036 local_bh_enable();
1037 }
1038
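/* ACK a segment that hit a TIME-WAIT socket, signing it with TCP-AO or MD5
 * if the timewait state still carries a matching key.
 */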
1039 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1040 {
1041 struct inet_timewait_sock *tw = inet_twsk(sk);
1042 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1043 struct tcp_key key = {};
1044 #ifdef CONFIG_TCP_AO
1045 struct tcp_ao_info *ao_info;
1046
1047 if (static_branch_unlikely(&tcp_ao_needed.key)) {
1048 /* FIXME: the segment to-be-acked is not verified yet */
1049 ao_info = rcu_dereference(tcptw->ao_info);
1050 if (ao_info) {
1051 const struct tcp_ao_hdr *aoh;
1052
1053 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1054 inet_twsk_put(tw);
1055 return;
1056 }
1057
1058 if (aoh)
1059 key.ao_key = tcp_ao_established_key(sk, ao_info,
1060 aoh->rnext_keyid, -1);
1061 }
1062 }
1063 if (key.ao_key) {
1064 struct tcp_ao_key *rnext_key;
1065
1066 key.traffic_key = snd_other_key(key.ao_key);
1067 key.sne = READ_ONCE(ao_info->snd_sne);
1068 rnext_key = READ_ONCE(ao_info->rnext_key);
1069 key.rcv_next = rnext_key->rcvid;
1070 key.type = TCP_KEY_AO;
1071 #else
1072 if (0) {
1073 #endif
1074 } else if (static_branch_tcp_md5()) {
1075 key.md5_key = tcp_twsk_md5_key(tcptw);
1076 if (key.md5_key)
1077 key.type = TCP_KEY_MD5;
1078 }
1079
1080 tcp_v4_send_ack(sk, skb,
1081 tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
1082 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1083 tcp_tw_tsval(tcptw),
1084 READ_ONCE(tcptw->tw_ts_recent),
1085 tw->tw_bound_dev_if, &key,
1086 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1087 tw->tw_tos,
1088 tw->tw_txhash);
1089
1090 inet_twsk_put(tw);
1091 }
1092
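/* ACK a segment received for a pending request socket (regular SYN-RECV or
 * Fast Open), attaching TCP-AO or MD5 signatures when configured.
 */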
1093 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1094 struct request_sock *req)
1095 {
1096 struct tcp_key key = {};
1097
1098 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1099 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1100 */
1101 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1102 tcp_sk(sk)->snd_nxt;
1103
1104 #ifdef CONFIG_TCP_AO
1105 if (static_branch_unlikely(&tcp_ao_needed.key) &&
1106 tcp_rsk_used_ao(req)) {
1107 const union tcp_md5_addr *addr;
1108 const struct tcp_ao_hdr *aoh;
1109 int l3index;
1110
1111 /* Invalid TCP option size or twice included auth */
1112 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1113 return;
1114 if (!aoh)
1115 return;
1116
1117 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1118 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1119 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1120 aoh->rnext_keyid, -1);
1121 if (unlikely(!key.ao_key)) {
1122 /* Send ACK with any matching MKT for the peer */
1123 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1124 /* Matching key disappeared (user removed the key?)
1125 * let the handshake time out.
1126 */
1127 if (!key.ao_key) {
1128 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1129 addr,
1130 ntohs(tcp_hdr(skb)->source),
1131 &ip_hdr(skb)->daddr,
1132 ntohs(tcp_hdr(skb)->dest));
1133 return;
1134 }
1135 }
1136 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1137 if (!key.traffic_key)
1138 return;
1139
1140 key.type = TCP_KEY_AO;
1141 key.rcv_next = aoh->keyid;
1142 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1143 #else
1144 if (0) {
1145 #endif
1146 } else if (static_branch_tcp_md5()) {
1147 const union tcp_md5_addr *addr;
1148 int l3index;
1149
1150 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1151 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1152 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1153 if (key.md5_key)
1154 key.type = TCP_KEY_MD5;
1155 }
1156
1157 tcp_v4_send_ack(sk, skb, seq,
1158 tcp_rsk(req)->rcv_nxt,
1159 tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1160 tcp_rsk_tsval(tcp_rsk(req)),
1161 READ_ONCE(req->ts_recent),
1162 0, &key,
1163 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1164 ip_hdr(skb)->tos,
1165 READ_ONCE(tcp_rsk(req)->txhash));
1166 if (tcp_key_is_ao(&key))
1167 kfree(key.traffic_key);
1168 }
1169
1170 /*
1171 * Send a SYN-ACK after having received a SYN.
1172 * This still operates on a request_sock only, not on a big
1173 * socket.
1174 */
1175 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1176 struct flowi *fl,
1177 struct request_sock *req,
1178 struct tcp_fastopen_cookie *foc,
1179 enum tcp_synack_type synack_type,
1180 struct sk_buff *syn_skb)
1181 {
1182 const struct inet_request_sock *ireq = inet_rsk(req);
1183 struct flowi4 fl4;
1184 int err = -1;
1185 struct sk_buff *skb;
1186 u8 tos;
1187
1188 /* First, grab a route. */
1189 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1190 return -1;
1191
1192 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1193
1194 if (skb) {
1195 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1196
1197 tos = READ_ONCE(inet_sk(sk)->tos);
1198
1199 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1200 tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1201 (tos & INET_ECN_MASK);
1202
1203 if (!INET_ECN_is_capable(tos) &&
1204 tcp_bpf_ca_needs_ecn((struct sock *)req))
1205 tos |= INET_ECN_ECT_0;
1206
1207 rcu_read_lock();
1208 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1209 ireq->ir_rmt_addr,
1210 rcu_dereference(ireq->ireq_opt),
1211 tos);
1212 rcu_read_unlock();
1213 err = net_xmit_eval(err);
1214 }
1215
1216 return err;
1217 }
1218
1219 /*
1220 * IPv4 request_sock destructor.
1221 */
1222 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1223 {
1224 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1225 }
1226
1227 #ifdef CONFIG_TCP_MD5SIG
1228 /*
1229 * RFC2385 MD5 checksumming requires a mapping of
1230 * IP address->MD5 Key.
1231 * We need to maintain these in the sk structure.
1232 */
1233
1234 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1235 EXPORT_SYMBOL(tcp_md5_needed);
1236
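/* Key selection order: a key bound to an L3 domain beats one that is not;
 * among otherwise equal keys the longer prefix wins.
 */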
1237 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1238 {
1239 if (!old)
1240 return true;
1241
1242 /* l3index always overrides non-l3index */
1243 if (old->l3index && new->l3index == 0)
1244 return false;
1245 if (old->l3index == 0 && new->l3index)
1246 return true;
1247
1248 return old->prefixlen < new->prefixlen;
1249 }
1250
1251 /* Find the Key structure for an address. */
1252 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1253 const union tcp_md5_addr *addr,
1254 int family, bool any_l3index)
1255 {
1256 const struct tcp_sock *tp = tcp_sk(sk);
1257 struct tcp_md5sig_key *key;
1258 const struct tcp_md5sig_info *md5sig;
1259 __be32 mask;
1260 struct tcp_md5sig_key *best_match = NULL;
1261 bool match;
1262
1263 /* caller either holds rcu_read_lock() or socket lock */
1264 md5sig = rcu_dereference_check(tp->md5sig_info,
1265 lockdep_sock_is_held(sk));
1266 if (!md5sig)
1267 return NULL;
1268
1269 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1270 lockdep_sock_is_held(sk)) {
1271 if (key->family != family)
1272 continue;
1273 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1274 key->l3index != l3index)
1275 continue;
1276 if (family == AF_INET) {
1277 mask = inet_make_mask(key->prefixlen);
1278 match = (key->addr.a4.s_addr & mask) ==
1279 (addr->a4.s_addr & mask);
1280 #if IS_ENABLED(CONFIG_IPV6)
1281 } else if (family == AF_INET6) {
1282 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1283 key->prefixlen);
1284 #endif
1285 } else {
1286 match = false;
1287 }
1288
1289 if (match && better_md5_match(best_match, key))
1290 best_match = key;
1291 }
1292 return best_match;
1293 }
1294 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1295
1296 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1297 const union tcp_md5_addr *addr,
1298 int family, u8 prefixlen,
1299 int l3index, u8 flags)
1300 {
1301 const struct tcp_sock *tp = tcp_sk(sk);
1302 struct tcp_md5sig_key *key;
1303 unsigned int size = sizeof(struct in_addr);
1304 const struct tcp_md5sig_info *md5sig;
1305
1306 /* caller either holds rcu_read_lock() or socket lock */
1307 md5sig = rcu_dereference_check(tp->md5sig_info,
1308 lockdep_sock_is_held(sk));
1309 if (!md5sig)
1310 return NULL;
1311 #if IS_ENABLED(CONFIG_IPV6)
1312 if (family == AF_INET6)
1313 size = sizeof(struct in6_addr);
1314 #endif
1315 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1316 lockdep_sock_is_held(sk)) {
1317 if (key->family != family)
1318 continue;
1319 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1320 continue;
1321 if (key->l3index != l3index)
1322 continue;
1323 if (!memcmp(&key->addr, addr, size) &&
1324 key->prefixlen == prefixlen)
1325 return key;
1326 }
1327 return NULL;
1328 }
1329
1330 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1331 const struct sock *addr_sk)
1332 {
1333 const union tcp_md5_addr *addr;
1334 int l3index;
1335
1336 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1337 addr_sk->sk_bound_dev_if);
1338 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1339 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1340 }
1341 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1342
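/* Allocate the per-socket MD5 key list head. sk_gso_disable() is called
 * because MD5-signed segments have to be built and hashed individually,
 * which segmentation offload cannot do.
 */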
1343 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1344 {
1345 struct tcp_sock *tp = tcp_sk(sk);
1346 struct tcp_md5sig_info *md5sig;
1347
1348 md5sig = kmalloc(sizeof(*md5sig), gfp);
1349 if (!md5sig)
1350 return -ENOMEM;
1351
1352 sk_gso_disable(sk);
1353 INIT_HLIST_HEAD(&md5sig->head);
1354 rcu_assign_pointer(tp->md5sig_info, md5sig);
1355 return 0;
1356 }
1357
1358 /* This can be called on a newly created socket, from other files */
1359 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1360 int family, u8 prefixlen, int l3index, u8 flags,
1361 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1362 {
1363 /* Add Key to the list */
1364 struct tcp_md5sig_key *key;
1365 struct tcp_sock *tp = tcp_sk(sk);
1366 struct tcp_md5sig_info *md5sig;
1367
1368 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1369 if (key) {
1370 /* Pre-existing entry - just update that one.
1371 * Note that the key might be used concurrently.
1372 * data_race() is telling KCSAN that we do not care about
1373 * key mismatches, since changing the MD5 key on live flows
1374 * can lead to packet drops.
1375 */
1376 data_race(memcpy(key->key, newkey, newkeylen));
1377
1378 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1379 * Also note that a reader could catch new key->keylen value
1380 * but old key->key[], this is the reason we use __GFP_ZERO
1381 * at sock_kmalloc() time below these lines.
1382 */
1383 WRITE_ONCE(key->keylen, newkeylen);
1384
1385 return 0;
1386 }
1387
1388 md5sig = rcu_dereference_protected(tp->md5sig_info,
1389 lockdep_sock_is_held(sk));
1390
1391 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1392 if (!key)
1393 return -ENOMEM;
1394
1395 memcpy(key->key, newkey, newkeylen);
1396 key->keylen = newkeylen;
1397 key->family = family;
1398 key->prefixlen = prefixlen;
1399 key->l3index = l3index;
1400 key->flags = flags;
1401 memcpy(&key->addr, addr,
1402 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1403 sizeof(struct in_addr));
1404 hlist_add_head_rcu(&key->node, &md5sig->head);
1405 return 0;
1406 }
1407
1408 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1409 int family, u8 prefixlen, int l3index, u8 flags,
1410 const u8 *newkey, u8 newkeylen)
1411 {
1412 struct tcp_sock *tp = tcp_sk(sk);
1413
1414 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1415 if (tcp_md5_alloc_sigpool())
1416 return -ENOMEM;
1417
1418 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1419 tcp_md5_release_sigpool();
1420 return -ENOMEM;
1421 }
1422
1423 if (!static_branch_inc(&tcp_md5_needed.key)) {
1424 struct tcp_md5sig_info *md5sig;
1425
1426 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1427 rcu_assign_pointer(tp->md5sig_info, NULL);
1428 kfree_rcu(md5sig, rcu);
1429 tcp_md5_release_sigpool();
1430 return -EUSERS;
1431 }
1432 }
1433
1434 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1435 newkey, newkeylen, GFP_KERNEL);
1436 }
1437 EXPORT_SYMBOL(tcp_md5_do_add);
1438
1439 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1440 int family, u8 prefixlen, int l3index,
1441 struct tcp_md5sig_key *key)
1442 {
1443 struct tcp_sock *tp = tcp_sk(sk);
1444
1445 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1446 tcp_md5_add_sigpool();
1447
1448 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1449 tcp_md5_release_sigpool();
1450 return -ENOMEM;
1451 }
1452
1453 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1454 struct tcp_md5sig_info *md5sig;
1455
1456 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1457 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1458 rcu_assign_pointer(tp->md5sig_info, NULL);
1459 kfree_rcu(md5sig, rcu);
1460 tcp_md5_release_sigpool();
1461 return -EUSERS;
1462 }
1463 }
1464
1465 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1466 key->flags, key->key, key->keylen,
1467 sk_gfp_mask(sk, GFP_ATOMIC));
1468 }
1469 EXPORT_SYMBOL(tcp_md5_key_copy);
1470
1471 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1472 u8 prefixlen, int l3index, u8 flags)
1473 {
1474 struct tcp_md5sig_key *key;
1475
1476 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1477 if (!key)
1478 return -ENOENT;
1479 hlist_del_rcu(&key->node);
1480 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1481 kfree_rcu(key, rcu);
1482 return 0;
1483 }
1484 EXPORT_SYMBOL(tcp_md5_do_del);
1485
1486 void tcp_clear_md5_list(struct sock *sk)
1487 {
1488 struct tcp_sock *tp = tcp_sk(sk);
1489 struct tcp_md5sig_key *key;
1490 struct hlist_node *n;
1491 struct tcp_md5sig_info *md5sig;
1492
1493 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1494
1495 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1496 hlist_del_rcu(&key->node);
1497 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1498 kfree_rcu(key, rcu);
1499 }
1500 }
1501
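/* setsockopt(TCP_MD5SIG/TCP_MD5SIG_EXT) handler. A minimal userspace sketch
 * of what lands here (error handling omitted; 'peer' and 'secret' are
 * placeholders):
 *
 *	struct tcp_md5sig md5 = {};
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr = peer;
 *	md5.tcpm_keylen = strlen(secret);
 *	memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */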
1502 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1503 sockptr_t optval, int optlen)
1504 {
1505 struct tcp_md5sig cmd;
1506 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1507 const union tcp_md5_addr *addr;
1508 u8 prefixlen = 32;
1509 int l3index = 0;
1510 bool l3flag;
1511 u8 flags;
1512
1513 if (optlen < sizeof(cmd))
1514 return -EINVAL;
1515
1516 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1517 return -EFAULT;
1518
1519 if (sin->sin_family != AF_INET)
1520 return -EINVAL;
1521
1522 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1523 l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1524
1525 if (optname == TCP_MD5SIG_EXT &&
1526 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1527 prefixlen = cmd.tcpm_prefixlen;
1528 if (prefixlen > 32)
1529 return -EINVAL;
1530 }
1531
1532 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1533 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1534 struct net_device *dev;
1535
1536 rcu_read_lock();
1537 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1538 if (dev && netif_is_l3_master(dev))
1539 l3index = dev->ifindex;
1540
1541 rcu_read_unlock();
1542
1543 /* ok to reference set/not set outside of rcu;
1544 * right now device MUST be an L3 master
1545 */
1546 if (!dev || !l3index)
1547 return -EINVAL;
1548 }
1549
1550 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1551
1552 if (!cmd.tcpm_keylen)
1553 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1554
1555 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1556 return -EINVAL;
1557
1558 /* Don't allow keys for peers that have a matching TCP-AO key.
1559 * See the comment in tcp_ao_add_cmd()
1560 */
1561 if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1562 return -EKEYREJECTED;
1563
1564 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1565 cmd.tcpm_key, cmd.tcpm_keylen);
1566 }
1567
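/* Feed the RFC 2385 pseudo-header (saddr, daddr, protocol, TCP length) and a
 * copy of the TCP header with a zeroed checksum field into the hash.
 */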
1568 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1569 __be32 daddr, __be32 saddr,
1570 const struct tcphdr *th, int nbytes)
1571 {
1572 struct tcp4_pseudohdr *bp;
1573 struct scatterlist sg;
1574 struct tcphdr *_th;
1575
1576 bp = hp->scratch;
1577 bp->saddr = saddr;
1578 bp->daddr = daddr;
1579 bp->pad = 0;
1580 bp->protocol = IPPROTO_TCP;
1581 bp->len = cpu_to_be16(nbytes);
1582
1583 _th = (struct tcphdr *)(bp + 1);
1584 memcpy(_th, th, sizeof(*th));
1585 _th->check = 0;
1586
1587 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1588 ahash_request_set_crypt(hp->req, &sg, NULL,
1589 sizeof(*bp) + sizeof(*th));
1590 return crypto_ahash_update(hp->req);
1591 }
1592
1593 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1594 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1595 {
1596 struct tcp_sigpool hp;
1597
1598 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1599 goto clear_hash_nostart;
1600
1601 if (crypto_ahash_init(hp.req))
1602 goto clear_hash;
1603 if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1604 goto clear_hash;
1605 if (tcp_md5_hash_key(&hp, key))
1606 goto clear_hash;
1607 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1608 if (crypto_ahash_final(hp.req))
1609 goto clear_hash;
1610
1611 tcp_sigpool_end(&hp);
1612 return 0;
1613
1614 clear_hash:
1615 tcp_sigpool_end(&hp);
1616 clear_hash_nostart:
1617 memset(md5_hash, 0, 16);
1618 return 1;
1619 }
1620
1621 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1622 const struct sock *sk,
1623 const struct sk_buff *skb)
1624 {
1625 const struct tcphdr *th = tcp_hdr(skb);
1626 struct tcp_sigpool hp;
1627 __be32 saddr, daddr;
1628
1629 if (sk) { /* valid for establish/request sockets */
1630 saddr = sk->sk_rcv_saddr;
1631 daddr = sk->sk_daddr;
1632 } else {
1633 const struct iphdr *iph = ip_hdr(skb);
1634 saddr = iph->saddr;
1635 daddr = iph->daddr;
1636 }
1637
1638 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1639 goto clear_hash_nostart;
1640
1641 if (crypto_ahash_init(hp.req))
1642 goto clear_hash;
1643
1644 if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1645 goto clear_hash;
1646 if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1647 goto clear_hash;
1648 if (tcp_md5_hash_key(&hp, key))
1649 goto clear_hash;
1650 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1651 if (crypto_ahash_final(hp.req))
1652 goto clear_hash;
1653
1654 tcp_sigpool_end(&hp);
1655 return 0;
1656
1657 clear_hash:
1658 tcp_sigpool_end(&hp);
1659 clear_hash_nostart:
1660 memset(md5_hash, 0, 16);
1661 return 1;
1662 }
1663 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1664
1665 #endif
1666
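/* Copy the IPv4 addresses (mirrored: our rcv_saddr is the SYN's daddr) and
 * any IP options from the incoming SYN into the request socket.
 */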
1667 static void tcp_v4_init_req(struct request_sock *req,
1668 const struct sock *sk_listener,
1669 struct sk_buff *skb)
1670 {
1671 struct inet_request_sock *ireq = inet_rsk(req);
1672 struct net *net = sock_net(sk_listener);
1673
1674 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1675 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1676 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1677 }
1678
1679 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1680 struct sk_buff *skb,
1681 struct flowi *fl,
1682 struct request_sock *req,
1683 u32 tw_isn)
1684 {
1685 tcp_v4_init_req(req, sk, skb);
1686
1687 if (security_inet_conn_request(sk, skb, req))
1688 return NULL;
1689
1690 return inet_csk_route_req(sk, &fl->u.ip4, req);
1691 }
1692
1693 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1694 .family = PF_INET,
1695 .obj_size = sizeof(struct tcp_request_sock),
1696 .rtx_syn_ack = tcp_rtx_synack,
1697 .send_ack = tcp_v4_reqsk_send_ack,
1698 .destructor = tcp_v4_reqsk_destructor,
1699 .send_reset = tcp_v4_send_reset,
1700 .syn_ack_timeout = tcp_syn_ack_timeout,
1701 };
1702
1703 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1704 .mss_clamp = TCP_MSS_DEFAULT,
1705 #ifdef CONFIG_TCP_MD5SIG
1706 .req_md5_lookup = tcp_v4_md5_lookup,
1707 .calc_md5_hash = tcp_v4_md5_hash_skb,
1708 #endif
1709 #ifdef CONFIG_TCP_AO
1710 .ao_lookup = tcp_v4_ao_lookup_rsk,
1711 .ao_calc_key = tcp_v4_ao_calc_key_rsk,
1712 .ao_synack_hash = tcp_v4_ao_synack_hash,
1713 #endif
1714 #ifdef CONFIG_SYN_COOKIES
1715 .cookie_init_seq = cookie_v4_init_sequence,
1716 #endif
1717 .route_req = tcp_v4_route_req,
1718 .init_seq = tcp_v4_init_seq,
1719 .init_ts_off = tcp_v4_init_ts_off,
1720 .send_synack = tcp_v4_send_synack,
1721 };
1722
1723 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1724 {
1725 /* Never answer SYNs sent to broadcast or multicast */
1726 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1727 goto drop;
1728
1729 return tcp_conn_request(&tcp_request_sock_ops,
1730 &tcp_request_sock_ipv4_ops, sk, skb);
1731
1732 drop:
1733 tcp_listendrop(sk);
1734 return 0;
1735 }
1736 EXPORT_SYMBOL(tcp_v4_conn_request);
1737
1738
1739 /*
1740 * The three way handshake has completed - we got a valid synack -
1741 * now create the new socket.
1742 */
1743 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1744 struct request_sock *req,
1745 struct dst_entry *dst,
1746 struct request_sock *req_unhash,
1747 bool *own_req)
1748 {
1749 struct inet_request_sock *ireq;
1750 bool found_dup_sk = false;
1751 struct inet_sock *newinet;
1752 struct tcp_sock *newtp;
1753 struct sock *newsk;
1754 #ifdef CONFIG_TCP_MD5SIG
1755 const union tcp_md5_addr *addr;
1756 struct tcp_md5sig_key *key;
1757 int l3index;
1758 #endif
1759 struct ip_options_rcu *inet_opt;
1760
1761 if (sk_acceptq_is_full(sk))
1762 goto exit_overflow;
1763
1764 newsk = tcp_create_openreq_child(sk, req, skb);
1765 if (!newsk)
1766 goto exit_nonewsk;
1767
1768 newsk->sk_gso_type = SKB_GSO_TCPV4;
1769 inet_sk_rx_dst_set(newsk, skb);
1770
1771 newtp = tcp_sk(newsk);
1772 newinet = inet_sk(newsk);
1773 ireq = inet_rsk(req);
1774 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1775 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1776 newsk->sk_bound_dev_if = ireq->ir_iif;
1777 newinet->inet_saddr = ireq->ir_loc_addr;
1778 inet_opt = rcu_dereference(ireq->ireq_opt);
1779 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1780 newinet->mc_index = inet_iif(skb);
1781 newinet->mc_ttl = ip_hdr(skb)->ttl;
1782 newinet->rcv_tos = ip_hdr(skb)->tos;
1783 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1784 if (inet_opt)
1785 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1786 atomic_set(&newinet->inet_id, get_random_u16());
1787
1788 /* Set ToS of the new socket based upon the value of incoming SYN.
1789 * ECT bits are set later in tcp_init_transfer().
1790 */
1791 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1792 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1793
1794 if (!dst) {
1795 dst = inet_csk_route_child_sock(sk, newsk, req);
1796 if (!dst)
1797 goto put_and_exit;
1798 } else {
1799 /* syncookie case : see end of cookie_v4_check() */
1800 }
1801 sk_setup_caps(newsk, dst);
1802
1803 tcp_ca_openreq_child(newsk, dst);
1804
1805 tcp_sync_mss(newsk, dst_mtu(dst));
1806 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1807
1808 tcp_initialize_rcv_mss(newsk);
1809
1810 #ifdef CONFIG_TCP_MD5SIG
1811 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1812 /* Copy over the MD5 key from the original socket */
1813 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1814 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1815 if (key && !tcp_rsk_used_ao(req)) {
1816 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1817 goto put_and_exit;
1818 sk_gso_disable(newsk);
1819 }
1820 #endif
1821 #ifdef CONFIG_TCP_AO
1822 if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1823 goto put_and_exit; /* OOM, release back memory */
1824 #endif
1825
1826 if (__inet_inherit_port(sk, newsk) < 0)
1827 goto put_and_exit;
1828 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1829 &found_dup_sk);
1830 if (likely(*own_req)) {
1831 tcp_move_syn(newtp, req);
1832 ireq->ireq_opt = NULL;
1833 } else {
1834 newinet->inet_opt = NULL;
1835
1836 if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case.
			 */
1840 bh_unlock_sock(newsk);
1841 sock_put(newsk);
1842 newsk = NULL;
1843 }
1844 }
1845 return newsk;
1846
1847 exit_overflow:
1848 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1849 exit_nonewsk:
1850 dst_release(dst);
1851 exit:
1852 tcp_listendrop(sk);
1853 return NULL;
1854 put_and_exit:
1855 newinet->inet_opt = NULL;
1856 inet_csk_prepare_forced_close(newsk);
1857 tcp_done(newsk);
1858 goto exit;
1859 }
1860 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1861
1862 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1863 {
1864 #ifdef CONFIG_SYN_COOKIES
1865 const struct tcphdr *th = tcp_hdr(skb);
1866
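	/* Syncookies are only validated on the ACK that completes the
	 * handshake, so a segment with SYN set is never checked here.
	 */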
1867 if (!th->syn)
1868 sk = cookie_v4_check(sk, skb);
1869 #endif
1870 return sk;
1871 }
1872
1873 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1874 struct tcphdr *th, u32 *cookie)
1875 {
1876 u16 mss = 0;
1877 #ifdef CONFIG_SYN_COOKIES
1878 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1879 &tcp_request_sock_ipv4_ops, sk, th);
1880 if (mss) {
1881 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1882 tcp_synq_overflow(sk);
1883 }
1884 #endif
1885 return mss;
1886 }
1887
1888 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1889 u32));
/* The socket must have its spinlock held when we get
1891 * here, unless it is a TCP_LISTEN socket.
1892 *
1893 * We have a potential double-lock case here, so even when
1894 * doing backlog processing we use the BH locking scheme.
1895 * This is because we cannot sleep with the original spinlock
1896 * held.
1897 */
1898 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1899 {
1900 enum skb_drop_reason reason;
1901 struct sock *rsk;
1902
1903 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1904 struct dst_entry *dst;
1905
1906 dst = rcu_dereference_protected(sk->sk_rx_dst,
1907 lockdep_sock_is_held(sk));
1908
1909 sock_rps_save_rxhash(sk, skb);
1910 sk_mark_napi_id(sk, skb);
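		/* Drop the cached rx dst if the packet arrived on a different
		 * interface or the route is no longer valid; a fresh one can
		 * be cached later.
		 */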
1911 if (dst) {
1912 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1913 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1914 dst, 0)) {
1915 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1916 dst_release(dst);
1917 }
1918 }
1919 tcp_rcv_established(sk, skb);
1920 return 0;
1921 }
1922
1923 if (tcp_checksum_complete(skb))
1924 goto csum_err;
1925
1926 if (sk->sk_state == TCP_LISTEN) {
1927 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1928
1929 if (!nsk)
1930 return 0;
1931 if (nsk != sk) {
1932 reason = tcp_child_process(sk, nsk, skb);
1933 if (reason) {
1934 rsk = nsk;
1935 goto reset;
1936 }
1937 return 0;
1938 }
1939 } else
1940 sock_rps_save_rxhash(sk, skb);
1941
1942 reason = tcp_rcv_state_process(sk, skb);
1943 if (reason) {
1944 rsk = sk;
1945 goto reset;
1946 }
1947 return 0;
1948
1949 reset:
1950 tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1951 discard:
1952 sk_skb_reason_drop(sk, skb, reason);
1953 /* Be careful here. If this function gets more complicated and
1954 * gcc suffers from register pressure on the x86, sk (in %ebx)
1955 * might be destroyed here. This current version compiles correctly,
1956 * but you have been warned.
1957 */
1958 return 0;
1959
1960 csum_err:
1961 reason = SKB_DROP_REASON_TCP_CSUM;
1962 trace_tcp_bad_csum(skb);
1963 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1964 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1965 goto discard;
1966 }
1967 EXPORT_SYMBOL(tcp_v4_do_rcv);
1968
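/* Called from the IP input path before routing: look up an established
 * socket by exact 4-tuple and attach it (and its cached rx dst, when still
 * valid) to the skb, so later socket and route lookups can be skipped.
 */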
1969 int tcp_v4_early_demux(struct sk_buff *skb)
1970 {
1971 struct net *net = dev_net(skb->dev);
1972 const struct iphdr *iph;
1973 const struct tcphdr *th;
1974 struct sock *sk;
1975
1976 if (skb->pkt_type != PACKET_HOST)
1977 return 0;
1978
1979 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1980 return 0;
1981
1982 iph = ip_hdr(skb);
1983 th = tcp_hdr(skb);
1984
1985 if (th->doff < sizeof(struct tcphdr) / 4)
1986 return 0;
1987
1988 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1989 iph->saddr, th->source,
1990 iph->daddr, ntohs(th->dest),
1991 skb->skb_iif, inet_sdif(skb));
1992 if (sk) {
1993 skb->sk = sk;
1994 skb->destructor = sock_edemux;
1995 if (sk_fullsock(sk)) {
1996 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1997
1998 if (dst)
1999 dst = dst_check(dst, 0);
2000 if (dst &&
2001 sk->sk_rx_dst_ifindex == skb->skb_iif)
2002 skb_dst_set_noref(skb, dst);
2003 }
2004 }
2005 return 0;
2006 }
2007
2008 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
2009 enum skb_drop_reason *reason)
2010 {
2011 u32 tail_gso_size, tail_gso_segs;
2012 struct skb_shared_info *shinfo;
2013 const struct tcphdr *th;
2014 struct tcphdr *thtail;
2015 struct sk_buff *tail;
2016 unsigned int hdrlen;
2017 bool fragstolen;
2018 u32 gso_segs;
2019 u32 gso_size;
2020 u64 limit;
2021 int delta;
2022
2023 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2024 * we can fix skb->truesize to its real value to avoid future drops.
2025 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed that pure SACK packets were sometimes dropped
	 * (when cooked by drivers without the copybreak feature).
2028 */
2029 skb_condense(skb);
2030
2031 tcp_cleanup_skb(skb);
2032
2033 if (unlikely(tcp_checksum_complete(skb))) {
2034 bh_unlock_sock(sk);
2035 trace_tcp_bad_csum(skb);
2036 *reason = SKB_DROP_REASON_TCP_CSUM;
2037 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2038 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2039 return true;
2040 }
2041
2042 /* Attempt coalescing to last skb in backlog, even if we are
2043 * above the limits.
2044 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2045 */
2046 th = (const struct tcphdr *)skb->data;
2047 hdrlen = th->doff * 4;
2048
2049 tail = sk->sk_backlog.tail;
2050 if (!tail)
2051 goto no_coalesce;
2052 thtail = (struct tcphdr *)tail->data;
2053
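	/* Coalesce only when skb is the exact in-sequence continuation of
	 * tail, carries the same DSCP/ECN field, has no SYN/RST/URG, has ACK
	 * set on both segments, matches on ECE/CWR and has identical TCP
	 * options.
	 */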
2054 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2055 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2056 ((TCP_SKB_CB(tail)->tcp_flags |
2057 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2058 !((TCP_SKB_CB(tail)->tcp_flags &
2059 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2060 ((TCP_SKB_CB(tail)->tcp_flags ^
2061 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
2062 !tcp_skb_can_collapse_rx(tail, skb) ||
2063 thtail->doff != th->doff ||
2064 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2065 goto no_coalesce;
2066
2067 __skb_pull(skb, hdrlen);
2068
2069 shinfo = skb_shinfo(skb);
2070 gso_size = shinfo->gso_size ?: skb->len;
2071 gso_segs = shinfo->gso_segs ?: 1;
2072
2073 shinfo = skb_shinfo(tail);
2074 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2075 tail_gso_segs = shinfo->gso_segs ?: 1;
2076
2077 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2078 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2079
2080 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2081 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2082 thtail->window = th->window;
2083 }
2084
2085 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2086 * thtail->fin, so that the fast path in tcp_rcv_established()
2087 * is not entered if we append a packet with a FIN.
2088 * SYN, RST, URG are not present.
2089 * ACK is set on both packets.
2090 * PSH : we do not really care in TCP stack,
2091 * at least for 'GRO' packets.
2092 */
2093 thtail->fin |= th->fin;
2094 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2095
2096 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2097 TCP_SKB_CB(tail)->has_rxtstamp = true;
2098 tail->tstamp = skb->tstamp;
2099 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2100 }
2101
2102 /* Not as strict as GRO. We only need to carry mss max value */
2103 shinfo->gso_size = max(gso_size, tail_gso_size);
2104 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2105
2106 sk->sk_backlog.len += delta;
2107 __NET_INC_STATS(sock_net(sk),
2108 LINUX_MIB_TCPBACKLOGCOALESCE);
2109 kfree_skb_partial(skb, fragstolen);
2110 return false;
2111 }
2112 __skb_push(skb, hdrlen);
2113
2114 no_coalesce:
2115 /* sk->sk_backlog.len is reset only at the end of __release_sock().
2116 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2117 * sk_rcvbuf in normal conditions.
2118 */
2119 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2120
2121 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2122
	/* Only the socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Only a few socket backlogs are likely to be non-empty at
	 * the same time.
	 */
2127 limit += 64 * 1024;
2128
2129 limit = min_t(u64, limit, UINT_MAX);
2130
2131 if (unlikely(sk_add_backlog(sk, skb, limit))) {
2132 bh_unlock_sock(sk);
2133 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2134 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2135 return true;
2136 }
2137 return false;
2138 }
2139 EXPORT_SYMBOL(tcp_add_backlog);
2140
2141 int tcp_filter(struct sock *sk, struct sk_buff *skb)
2142 {
2143 struct tcphdr *th = (struct tcphdr *)skb->data;
2144
2145 return sk_filter_trim_cap(sk, skb, th->doff * 4);
2146 }
2147 EXPORT_SYMBOL(tcp_filter);
2148
2149 static void tcp_v4_restore_cb(struct sk_buff *skb)
2150 {
2151 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2152 sizeof(struct inet_skb_parm));
2153 }
2154
2155 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2156 const struct tcphdr *th)
2157 {
	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
	 * barrier() makes sure the compiler won't play fool^Waliasing games.
	 */
2161 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2162 sizeof(struct inet_skb_parm));
2163 barrier();
2164
2165 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2166 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2167 skb->len - th->doff * 4);
2168 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2169 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
2170 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2171 TCP_SKB_CB(skb)->sacked = 0;
2172 TCP_SKB_CB(skb)->has_rxtstamp =
2173 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2174 }
2175
2176 /*
2177 * From tcp_input.c
2178 */
2179
2180 int tcp_v4_rcv(struct sk_buff *skb)
2181 {
2182 struct net *net = dev_net(skb->dev);
2183 enum skb_drop_reason drop_reason;
2184 int sdif = inet_sdif(skb);
2185 int dif = inet_iif(skb);
2186 const struct iphdr *iph;
2187 const struct tcphdr *th;
2188 struct sock *sk = NULL;
2189 bool refcounted;
2190 int ret;
2191 u32 isn;
2192
2193 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2194 if (skb->pkt_type != PACKET_HOST)
2195 goto discard_it;
2196
2197 /* Count it even if it's bad */
2198 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
2199
2200 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2201 goto discard_it;
2202
2203 th = (const struct tcphdr *)skb->data;
2204
2205 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2206 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2207 goto bad_packet;
2208 }
2209 if (!pskb_may_pull(skb, th->doff * 4))
2210 goto discard_it;
2211
	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff == 0 is eliminated.
	 * So, we defer the checks.
	 */
2216
2217 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2218 goto csum_error;
2219
2220 th = (const struct tcphdr *)skb->data;
2221 iph = ip_hdr(skb);
2222 lookup:
2223 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2224 skb, __tcp_hdrlen(th), th->source,
2225 th->dest, sdif, &refcounted);
2226 if (!sk)
2227 goto no_tcp_socket;
2228
2229 if (sk->sk_state == TCP_TIME_WAIT)
2230 goto do_time_wait;
2231
2232 if (sk->sk_state == TCP_NEW_SYN_RECV) {
2233 struct request_sock *req = inet_reqsk(sk);
2234 bool req_stolen = false;
2235 struct sock *nsk;
2236
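		/* The lookup matched a request (mini) socket: this is usually
		 * the ACK completing the three-way handshake. Continue in the
		 * context of the listener.
		 */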
2237 sk = req->rsk_listener;
2238 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2239 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2240 else
2241 drop_reason = tcp_inbound_hash(sk, req, skb,
2242 &iph->saddr, &iph->daddr,
2243 AF_INET, dif, sdif);
2244 if (unlikely(drop_reason)) {
2245 sk_drops_add(sk, skb);
2246 reqsk_put(req);
2247 goto discard_it;
2248 }
2249 if (tcp_checksum_complete(skb)) {
2250 reqsk_put(req);
2251 goto csum_error;
2252 }
2253 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2254 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2255 if (!nsk) {
2256 inet_csk_reqsk_queue_drop_and_put(sk, req);
2257 goto lookup;
2258 }
2259 sk = nsk;
			/* reuseport_migrate_sock() already holds one sk_refcnt
			 * on the returned socket.
			 */
2263 } else {
2264 /* We own a reference on the listener, increase it again
2265 * as we might lose it too soon.
2266 */
2267 sock_hold(sk);
2268 }
2269 refcounted = true;
2270 nsk = NULL;
2271 if (!tcp_filter(sk, skb)) {
2272 th = (const struct tcphdr *)skb->data;
2273 iph = ip_hdr(skb);
2274 tcp_v4_fill_cb(skb, iph, th);
2275 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2276 } else {
2277 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2278 }
2279 if (!nsk) {
2280 reqsk_put(req);
2281 if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full-blown socket.
				 * Try to feed this packet to that socket
				 * instead of discarding it.
				 */
2287 tcp_v4_restore_cb(skb);
2288 sock_put(sk);
2289 goto lookup;
2290 }
2291 goto discard_and_relse;
2292 }
2293 nf_reset_ct(skb);
2294 if (nsk == sk) {
2295 reqsk_put(req);
2296 tcp_v4_restore_cb(skb);
2297 } else {
2298 drop_reason = tcp_child_process(sk, nsk, skb);
2299 if (drop_reason) {
2300 enum sk_rst_reason rst_reason;
2301
2302 rst_reason = sk_rst_convert_drop_reason(drop_reason);
2303 tcp_v4_send_reset(nsk, skb, rst_reason);
2304 goto discard_and_relse;
2305 }
2306 sock_put(sk);
2307 return 0;
2308 }
2309 }
2310
2311 process:
2312 if (static_branch_unlikely(&ip4_min_ttl)) {
2313 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2314 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2315 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2316 drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2317 goto discard_and_relse;
2318 }
2319 }
2320
2321 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2322 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2323 goto discard_and_relse;
2324 }
2325
2326 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2327 AF_INET, dif, sdif);
2328 if (drop_reason)
2329 goto discard_and_relse;
2330
2331 nf_reset_ct(skb);
2332
2333 if (tcp_filter(sk, skb)) {
2334 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2335 goto discard_and_relse;
2336 }
2337 th = (const struct tcphdr *)skb->data;
2338 iph = ip_hdr(skb);
2339 tcp_v4_fill_cb(skb, iph, th);
2340
2341 skb->dev = NULL;
2342
2343 if (sk->sk_state == TCP_LISTEN) {
2344 ret = tcp_v4_do_rcv(sk, skb);
2345 goto put_and_return;
2346 }
2347
2348 sk_incoming_cpu_update(sk);
2349
2350 bh_lock_sock_nested(sk);
2351 tcp_segs_in(tcp_sk(sk), skb);
2352 ret = 0;
2353 if (!sock_owned_by_user(sk)) {
2354 ret = tcp_v4_do_rcv(sk, skb);
2355 } else {
2356 if (tcp_add_backlog(sk, skb, &drop_reason))
2357 goto discard_and_relse;
2358 }
2359 bh_unlock_sock(sk);
2360
2361 put_and_return:
2362 if (refcounted)
2363 sock_put(sk);
2364
2365 return ret;
2366
2367 no_tcp_socket:
2368 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2369 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2370 goto discard_it;
2371
2372 tcp_v4_fill_cb(skb, iph, th);
2373
2374 if (tcp_checksum_complete(skb)) {
2375 csum_error:
2376 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2377 trace_tcp_bad_csum(skb);
2378 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2379 bad_packet:
2380 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2381 } else {
2382 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2383 }
2384
2385 discard_it:
2386 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2387 /* Discard frame. */
2388 sk_skb_reason_drop(sk, skb, drop_reason);
2389 return 0;
2390
2391 discard_and_relse:
2392 sk_drops_add(sk, skb);
2393 if (refcounted)
2394 sock_put(sk);
2395 goto discard_it;
2396
2397 do_time_wait:
2398 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2399 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2400 inet_twsk_put(inet_twsk(sk));
2401 goto discard_it;
2402 }
2403
2404 tcp_v4_fill_cb(skb, iph, th);
2405
2406 if (tcp_checksum_complete(skb)) {
2407 inet_twsk_put(inet_twsk(sk));
2408 goto csum_error;
2409 }
2410 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
2411 case TCP_TW_SYN: {
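		/* An acceptable new SYN hit this TIME-WAIT socket: drop the
		 * timewait entry and hand the SYN to a matching listener,
		 * if one exists.
		 */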
2412 struct sock *sk2 = inet_lookup_listener(net,
2413 net->ipv4.tcp_death_row.hashinfo,
2414 skb, __tcp_hdrlen(th),
2415 iph->saddr, th->source,
2416 iph->daddr, th->dest,
2417 inet_iif(skb),
2418 sdif);
2419 if (sk2) {
2420 inet_twsk_deschedule_put(inet_twsk(sk));
2421 sk = sk2;
2422 tcp_v4_restore_cb(skb);
2423 refcounted = false;
2424 __this_cpu_write(tcp_tw_isn, isn);
2425 goto process;
2426 }
2427 }
2428 /* to ACK */
2429 fallthrough;
2430 case TCP_TW_ACK:
2431 tcp_v4_timewait_ack(sk, skb);
2432 break;
2433 case TCP_TW_RST:
2434 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2435 inet_twsk_deschedule_put(inet_twsk(sk));
2436 goto discard_it;
2437 case TCP_TW_SUCCESS:;
2438 }
2439 goto discard_it;
2440 }
2441
2442 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2443 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2444 .twsk_destructor= tcp_twsk_destructor,
2445 };
2446
2447 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2448 {
2449 struct dst_entry *dst = skb_dst(skb);
2450
2451 if (dst && dst_hold_safe(dst)) {
2452 rcu_assign_pointer(sk->sk_rx_dst, dst);
2453 sk->sk_rx_dst_ifindex = skb->skb_iif;
2454 }
2455 }
2456 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2457
2458 const struct inet_connection_sock_af_ops ipv4_specific = {
2459 .queue_xmit = ip_queue_xmit,
2460 .send_check = tcp_v4_send_check,
2461 .rebuild_header = inet_sk_rebuild_header,
2462 .sk_rx_dst_set = inet_sk_rx_dst_set,
2463 .conn_request = tcp_v4_conn_request,
2464 .syn_recv_sock = tcp_v4_syn_recv_sock,
2465 .net_header_len = sizeof(struct iphdr),
2466 .setsockopt = ip_setsockopt,
2467 .getsockopt = ip_getsockopt,
2468 .addr2sockaddr = inet_csk_addr2sockaddr,
2469 .sockaddr_len = sizeof(struct sockaddr_in),
2470 .mtu_reduced = tcp_v4_mtu_reduced,
2471 };
2472 EXPORT_SYMBOL(ipv4_specific);
2473
2474 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2475 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2476 #ifdef CONFIG_TCP_MD5SIG
2477 .md5_lookup = tcp_v4_md5_lookup,
2478 .calc_md5_hash = tcp_v4_md5_hash_skb,
2479 .md5_parse = tcp_v4_parse_md5_keys,
2480 #endif
2481 #ifdef CONFIG_TCP_AO
2482 .ao_lookup = tcp_v4_ao_lookup,
2483 .calc_ao_hash = tcp_v4_ao_hash_skb,
2484 .ao_parse = tcp_v4_parse_ao,
2485 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk,
2486 #endif
2487 };
2488 #endif
2489
/* NOTE: A lot of things are set to zero explicitly by the call to
 * sk_alloc(), so they need not be done here.
 */
2493 static int tcp_v4_init_sock(struct sock *sk)
2494 {
2495 struct inet_connection_sock *icsk = inet_csk(sk);
2496
2497 tcp_init_sock(sk);
2498
2499 icsk->icsk_af_ops = &ipv4_specific;
2500
2501 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2502 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2503 #endif
2504
2505 return 0;
2506 }
2507
2508 #ifdef CONFIG_TCP_MD5SIG
2509 static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2510 {
2511 struct tcp_md5sig_info *md5sig;
2512
2513 md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2514 kfree(md5sig);
2515 static_branch_slow_dec_deferred(&tcp_md5_needed);
2516 tcp_md5_release_sigpool();
2517 }
2518 #endif
2519
2520 static void tcp_release_user_frags(struct sock *sk)
2521 {
2522 #ifdef CONFIG_PAGE_POOL
2523 unsigned long index;
2524 void *netmem;
2525
2526 xa_for_each(&sk->sk_user_frags, index, netmem)
2527 WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
2528 #endif
2529 }
2530
2531 void tcp_v4_destroy_sock(struct sock *sk)
2532 {
2533 struct tcp_sock *tp = tcp_sk(sk);
2534
2535 tcp_release_user_frags(sk);
2536
2537 xa_destroy(&sk->sk_user_frags);
2538
2539 trace_tcp_destroy_sock(sk);
2540
2541 tcp_clear_xmit_timers(sk);
2542
2543 tcp_cleanup_congestion_control(sk);
2544
2545 tcp_cleanup_ulp(sk);
2546
2547 /* Cleanup up the write buffer. */
2548 tcp_write_queue_purge(sk);
2549
2550 /* Check if we want to disable active TFO */
2551 tcp_fastopen_active_disable_ofo_check(sk);
2552
2553 /* Cleans up our, hopefully empty, out_of_order_queue. */
2554 skb_rbtree_purge(&tp->out_of_order_queue);
2555
2556 #ifdef CONFIG_TCP_MD5SIG
2557 /* Clean up the MD5 key list, if any */
2558 if (tp->md5sig_info) {
2559 struct tcp_md5sig_info *md5sig;
2560
2561 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2562 tcp_clear_md5_list(sk);
2563 call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
2564 rcu_assign_pointer(tp->md5sig_info, NULL);
2565 }
2566 #endif
2567 tcp_ao_destroy_sock(sk, false);
2568
2569 /* Clean up a referenced TCP bind bucket. */
2570 if (inet_csk(sk)->icsk_bind_hash)
2571 inet_put_port(sk);
2572
2573 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2574
2575 /* If socket is aborted during connect operation */
2576 tcp_free_fastopen_req(tp);
2577 tcp_fastopen_destroy_cipher(sk);
2578 tcp_saved_syn_free(tp);
2579
2580 sk_sockets_allocated_dec(sk);
2581 }
2582 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2583
2584 #ifdef CONFIG_PROC_FS
2585 /* Proc filesystem TCP sock list dumping. */
2586
2587 static unsigned short seq_file_family(const struct seq_file *seq);
2588
2589 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2590 {
2591 unsigned short family = seq_file_family(seq);
2592
2593 /* AF_UNSPEC is used as a match all */
2594 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2595 net_eq(sock_net(sk), seq_file_net(seq)));
2596 }
2597
/* Find a non-empty bucket (starting from st->bucket)
 * and return the first sk from it.
 */
2601 static void *listening_get_first(struct seq_file *seq)
2602 {
2603 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2604 struct tcp_iter_state *st = seq->private;
2605
2606 st->offset = 0;
2607 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2608 struct inet_listen_hashbucket *ilb2;
2609 struct hlist_nulls_node *node;
2610 struct sock *sk;
2611
2612 ilb2 = &hinfo->lhash2[st->bucket];
2613 if (hlist_nulls_empty(&ilb2->nulls_head))
2614 continue;
2615
2616 spin_lock(&ilb2->lock);
2617 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2618 if (seq_sk_match(seq, sk))
2619 return sk;
2620 }
2621 spin_unlock(&ilb2->lock);
2622 }
2623
2624 return NULL;
2625 }
2626
/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
 * If "cur" is the last one in st->bucket,
 * call listening_get_first() to return the first sk of the next
 * non-empty bucket.
 */
2632 static void *listening_get_next(struct seq_file *seq, void *cur)
2633 {
2634 struct tcp_iter_state *st = seq->private;
2635 struct inet_listen_hashbucket *ilb2;
2636 struct hlist_nulls_node *node;
2637 struct inet_hashinfo *hinfo;
2638 struct sock *sk = cur;
2639
2640 ++st->num;
2641 ++st->offset;
2642
2643 sk = sk_nulls_next(sk);
2644 sk_nulls_for_each_from(sk, node) {
2645 if (seq_sk_match(seq, sk))
2646 return sk;
2647 }
2648
2649 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2650 ilb2 = &hinfo->lhash2[st->bucket];
2651 spin_unlock(&ilb2->lock);
2652 ++st->bucket;
2653 return listening_get_first(seq);
2654 }
2655
2656 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2657 {
2658 struct tcp_iter_state *st = seq->private;
2659 void *rc;
2660
2661 st->bucket = 0;
2662 st->offset = 0;
2663 rc = listening_get_first(seq);
2664
2665 while (rc && *pos) {
2666 rc = listening_get_next(seq, rc);
2667 --*pos;
2668 }
2669 return rc;
2670 }
2671
2672 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2673 const struct tcp_iter_state *st)
2674 {
2675 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2676 }
2677
2678 /*
2679 * Get first established socket starting from bucket given in st->bucket.
2680 * If st->bucket is zero, the very first socket in the hash is returned.
2681 */
2682 static void *established_get_first(struct seq_file *seq)
2683 {
2684 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2685 struct tcp_iter_state *st = seq->private;
2686
2687 st->offset = 0;
2688 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2689 struct sock *sk;
2690 struct hlist_nulls_node *node;
2691 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2692
2693 cond_resched();
2694
2695 /* Lockless fast path for the common case of empty buckets */
2696 if (empty_bucket(hinfo, st))
2697 continue;
2698
2699 spin_lock_bh(lock);
2700 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2701 if (seq_sk_match(seq, sk))
2702 return sk;
2703 }
2704 spin_unlock_bh(lock);
2705 }
2706
2707 return NULL;
2708 }
2709
2710 static void *established_get_next(struct seq_file *seq, void *cur)
2711 {
2712 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2713 struct tcp_iter_state *st = seq->private;
2714 struct hlist_nulls_node *node;
2715 struct sock *sk = cur;
2716
2717 ++st->num;
2718 ++st->offset;
2719
2720 sk = sk_nulls_next(sk);
2721
2722 sk_nulls_for_each_from(sk, node) {
2723 if (seq_sk_match(seq, sk))
2724 return sk;
2725 }
2726
2727 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2728 ++st->bucket;
2729 return established_get_first(seq);
2730 }
2731
2732 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2733 {
2734 struct tcp_iter_state *st = seq->private;
2735 void *rc;
2736
2737 st->bucket = 0;
2738 rc = established_get_first(seq);
2739
2740 while (rc && pos) {
2741 rc = established_get_next(seq, rc);
2742 --pos;
2743 }
2744 return rc;
2745 }
2746
2747 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2748 {
2749 void *rc;
2750 struct tcp_iter_state *st = seq->private;
2751
2752 st->state = TCP_SEQ_STATE_LISTENING;
2753 rc = listening_get_idx(seq, &pos);
2754
2755 if (!rc) {
2756 st->state = TCP_SEQ_STATE_ESTABLISHED;
2757 rc = established_get_idx(seq, pos);
2758 }
2759
2760 return rc;
2761 }
2762
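/* Re-walk the hash tables to the bucket/offset saved in st, so that a
 * restarted read of the seq_file resumes where the previous one stopped.
 */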
2763 static void *tcp_seek_last_pos(struct seq_file *seq)
2764 {
2765 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2766 struct tcp_iter_state *st = seq->private;
2767 int bucket = st->bucket;
2768 int offset = st->offset;
2769 int orig_num = st->num;
2770 void *rc = NULL;
2771
2772 switch (st->state) {
2773 case TCP_SEQ_STATE_LISTENING:
2774 if (st->bucket > hinfo->lhash2_mask)
2775 break;
2776 rc = listening_get_first(seq);
2777 while (offset-- && rc && bucket == st->bucket)
2778 rc = listening_get_next(seq, rc);
2779 if (rc)
2780 break;
2781 st->bucket = 0;
2782 st->state = TCP_SEQ_STATE_ESTABLISHED;
2783 fallthrough;
2784 case TCP_SEQ_STATE_ESTABLISHED:
2785 if (st->bucket > hinfo->ehash_mask)
2786 break;
2787 rc = established_get_first(seq);
2788 while (offset-- && rc && bucket == st->bucket)
2789 rc = established_get_next(seq, rc);
2790 }
2791
2792 st->num = orig_num;
2793
2794 return rc;
2795 }
2796
2797 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2798 {
2799 struct tcp_iter_state *st = seq->private;
2800 void *rc;
2801
2802 if (*pos && *pos == st->last_pos) {
2803 rc = tcp_seek_last_pos(seq);
2804 if (rc)
2805 goto out;
2806 }
2807
2808 st->state = TCP_SEQ_STATE_LISTENING;
2809 st->num = 0;
2810 st->bucket = 0;
2811 st->offset = 0;
2812 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2813
2814 out:
2815 st->last_pos = *pos;
2816 return rc;
2817 }
2818 EXPORT_SYMBOL(tcp_seq_start);
2819
2820 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2821 {
2822 struct tcp_iter_state *st = seq->private;
2823 void *rc = NULL;
2824
2825 if (v == SEQ_START_TOKEN) {
2826 rc = tcp_get_idx(seq, 0);
2827 goto out;
2828 }
2829
2830 switch (st->state) {
2831 case TCP_SEQ_STATE_LISTENING:
2832 rc = listening_get_next(seq, v);
2833 if (!rc) {
2834 st->state = TCP_SEQ_STATE_ESTABLISHED;
2835 st->bucket = 0;
2836 st->offset = 0;
2837 rc = established_get_first(seq);
2838 }
2839 break;
2840 case TCP_SEQ_STATE_ESTABLISHED:
2841 rc = established_get_next(seq, v);
2842 break;
2843 }
2844 out:
2845 ++*pos;
2846 st->last_pos = *pos;
2847 return rc;
2848 }
2849 EXPORT_SYMBOL(tcp_seq_next);
2850
2851 void tcp_seq_stop(struct seq_file *seq, void *v)
2852 {
2853 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2854 struct tcp_iter_state *st = seq->private;
2855
2856 switch (st->state) {
2857 case TCP_SEQ_STATE_LISTENING:
2858 if (v != SEQ_START_TOKEN)
2859 spin_unlock(&hinfo->lhash2[st->bucket].lock);
2860 break;
2861 case TCP_SEQ_STATE_ESTABLISHED:
2862 if (v)
2863 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2864 break;
2865 }
2866 }
2867 EXPORT_SYMBOL(tcp_seq_stop);
2868
2869 static void get_openreq4(const struct request_sock *req,
2870 struct seq_file *f, int i)
2871 {
2872 const struct inet_request_sock *ireq = inet_rsk(req);
2873 long delta = req->rsk_timer.expires - jiffies;
2874
2875 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2876 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2877 i,
2878 ireq->ir_loc_addr,
2879 ireq->ir_num,
2880 ireq->ir_rmt_addr,
2881 ntohs(ireq->ir_rmt_port),
2882 TCP_SYN_RECV,
2883 0, 0, /* could print option size, but that is af dependent. */
2884 1, /* timers active (only the expire timer) */
2885 jiffies_delta_to_clock_t(delta),
2886 req->num_timeout,
2887 from_kuid_munged(seq_user_ns(f),
2888 sock_i_uid(req->rsk_listener)),
2889 0, /* non standard timer */
2890 0, /* open_requests have no inode */
2891 0,
2892 req);
2893 }
2894
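/* Timer codes reported in the "tr" column of /proc/net/tcp: 1 retransmit or
 * loss-probe timer, 2 keepalive, 3 TIME-WAIT (see get_timewait4_sock()),
 * 4 zero-window probe, 0 no timer pending.
 */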
2895 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2896 {
2897 int timer_active;
2898 unsigned long timer_expires;
2899 const struct tcp_sock *tp = tcp_sk(sk);
2900 const struct inet_connection_sock *icsk = inet_csk(sk);
2901 const struct inet_sock *inet = inet_sk(sk);
2902 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2903 __be32 dest = inet->inet_daddr;
2904 __be32 src = inet->inet_rcv_saddr;
2905 __u16 destp = ntohs(inet->inet_dport);
2906 __u16 srcp = ntohs(inet->inet_sport);
2907 int rx_queue;
2908 int state;
2909
2910 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2911 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2912 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2913 timer_active = 1;
2914 timer_expires = icsk->icsk_timeout;
2915 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2916 timer_active = 4;
2917 timer_expires = icsk->icsk_timeout;
2918 } else if (timer_pending(&sk->sk_timer)) {
2919 timer_active = 2;
2920 timer_expires = sk->sk_timer.expires;
2921 } else {
2922 timer_active = 0;
2923 timer_expires = jiffies;
2924 }
2925
2926 state = inet_sk_state_load(sk);
2927 if (state == TCP_LISTEN)
2928 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2929 else
2930 /* Because we don't lock the socket,
2931 * we might find a transient negative value.
2932 */
2933 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2934 READ_ONCE(tp->copied_seq), 0);
2935
2936 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2937 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2938 i, src, srcp, dest, destp, state,
2939 READ_ONCE(tp->write_seq) - tp->snd_una,
2940 rx_queue,
2941 timer_active,
2942 jiffies_delta_to_clock_t(timer_expires - jiffies),
2943 icsk->icsk_retransmits,
2944 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2945 icsk->icsk_probes_out,
2946 sock_i_ino(sk),
2947 refcount_read(&sk->sk_refcnt), sk,
2948 jiffies_to_clock_t(icsk->icsk_rto),
2949 jiffies_to_clock_t(icsk->icsk_ack.ato),
2950 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2951 tcp_snd_cwnd(tp),
2952 state == TCP_LISTEN ?
2953 fastopenq->max_qlen :
2954 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2955 }
2956
2957 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2958 struct seq_file *f, int i)
2959 {
2960 long delta = tw->tw_timer.expires - jiffies;
2961 __be32 dest, src;
2962 __u16 destp, srcp;
2963
2964 dest = tw->tw_daddr;
2965 src = tw->tw_rcv_saddr;
2966 destp = ntohs(tw->tw_dport);
2967 srcp = ntohs(tw->tw_sport);
2968
2969 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2970 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2971 i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2972 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2973 refcount_read(&tw->tw_refcnt), tw);
2974 }
2975
2976 #define TMPSZ 150
2977
2978 static int tcp4_seq_show(struct seq_file *seq, void *v)
2979 {
2980 struct tcp_iter_state *st;
2981 struct sock *sk = v;
2982
2983 seq_setwidth(seq, TMPSZ - 1);
2984 if (v == SEQ_START_TOKEN) {
2985 seq_puts(seq, " sl local_address rem_address st tx_queue "
2986 "rx_queue tr tm->when retrnsmt uid timeout "
2987 "inode");
2988 goto out;
2989 }
2990 st = seq->private;
2991
2992 if (sk->sk_state == TCP_TIME_WAIT)
2993 get_timewait4_sock(v, seq, st->num);
2994 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2995 get_openreq4(v, seq, st->num);
2996 else
2997 get_tcp4_sock(v, seq, st->num);
2998 out:
2999 seq_pad(seq, '\n');
3000 return 0;
3001 }
3002
3003 #ifdef CONFIG_BPF_SYSCALL
3004 struct bpf_tcp_iter_state {
3005 struct tcp_iter_state state;
3006 unsigned int cur_sk;
3007 unsigned int end_sk;
3008 unsigned int max_sk;
3009 struct sock **batch;
3010 bool st_bucket_done;
3011 };
3012
3013 struct bpf_iter__tcp {
3014 __bpf_md_ptr(struct bpf_iter_meta *, meta);
3015 __bpf_md_ptr(struct sock_common *, sk_common);
3016 uid_t uid __aligned(8);
3017 };
3018
3019 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3020 struct sock_common *sk_common, uid_t uid)
3021 {
3022 struct bpf_iter__tcp ctx;
3023
3024 meta->seq_num--; /* skip SEQ_START_TOKEN */
3025 ctx.meta = meta;
3026 ctx.sk_common = sk_common;
3027 ctx.uid = uid;
3028 return bpf_iter_run_prog(prog, &ctx);
3029 }
3030
3031 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3032 {
3033 while (iter->cur_sk < iter->end_sk)
3034 sock_gen_put(iter->batch[iter->cur_sk++]);
3035 }
3036
3037 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3038 unsigned int new_batch_sz)
3039 {
3040 struct sock **new_batch;
3041
3042 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3043 GFP_USER | __GFP_NOWARN);
3044 if (!new_batch)
3045 return -ENOMEM;
3046
3047 bpf_iter_tcp_put_batch(iter);
3048 kvfree(iter->batch);
3049 iter->batch = new_batch;
3050 iter->max_sk = new_batch_sz;
3051
3052 return 0;
3053 }
3054
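/* Take a reference on every matching socket left in the current listening
 * bucket, filling iter->batch up to iter->max_sk, and return how many
 * sockets matched in total so the caller can grow the batch and retry.
 */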
3055 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3056 struct sock *start_sk)
3057 {
3058 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3059 struct bpf_tcp_iter_state *iter = seq->private;
3060 struct tcp_iter_state *st = &iter->state;
3061 struct hlist_nulls_node *node;
3062 unsigned int expected = 1;
3063 struct sock *sk;
3064
3065 sock_hold(start_sk);
3066 iter->batch[iter->end_sk++] = start_sk;
3067
3068 sk = sk_nulls_next(start_sk);
3069 sk_nulls_for_each_from(sk, node) {
3070 if (seq_sk_match(seq, sk)) {
3071 if (iter->end_sk < iter->max_sk) {
3072 sock_hold(sk);
3073 iter->batch[iter->end_sk++] = sk;
3074 }
3075 expected++;
3076 }
3077 }
3078 spin_unlock(&hinfo->lhash2[st->bucket].lock);
3079
3080 return expected;
3081 }
3082
3083 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3084 struct sock *start_sk)
3085 {
3086 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3087 struct bpf_tcp_iter_state *iter = seq->private;
3088 struct tcp_iter_state *st = &iter->state;
3089 struct hlist_nulls_node *node;
3090 unsigned int expected = 1;
3091 struct sock *sk;
3092
3093 sock_hold(start_sk);
3094 iter->batch[iter->end_sk++] = start_sk;
3095
3096 sk = sk_nulls_next(start_sk);
3097 sk_nulls_for_each_from(sk, node) {
3098 if (seq_sk_match(seq, sk)) {
3099 if (iter->end_sk < iter->max_sk) {
3100 sock_hold(sk);
3101 iter->batch[iter->end_sk++] = sk;
3102 }
3103 expected++;
3104 }
3105 }
3106 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3107
3108 return expected;
3109 }
3110
3111 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3112 {
3113 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3114 struct bpf_tcp_iter_state *iter = seq->private;
3115 struct tcp_iter_state *st = &iter->state;
3116 unsigned int expected;
3117 bool resized = false;
3118 struct sock *sk;
3119
	/* The st->bucket is done. Directly advance to the next
	 * bucket instead of having tcp_seek_last_pos() skip sockets
	 * one by one in the current bucket only to find out it has
	 * to advance to the next bucket.
	 */
3125 if (iter->st_bucket_done) {
3126 st->offset = 0;
3127 st->bucket++;
3128 if (st->state == TCP_SEQ_STATE_LISTENING &&
3129 st->bucket > hinfo->lhash2_mask) {
3130 st->state = TCP_SEQ_STATE_ESTABLISHED;
3131 st->bucket = 0;
3132 }
3133 }
3134
3135 again:
3136 /* Get a new batch */
3137 iter->cur_sk = 0;
3138 iter->end_sk = 0;
3139 iter->st_bucket_done = false;
3140
3141 sk = tcp_seek_last_pos(seq);
3142 if (!sk)
3143 return NULL; /* Done */
3144
3145 if (st->state == TCP_SEQ_STATE_LISTENING)
3146 expected = bpf_iter_tcp_listening_batch(seq, sk);
3147 else
3148 expected = bpf_iter_tcp_established_batch(seq, sk);
3149
3150 if (iter->end_sk == expected) {
3151 iter->st_bucket_done = true;
3152 return sk;
3153 }
3154
3155 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
3156 resized = true;
3157 goto again;
3158 }
3159
3160 return sk;
3161 }
3162
3163 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3164 {
	/* bpf iter does not support lseek, so it always
	 * continues from where it was stop()-ped.
	 */
3168 if (*pos)
3169 return bpf_iter_tcp_batch(seq);
3170
3171 return SEQ_START_TOKEN;
3172 }
3173
3174 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3175 {
3176 struct bpf_tcp_iter_state *iter = seq->private;
3177 struct tcp_iter_state *st = &iter->state;
3178 struct sock *sk;
3179
	/* Whenever seq_next() is called, iter->cur_sk has already been
	 * through seq_show(), so advance to the next sk in the batch.
	 */
3184 if (iter->cur_sk < iter->end_sk) {
3185 /* Keeping st->num consistent in tcp_iter_state.
3186 * bpf_iter_tcp does not use st->num.
3187 * meta.seq_num is used instead.
3188 */
3189 st->num++;
3190 /* Move st->offset to the next sk in the bucket such that
3191 * the future start() will resume at st->offset in
3192 * st->bucket. See tcp_seek_last_pos().
3193 */
3194 st->offset++;
3195 sock_gen_put(iter->batch[iter->cur_sk++]);
3196 }
3197
3198 if (iter->cur_sk < iter->end_sk)
3199 sk = iter->batch[iter->cur_sk];
3200 else
3201 sk = bpf_iter_tcp_batch(seq);
3202
3203 ++*pos;
	/* Keeping st->last_pos consistent in tcp_iter_state.
	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
	 */
3207 st->last_pos = *pos;
3208 return sk;
3209 }
3210
3211 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3212 {
3213 struct bpf_iter_meta meta;
3214 struct bpf_prog *prog;
3215 struct sock *sk = v;
3216 uid_t uid;
3217 int ret;
3218
3219 if (v == SEQ_START_TOKEN)
3220 return 0;
3221
3222 if (sk_fullsock(sk))
3223 lock_sock(sk);
3224
3225 if (unlikely(sk_unhashed(sk))) {
3226 ret = SEQ_SKIP;
3227 goto unlock;
3228 }
3229
3230 if (sk->sk_state == TCP_TIME_WAIT) {
3231 uid = 0;
3232 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3233 const struct request_sock *req = v;
3234
3235 uid = from_kuid_munged(seq_user_ns(seq),
3236 sock_i_uid(req->rsk_listener));
3237 } else {
3238 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3239 }
3240
3241 meta.seq = seq;
3242 prog = bpf_iter_get_info(&meta, false);
3243 ret = tcp_prog_seq_show(prog, &meta, v, uid);
3244
3245 unlock:
3246 if (sk_fullsock(sk))
3247 release_sock(sk);
	return ret;
}
3251
3252 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3253 {
3254 struct bpf_tcp_iter_state *iter = seq->private;
3255 struct bpf_iter_meta meta;
3256 struct bpf_prog *prog;
3257
3258 if (!v) {
3259 meta.seq = seq;
3260 prog = bpf_iter_get_info(&meta, true);
3261 if (prog)
3262 (void)tcp_prog_seq_show(prog, &meta, v, 0);
3263 }
3264
3265 if (iter->cur_sk < iter->end_sk) {
3266 bpf_iter_tcp_put_batch(iter);
3267 iter->st_bucket_done = false;
3268 }
3269 }
3270
3271 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3272 .show = bpf_iter_tcp_seq_show,
3273 .start = bpf_iter_tcp_seq_start,
3274 .next = bpf_iter_tcp_seq_next,
3275 .stop = bpf_iter_tcp_seq_stop,
3276 };
3277 #endif
3278 static unsigned short seq_file_family(const struct seq_file *seq)
3279 {
3280 const struct tcp_seq_afinfo *afinfo;
3281
3282 #ifdef CONFIG_BPF_SYSCALL
	/* Iterated from bpf_iter. Let the bpf prog filter instead. */
3284 if (seq->op == &bpf_iter_tcp_seq_ops)
3285 return AF_UNSPEC;
3286 #endif
3287
3288 /* Iterated from proc fs */
3289 afinfo = pde_data(file_inode(seq->file));
3290 return afinfo->family;
3291 }
3292
3293 static const struct seq_operations tcp4_seq_ops = {
3294 .show = tcp4_seq_show,
3295 .start = tcp_seq_start,
3296 .next = tcp_seq_next,
3297 .stop = tcp_seq_stop,
3298 };
3299
3300 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3301 .family = AF_INET,
3302 };
3303
3304 static int __net_init tcp4_proc_init_net(struct net *net)
3305 {
3306 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3307 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3308 return -ENOMEM;
3309 return 0;
3310 }
3311
3312 static void __net_exit tcp4_proc_exit_net(struct net *net)
3313 {
3314 remove_proc_entry("tcp", net->proc_net);
3315 }
3316
3317 static struct pernet_operations tcp4_net_ops = {
3318 .init = tcp4_proc_init_net,
3319 .exit = tcp4_proc_exit_net,
3320 };
3321
3322 int __init tcp4_proc_init(void)
3323 {
3324 return register_pernet_subsys(&tcp4_net_ops);
3325 }
3326
3327 void tcp4_proc_exit(void)
3328 {
3329 unregister_pernet_subsys(&tcp4_net_ops);
3330 }
3331 #endif /* CONFIG_PROC_FS */
3332
/* @wake is one when sk_stream_write_space() calls us.
 * This sends EPOLLOUT only if notsent_bytes is below half the limit.
 * This mimics the strategy used in sock_def_write_space().
 */
3337 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3338 {
3339 const struct tcp_sock *tp = tcp_sk(sk);
3340 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3341 READ_ONCE(tp->snd_nxt);
3342
3343 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3344 }
3345 EXPORT_SYMBOL(tcp_stream_memory_free);
3346
3347 struct proto tcp_prot = {
3348 .name = "TCP",
3349 .owner = THIS_MODULE,
3350 .close = tcp_close,
3351 .pre_connect = tcp_v4_pre_connect,
3352 .connect = tcp_v4_connect,
3353 .disconnect = tcp_disconnect,
3354 .accept = inet_csk_accept,
3355 .ioctl = tcp_ioctl,
3356 .init = tcp_v4_init_sock,
3357 .destroy = tcp_v4_destroy_sock,
3358 .shutdown = tcp_shutdown,
3359 .setsockopt = tcp_setsockopt,
3360 .getsockopt = tcp_getsockopt,
3361 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3362 .keepalive = tcp_set_keepalive,
3363 .recvmsg = tcp_recvmsg,
3364 .sendmsg = tcp_sendmsg,
3365 .splice_eof = tcp_splice_eof,
3366 .backlog_rcv = tcp_v4_do_rcv,
3367 .release_cb = tcp_release_cb,
3368 .hash = inet_hash,
3369 .unhash = inet_unhash,
3370 .get_port = inet_csk_get_port,
3371 .put_port = inet_put_port,
3372 #ifdef CONFIG_BPF_SYSCALL
3373 .psock_update_sk_prot = tcp_bpf_update_proto,
3374 #endif
3375 .enter_memory_pressure = tcp_enter_memory_pressure,
3376 .leave_memory_pressure = tcp_leave_memory_pressure,
3377 .stream_memory_free = tcp_stream_memory_free,
3378 .sockets_allocated = &tcp_sockets_allocated,
3379 .orphan_count = &tcp_orphan_count,
3380
3381 .memory_allocated = &tcp_memory_allocated,
3382 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
3383
3384 .memory_pressure = &tcp_memory_pressure,
3385 .sysctl_mem = sysctl_tcp_mem,
3386 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3387 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3388 .max_header = MAX_TCP_HEADER,
3389 .obj_size = sizeof(struct tcp_sock),
3390 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3391 .twsk_prot = &tcp_timewait_sock_ops,
3392 .rsk_prot = &tcp_request_sock_ops,
3393 .h.hashinfo = NULL,
3394 .no_autobind = true,
3395 .diag_destroy = tcp_abort,
3396 };
3397 EXPORT_SYMBOL(tcp_prot);
3398
3399 static void __net_exit tcp_sk_exit(struct net *net)
3400 {
3401 if (net->ipv4.tcp_congestion_control)
3402 bpf_module_put(net->ipv4.tcp_congestion_control,
3403 net->ipv4.tcp_congestion_control->owner);
3404 }
3405
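/* Give a child netns its own ehash if the creating netns configured
 * sysctl_tcp_child_ehash_entries; otherwise share the global tcp_hashinfo.
 */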
3406 static void __net_init tcp_set_hashinfo(struct net *net)
3407 {
3408 struct inet_hashinfo *hinfo;
3409 unsigned int ehash_entries;
3410 struct net *old_net;
3411
3412 if (net_eq(net, &init_net))
3413 goto fallback;
3414
3415 old_net = current->nsproxy->net_ns;
3416 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3417 if (!ehash_entries)
3418 goto fallback;
3419
3420 ehash_entries = roundup_pow_of_two(ehash_entries);
3421 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3422 if (!hinfo) {
3423 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3424 "for a netns, fallback to the global one\n",
3425 ehash_entries);
3426 fallback:
3427 hinfo = &tcp_hashinfo;
3428 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3429 }
3430
3431 net->ipv4.tcp_death_row.hashinfo = hinfo;
3432 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3433 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3434 }
3435
3436 static int __net_init tcp_sk_init(struct net *net)
3437 {
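	/* tcp_ecn == 2: use ECN when requested by incoming connections,
	 * but do not request ECN on outgoing connections.
	 */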
3438 net->ipv4.sysctl_tcp_ecn = 2;
3439 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3440
3441 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3442 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3443 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3444 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3445 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3446
3447 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3448 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3449 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3450
3451 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3452 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3453 net->ipv4.sysctl_tcp_syncookies = 1;
3454 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3455 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3456 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3457 net->ipv4.sysctl_tcp_orphan_retries = 0;
3458 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3459 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
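	/* tcp_tw_reuse == 2: allow TIME-WAIT reuse for loopback traffic only. */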
3460 net->ipv4.sysctl_tcp_tw_reuse = 2;
3461 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3462
3463 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3464 tcp_set_hashinfo(net);
3465
3466 net->ipv4.sysctl_tcp_sack = 1;
3467 net->ipv4.sysctl_tcp_window_scaling = 1;
3468 net->ipv4.sysctl_tcp_timestamps = 1;
3469 net->ipv4.sysctl_tcp_early_retrans = 3;
3470 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3471 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3472 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3473 net->ipv4.sysctl_tcp_max_reordering = 300;
3474 net->ipv4.sysctl_tcp_dsack = 1;
3475 net->ipv4.sysctl_tcp_app_win = 31;
3476 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3477 net->ipv4.sysctl_tcp_frto = 2;
3478 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3479 /* This limits the percentage of the congestion window which we
3480 * will allow a single TSO frame to consume. Building TSO frames
3481 * which are too large can cause TCP streams to be bursty.
3482 */
3483 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3484 /* Default TSQ limit of 16 TSO segments */
3485 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3486
3487 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3488 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3489
3490 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3491 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
3492 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3493 net->ipv4.sysctl_tcp_autocorking = 1;
3494 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3495 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3496 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3497 if (net != &init_net) {
3498 memcpy(net->ipv4.sysctl_tcp_rmem,
3499 init_net.ipv4.sysctl_tcp_rmem,
3500 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3501 memcpy(net->ipv4.sysctl_tcp_wmem,
3502 init_net.ipv4.sysctl_tcp_wmem,
3503 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3504 }
3505 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3506 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3507 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3508 net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3509 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3510 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3511 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3512
3513 /* Set default values for PLB */
3514 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3515 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3516 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3517 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3518 /* Default congestion threshold for PLB to mark a round is 50% */
3519 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3520
3521 /* Reno is always built in */
3522 if (!net_eq(net, &init_net) &&
3523 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3524 init_net.ipv4.tcp_congestion_control->owner))
3525 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3526 else
3527 net->ipv4.tcp_congestion_control = &tcp_reno;
3528
3529 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3530 net->ipv4.sysctl_tcp_shrink_window = 0;
3531
3532 net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3533 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3534
3535 return 0;
3536 }
3537
3538 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3539 {
3540 struct net *net;
3541
	/* Make sure concurrent calls to tcp_sk_exit_batch() from net_cleanup_work
	 * and from the failed setup_net() error-unwinding path are serialized.
	 *
	 * tcp_twsk_purge() handles twsk in any dead netns, not just those in
	 * net_exit_list, so the thread that dismantles a particular twsk must
	 * do so without another thread progressing to the refcount_dec_and_test()
	 * of tcp_death_row.tw_refcount.
	 */
3550 mutex_lock(&tcp_exit_batch_mutex);
3551
3552 tcp_twsk_purge(net_exit_list);
3553
3554 list_for_each_entry(net, net_exit_list, exit_list) {
3555 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3556 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3557 tcp_fastopen_ctx_destroy(net);
3558 }
3559
3560 mutex_unlock(&tcp_exit_batch_mutex);
3561 }
3562
3563 static struct pernet_operations __net_initdata tcp_sk_ops = {
3564 .init = tcp_sk_init,
3565 .exit = tcp_sk_exit,
3566 .exit_batch = tcp_sk_exit_batch,
3567 };
3568
3569 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3570 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3571 struct sock_common *sk_common, uid_t uid)
3572
3573 #define INIT_BATCH_SZ 16
3574
3575 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3576 {
3577 struct bpf_tcp_iter_state *iter = priv_data;
3578 int err;
3579
3580 err = bpf_iter_init_seq_net(priv_data, aux);
3581 if (err)
3582 return err;
3583
3584 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3585 if (err) {
3586 bpf_iter_fini_seq_net(priv_data);
3587 return err;
3588 }
3589
3590 return 0;
3591 }
3592
3593 static void bpf_iter_fini_tcp(void *priv_data)
3594 {
3595 struct bpf_tcp_iter_state *iter = priv_data;
3596
3597 bpf_iter_fini_seq_net(priv_data);
3598 kvfree(iter->batch);
3599 }
3600
3601 static const struct bpf_iter_seq_info tcp_seq_info = {
3602 .seq_ops = &bpf_iter_tcp_seq_ops,
3603 .init_seq_private = bpf_iter_init_tcp,
3604 .fini_seq_private = bpf_iter_fini_tcp,
3605 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3606 };
3607
3608 static const struct bpf_func_proto *
3609 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3610 const struct bpf_prog *prog)
3611 {
3612 switch (func_id) {
3613 case BPF_FUNC_setsockopt:
3614 return &bpf_sk_setsockopt_proto;
3615 case BPF_FUNC_getsockopt:
3616 return &bpf_sk_getsockopt_proto;
3617 default:
3618 return NULL;
3619 }
3620 }
3621
3622 static struct bpf_iter_reg tcp_reg_info = {
3623 .target = "tcp",
3624 .ctx_arg_info_size = 1,
3625 .ctx_arg_info = {
3626 { offsetof(struct bpf_iter__tcp, sk_common),
3627 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3628 },
3629 .get_func_proto = bpf_iter_tcp_get_func_proto,
3630 .seq_info = &tcp_seq_info,
3631 };
3632
3633 static void __init bpf_iter_register(void)
3634 {
3635 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3636 if (bpf_iter_reg_target(&tcp_reg_info))
3637 pr_warn("Warning: could not register bpf iterator tcp\n");
3638 }
3639
3640 #endif
3641
3642 void __init tcp_v4_init(void)
3643 {
3644 int cpu, res;
3645
3646 for_each_possible_cpu(cpu) {
3647 struct sock *sk;
3648
3649 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3650 IPPROTO_TCP, &init_net);
3651 if (res)
3652 panic("Failed to create the TCP control socket.\n");
3653 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3654
3655 /* Please enforce IP_DF and IPID==0 for RST and
3656 * ACK sent in SYN-RECV and TIME-WAIT state.
3657 */
3658 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3659
3660 sk->sk_clockid = CLOCK_MONOTONIC;
3661
3662 per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3663 }
3664 if (register_pernet_subsys(&tcp_sk_ops))
3665 panic("Failed to create the TCP control socket.\n");
3666
3667 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3668 bpf_iter_register();
3669 #endif
3670 }
3671