1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19 /*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
/* Forward declaration of the header-only MD5 signer; it is called by
 * tcp_v4_send_reset() and tcp_v4_send_ack() below before its definition
 * elsewhere in this file.
 */
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
/* Hash tables consulted by the socket lookups in this file; exported. */
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
/* Per-CPU socket that tcp_v4_send_reset() and tcp_v4_send_ack() hand to
 * ip_send_unicast_reply() to transmit their replies.
 */
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95
tcp_v4_init_seq(const struct sk_buff * skb)96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98 return secure_tcp_seq(ip_hdr(skb)->daddr,
99 ip_hdr(skb)->saddr,
100 tcp_hdr(skb)->dest,
101 tcp_hdr(skb)->source);
102 }
103
tcp_v4_init_ts_off(const struct net * net,const struct sk_buff * skb)104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108
tcp_twsk_unique(struct sock * sk,struct sock * sktw,void * twp)109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112 const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 struct tcp_sock *tp = tcp_sk(sk);
115
116 if (reuse == 2) {
117 /* Still does not detect *everything* that goes through
118 * lo, since we require a loopback src or dst address
119 * or direct binding to 'lo' interface.
120 */
121 bool loopback = false;
122 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123 loopback = true;
124 #if IS_ENABLED(CONFIG_IPV6)
125 if (tw->tw_family == AF_INET6) {
126 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130 loopback = true;
131 } else
132 #endif
133 {
134 if (ipv4_is_loopback(tw->tw_daddr) ||
135 ipv4_is_loopback(tw->tw_rcv_saddr))
136 loopback = true;
137 }
138 if (!loopback)
139 reuse = 0;
140 }
141
142 /* With PAWS, it is safe from the viewpoint
143 of data integrity. Even without PAWS it is safe provided sequence
144 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
145
146 Actually, the idea is close to VJ's one, only timestamp cache is
147 held not per host, but per port pair and TW bucket is used as state
148 holder.
149
150 If TW bucket has been already destroyed we fall back to VJ's scheme
151 and use initial timestamp retrieved from peer table.
152 */
153 if (tcptw->tw_ts_recent_stamp &&
154 (!twp || (reuse && time_after32(ktime_get_seconds(),
155 tcptw->tw_ts_recent_stamp)))) {
156 /* In case of repair and re-using TIME-WAIT sockets we still
157 * want to be sure that it is safe as above but honor the
158 * sequence numbers and time stamps set as part of the repair
159 * process.
160 *
161 * Without this check re-using a TIME-WAIT socket with TCP
162 * repair would accumulate a -1 on the repair assigned
163 * sequence number. The first time it is reused the sequence
164 * is -1, the second time -2, etc. This fixes that issue
165 * without appearing to create any others.
166 */
167 if (likely(!tp->repair)) {
168 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169
170 if (!seq)
171 seq = 1;
172 WRITE_ONCE(tp->write_seq, seq);
173 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
174 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175 }
176 sock_hold(sktw);
177 return 1;
178 }
179
180 return 0;
181 }
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
183
tcp_v4_pre_connect(struct sock * sk,struct sockaddr * uaddr,int addr_len)184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185 int addr_len)
186 {
187 /* This check is replicated from tcp_v4_connect() and intended to
188 * prevent BPF program called below from accessing bytes that are out
189 * of the bound specified by user in addr_len.
190 */
191 if (addr_len < sizeof(struct sockaddr_in))
192 return -EINVAL;
193
194 sock_owned_by_me(sk);
195
196 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 }
198
199 /* This will initiate an outgoing connection. */
/*
 * tcp_v4_connect - start an active open towards the address in @uaddr.
 * @sk: socket being connected. The inet_opt dereference below is annotated
 * with lockdep_sock_is_held(), i.e. the socket lock is expected to be held.
 * @uaddr: destination, interpreted as a struct sockaddr_in.
 * @addr_len: length of @uaddr as supplied by the caller.
 *
 * Returns 0 on success, or: -EINVAL for a short address or a zero
 * destination combined with a source-route option, -EAFNOSUPPORT for a
 * family other than AF_INET, -ENETUNREACH for a multicast/broadcast route,
 * or the error reported by routing, inet_hash_connect(),
 * tcp_fastopen_defer_connect() or tcp_connect().
 */
tcp_v4_connect(struct sock * sk,struct sockaddr * uaddr,int addr_len)200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201 {
202 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203 struct inet_sock *inet = inet_sk(sk);
204 struct tcp_sock *tp = tcp_sk(sk);
205 __be16 orig_sport, orig_dport;
206 __be32 daddr, nexthop;
207 struct flowi4 *fl4;
208 struct rtable *rt;
209 int err;
210 struct ip_options_rcu *inet_opt;
211 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
212
213 if (addr_len < sizeof(struct sockaddr_in))
214 return -EINVAL;
215
216 if (usin->sin_family != AF_INET)
217 return -EAFNOSUPPORT;
218
/* Route towards the destination itself unless a source-route option is
 * set, in which case the option's first hop (faddr) is routed to instead.
 */
219 nexthop = daddr = usin->sin_addr.s_addr;
220 inet_opt = rcu_dereference_protected(inet->inet_opt,
221 lockdep_sock_is_held(sk));
222 if (inet_opt && inet_opt->opt.srr) {
223 if (!daddr)
224 return -EINVAL;
225 nexthop = inet_opt->opt.faddr;
226 }
227
/* The ports used for this first lookup are remembered and later passed to
 * ip_route_newports() together with the ports in effect after hashing.
 */
228 orig_sport = inet->inet_sport;
229 orig_dport = usin->sin_port;
230 fl4 = &inet->cork.fl.u.ip4;
231 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
232 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
233 IPPROTO_TCP,
234 orig_sport, orig_dport, sk);
235 if (IS_ERR(rt)) {
236 err = PTR_ERR(rt);
237 if (err == -ENETUNREACH)
238 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
239 return err;
240 }
241
/* Multicast and broadcast routes are rejected for TCP. */
242 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
243 ip_rt_put(rt);
244 return -ENETUNREACH;
245 }
246
/* Without source routing, adopt the destination the route lookup left in
 * the flow.
 */
247 if (!inet_opt || !inet_opt->opt.srr)
248 daddr = fl4->daddr;
249
/* No source address set yet: take the one from the flow. */
250 if (!inet->inet_saddr)
251 inet->inet_saddr = fl4->saddr;
252 sk_rcv_saddr_set(sk, inet->inet_saddr);
253
254 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
255 /* Reset inherited state */
256 tp->rx_opt.ts_recent = 0;
257 tp->rx_opt.ts_recent_stamp = 0;
258 if (likely(!tp->repair))
259 WRITE_ONCE(tp->write_seq, 0);
260 }
261
262 inet->inet_dport = usin->sin_port;
263 sk_daddr_set(sk, daddr);
264
265 inet_csk(sk)->icsk_ext_hdr_len = 0;
266 if (inet_opt)
267 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
268
269 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
270
271 /* Socket identity is still unknown (sport may be zero).
272 * However we set state to SYN-SENT and not releasing socket
273 * lock select source port, enter ourselves into the hash tables and
274 * complete initialization after this.
275 */
276 tcp_set_state(sk, TCP_SYN_SENT);
277 err = inet_hash_connect(tcp_death_row, sk);
278 if (err)
279 goto failure;
280
281 sk_set_txhash(sk);
282
283 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
284 inet->inet_sport, inet->inet_dport, sk);
285 if (IS_ERR(rt)) {
286 err = PTR_ERR(rt);
/* Clear rt so the failure path passes NULL, not an ERR_PTR, to ip_rt_put(). */
287 rt = NULL;
288 goto failure;
289 }
290 /* OK, now commit destination to socket. */
291 sk->sk_gso_type = SKB_GSO_TCPV4;
292 sk_setup_caps(sk, &rt->dst);
/* rt is cleared so that the failure path's ip_rt_put() is passed NULL from
 * here on. NOTE(review): presumably the route reference now belongs to the
 * socket after sk_setup_caps() - confirm.
 */
293 rt = NULL;
294
295 if (likely(!tp->repair)) {
/* A non-zero write_seq (e.g. seeded by tcp_twsk_unique()) is kept;
 * otherwise a fresh sequence number is generated.
 */
296 if (!tp->write_seq)
297 WRITE_ONCE(tp->write_seq,
298 secure_tcp_seq(inet->inet_saddr,
299 inet->inet_daddr,
300 inet->inet_sport,
301 usin->sin_port));
302 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
303 inet->inet_saddr,
304 inet->inet_daddr);
305 }
306
307 inet->inet_id = prandom_u32();
308
/* If fastopen defers the connect, return the err it stored without calling
 * tcp_connect() here; otherwise a non-zero err it stored aborts the connect.
 */
309 if (tcp_fastopen_defer_connect(sk, &err))
310 return err;
311 if (err)
312 goto failure;
313
314 err = tcp_connect(sk);
315
316 if (err)
317 goto failure;
318
319 return 0;
320
321 failure:
322 /*
323 * This unhashes the socket and releases the local port,
324 * if necessary.
325 */
326 tcp_set_state(sk, TCP_CLOSE);
/* Only forget the source address when the user did not bind it explicitly. */
327 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
328 inet_reset_saddr(sk);
329 ip_rt_put(rt);
330 sk->sk_route_caps = 0;
331 inet->inet_dport = 0;
332 return err;
333 }
334 EXPORT_SYMBOL(tcp_v4_connect);
335
336 /*
337 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
338 * It can be called through tcp_release_cb() if socket was owned by user
339 * at the time tcp_v4_err() was called to handle ICMP message.
340 */
tcp_v4_mtu_reduced(struct sock * sk)341 void tcp_v4_mtu_reduced(struct sock *sk)
342 {
343 struct inet_sock *inet = inet_sk(sk);
344 struct dst_entry *dst;
345 u32 mtu;
346
347 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
348 return;
349 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
350 dst = inet_csk_update_pmtu(sk, mtu);
351 if (!dst)
352 return;
353
354 /* Something is about to be wrong... Remember soft error
355 * for the case, if this connection will not able to recover.
356 */
357 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
358 sk->sk_err_soft = EMSGSIZE;
359
360 mtu = dst_mtu(dst);
361
362 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
363 ip_sk_accept_pmtu(sk) &&
364 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
365 tcp_sync_mss(sk, mtu);
366
367 /* Resend the TCP packet because it's
368 * clear that the old packet has been
369 * dropped. This is the new "fast" path mtu
370 * discovery.
371 */
372 tcp_simple_retransmit(sk);
373 } /* else let the usual retransmit timer handle it */
374 }
375 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
376
do_redirect(struct sk_buff * skb,struct sock * sk)377 static void do_redirect(struct sk_buff *skb, struct sock *sk)
378 {
379 struct dst_entry *dst = __sk_dst_check(sk, 0);
380
381 if (dst)
382 dst->ops->redirect(dst, sk, skb);
383 }
384
385
386 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
tcp_req_err(struct sock * sk,u32 seq,bool abort)387 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
388 {
389 struct request_sock *req = inet_reqsk(sk);
390 struct net *net = sock_net(sk);
391
392 /* ICMPs are not backlogged, hence we cannot get
393 * an established socket here.
394 */
395 if (seq != tcp_rsk(req)->snt_isn) {
396 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
397 } else if (abort) {
398 /*
399 * Still in SYN_RECV, just remove it silently.
400 * There is no good way to pass the error to the newly
401 * created socket, and POSIX does not want network
402 * errors returned from accept().
403 */
404 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
405 tcp_listendrop(req->rsk_listener);
406 }
407 reqsk_put(req);
408 }
409 EXPORT_SYMBOL(tcp_req_err);
410
411 /* TCP-LD (RFC 6069) logic */
tcp_ld_RTO_revert(struct sock * sk,u32 seq)412 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
413 {
414 struct inet_connection_sock *icsk = inet_csk(sk);
415 struct tcp_sock *tp = tcp_sk(sk);
416 struct sk_buff *skb;
417 s32 remaining;
418 u32 delta_us;
419
420 if (sock_owned_by_user(sk))
421 return;
422
423 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
424 !icsk->icsk_backoff)
425 return;
426
427 skb = tcp_rtx_queue_head(sk);
428 if (WARN_ON_ONCE(!skb))
429 return;
430
431 icsk->icsk_backoff--;
432 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
433 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
434
435 tcp_mstamp_refresh(tp);
436 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
437 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
438
439 if (remaining > 0) {
440 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
441 remaining, TCP_RTO_MAX);
442 } else {
443 /* RTO revert clocked out retransmission.
444 * Will retransmit now.
445 */
446 tcp_retransmit_timer(sk);
447 }
448 }
449 EXPORT_SYMBOL(tcp_ld_RTO_revert);
450
451 /*
452 * This routine is called by the ICMP module when it gets some
453 * sort of error condition. If err < 0 then the socket should
454 * be closed and the error returned to the user. If err > 0
455 * it's just the icmp type << 8 | icmp code. After adjustment
456 * header points to the first 8 bytes of the tcp header. We need
457 * to find the appropriate port.
458 *
459 * The locking strategy used here is very "optimistic". When
460 * someone else accesses the socket the ICMP is just dropped
461 * and for some paths there is no check at all.
462 * A more general error queue to queue errors for later handling
463 * is probably better.
464 *
465 */
466
/*
 * tcp_v4_err - handle an ICMP error that quotes a TCP segment.
 * @skb: ICMP packet; skb->data is read as the quoted IP header, with the
 * quoted TCP header following at ihl * 4 bytes.
 * @info: ICMP-specific value; stored as tp->mtu_info for ICMP_FRAG_NEEDED
 * and passed on to ip_icmp_error().
 *
 * Returns -ENOENT when no socket matches the quoted segment, 0 otherwise.
 */
tcp_v4_err(struct sk_buff * skb,u32 info)467 int tcp_v4_err(struct sk_buff *skb, u32 info)
468 {
469 const struct iphdr *iph = (const struct iphdr *)skb->data;
470 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
471 struct tcp_sock *tp;
472 struct inet_sock *inet;
473 const int type = icmp_hdr(skb)->type;
474 const int code = icmp_hdr(skb)->code;
475 struct sock *sk;
476 struct request_sock *fastopen;
477 u32 seq, snd_una;
478 int err;
479 struct net *net = dev_net(skb->dev);
480
/* Look the socket up with the addresses and ports of the quoted segment. */
481 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
482 th->dest, iph->saddr, ntohs(th->source),
483 inet_iif(skb), 0);
484 if (!sk) {
485 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
486 return -ENOENT;
487 }
/* TIME-WAIT sockets: release the socket and ignore the error. */
488 if (sk->sk_state == TCP_TIME_WAIT) {
489 inet_twsk_put(inet_twsk(sk));
490 return 0;
491 }
492 seq = ntohl(th->seq);
/* Request sockets are handled by tcp_req_err(), which also drops the
 * reference. The third argument asks it to abort the request for
 * parameter problems, time exceeded and net/host unreachable.
 */
493 if (sk->sk_state == TCP_NEW_SYN_RECV) {
494 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
495 type == ICMP_TIME_EXCEEDED ||
496 (type == ICMP_DEST_UNREACH &&
497 (code == ICMP_NET_UNREACH ||
498 code == ICMP_HOST_UNREACH)));
499 return 0;
500 }
501
502 bh_lock_sock(sk);
503 /* If too many ICMPs get dropped on busy
504 * servers this needs to be solved differently.
505 * We do take care of PMTU discovery (RFC1191) special case :
506 * we can receive locally generated ICMP messages while socket is held.
507 */
/* This only updates the counter; processing continues and each later step
 * checks sock_owned_by_user() again where it matters.
 */
508 if (sock_owned_by_user(sk)) {
509 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
510 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
511 }
512 if (sk->sk_state == TCP_CLOSE)
513 goto out;
514
515 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
516 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
517 goto out;
518 }
519
520 tp = tcp_sk(sk);
521 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
522 fastopen = rcu_dereference(tp->fastopen_rsk);
523 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
/* Except for listeners, the quoted sequence number must lie between
 * snd_una and snd_nxt; otherwise the ICMP is counted and ignored.
 */
524 if (sk->sk_state != TCP_LISTEN &&
525 !between(seq, snd_una, tp->snd_nxt)) {
526 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
527 goto out;
528 }
529
/* First switch: map the ICMP type/code to an errno in err, or finish
 * handling right here via "goto out".
 */
530 switch (type) {
531 case ICMP_REDIRECT:
532 if (!sock_owned_by_user(sk))
533 do_redirect(skb, sk);
534 goto out;
535 case ICMP_SOURCE_QUENCH:
536 /* Just silently ignore these. */
537 goto out;
538 case ICMP_PARAMETERPROB:
539 err = EPROTO;
540 break;
541 case ICMP_DEST_UNREACH:
/* Codes above NR_ICMP_UNREACH are ignored before icmp_err_convert[] is indexed. */
542 if (code > NR_ICMP_UNREACH)
543 goto out;
544
545 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
546 /* We are not interested in TCP_LISTEN and open_requests
547 * (SYN-ACKs send out by Linux are always <576bytes so
548 * they should go through unfragmented).
549 */
550 if (sk->sk_state == TCP_LISTEN)
551 goto out;
552
553 WRITE_ONCE(tp->mtu_info, info);
554 if (!sock_owned_by_user(sk)) {
555 tcp_v4_mtu_reduced(sk);
556 } else {
/* Socket is owned by the user: flag the MTU update as deferred and, if the
 * flag was not already set, take a socket reference.
 */
557 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
558 sock_hold(sk);
559 }
560 goto out;
561 }
562
563 err = icmp_err_convert[code].errno;
564 /* check if this ICMP message allows revert of backoff.
565 * (see RFC 6069)
566 */
567 if (!fastopen &&
568 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
569 tcp_ld_RTO_revert(sk, seq);
570 break;
571 case ICMP_TIME_EXCEEDED:
572 err = EHOSTUNREACH;
573 break;
574 default:
575 goto out;
576 }
577
/* Second switch: sockets still in the handshake take the error as fatal;
 * every other state falls through to the connected-socket handling below.
 */
578 switch (sk->sk_state) {
579 case TCP_SYN_SENT:
580 case TCP_SYN_RECV:
581 /* Only in fast or simultaneous open. If a fast open socket is
582 * already accepted it is treated as a connected one below.
583 */
584 if (fastopen && !fastopen->sk)
585 break;
586
587 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
588
589 if (!sock_owned_by_user(sk)) {
590 sk->sk_err = err;
591
592 sk->sk_error_report(sk);
593
594 tcp_done(sk);
595 } else {
596 sk->sk_err_soft = err;
597 }
598 goto out;
599 }
600
601 /* If we've already connected we will keep trying
602 * until we time out, or the user gives up.
603 *
604 * rfc1122 4.2.3.9 allows to consider as hard errors
605 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
606 * but it is obsoleted by pmtu discovery).
607 *
608 * Note, that in modern internet, where routing is unreliable
609 * and in each dark corner broken firewalls sit, sending random
610 * errors ordered by their masters even this two messages finally lose
611 * their original sense (even Linux sends invalid PORT_UNREACHs)
612 *
613 * Now we are in compliance with RFCs.
614 * --ANK (980905)
615 */
616
617 inet = inet_sk(sk);
/* Report a hard error only if the socket is not owned by the user and has
 * recverr set; otherwise just remember it as a soft error.
 */
618 if (!sock_owned_by_user(sk) && inet->recverr) {
619 sk->sk_err = err;
620 sk->sk_error_report(sk);
621 } else { /* Only an error on timeout */
622 sk->sk_err_soft = err;
623 }
624
625 out:
626 bh_unlock_sock(sk);
627 sock_put(sk);
628 return 0;
629 }
630
/* Prepare @skb's TCP checksum fields for the address pair @saddr/@daddr.
 * The header's check field is seeded with the complemented result of
 * tcp_v4_check() over skb->len with a zero base sum, and
 * csum_start/csum_offset are set to locate that field.
 */
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *tcph = tcp_hdr(skb);

	skb->csum_offset = offsetof(struct tcphdr, check);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	tcph->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
}
639
640 /* This routine computes an IPv4 TCP checksum. */
tcp_v4_send_check(struct sock * sk,struct sk_buff * skb)641 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
642 {
643 const struct inet_sock *inet = inet_sk(sk);
644
645 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
646 }
647 EXPORT_SYMBOL(tcp_v4_send_check);
648
649 /*
650 * This routine will send an RST to the other tcp.
651 *
652 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
653 * for reset.
654 * Answer: if a packet caused RST, it is not for a socket
655 * existing in our system, if it is matched to a socket,
656 * it is just duplicate segment or bug in other side's TCP.
657 * So that we build reply only basing on parameters
658 * arrived with segment.
659 * Exception: precedence violation. We do not implement it in any case.
660 */
661
/*
 * tcp_v4_send_reset - answer @skb with a RST.
 * @sk: socket matched for @skb, or NULL. It need not be a full socket:
 * sk_fullsock() and TCP_TIME_WAIT are both tested below.
 * @skb: the segment being answered; addresses and ports are swapped.
 *
 * Nothing is sent when @skb is itself a RST, when there is no socket and
 * the packet's route type is not RTN_LOCAL, or (CONFIG_TCP_MD5SIG, no full
 * socket, segment carries an MD5 option) when no listener or key is found
 * or the segment's signature does not verify.
 */
tcp_v4_send_reset(const struct sock * sk,struct sk_buff * skb)662 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
663 {
664 const struct tcphdr *th = tcp_hdr(skb);
665 struct {
666 struct tcphdr th;
667 #ifdef CONFIG_TCP_MD5SIG
668 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
669 #endif
670 } rep;
671 struct ip_reply_arg arg;
672 #ifdef CONFIG_TCP_MD5SIG
673 struct tcp_md5sig_key *key = NULL;
674 const __u8 *hash_location = NULL;
675 unsigned char newhash[16];
676 int genhash;
677 struct sock *sk1 = NULL;
678 #endif
679 u64 transmit_time = 0;
680 struct sock *ctl_sk;
681 struct net *net;
682
683 /* Never send a reset in response to a reset. */
684 if (th->rst)
685 return;
686
687 /* If sk not NULL, it means we did a successful lookup and incoming
688 * route had to be correct. prequeue might have dropped our dst.
689 */
690 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
691 return;
692
693 /* Swap the send and the receive. */
694 memset(&rep, 0, sizeof(rep));
695 rep.th.dest = th->source;
696 rep.th.source = th->dest;
697 rep.th.doff = sizeof(struct tcphdr) / 4;
698 rep.th.rst = 1;
699
/* If the segment carried an ACK, the RST uses that ACK number as its
 * sequence number. Otherwise send RST+ACK acknowledging everything the
 * segment occupied: SYN and FIN count one each, plus the payload length.
 */
700 if (th->ack) {
701 rep.th.seq = th->ack_seq;
702 } else {
703 rep.th.ack = 1;
704 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
705 skb->len - (th->doff << 2));
706 }
707
708 memset(&arg, 0, sizeof(arg));
709 arg.iov[0].iov_base = (unsigned char *)&rep;
710 arg.iov[0].iov_len = sizeof(rep.th);
711
712 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
713 #ifdef CONFIG_TCP_MD5SIG
714 rcu_read_lock();
715 hash_location = tcp_parse_md5sig_option(th);
/* With a full socket, its own key table is searched by the peer address;
 * no verification of the incoming signature happens in this branch.
 */
716 if (sk && sk_fullsock(sk)) {
717 const union tcp_md5_addr *addr;
718 int l3index;
719
720 /* sdif set, means packet ingressed via a device
721 * in an L3 domain and inet_iif is set to it.
722 */
723 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
724 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
725 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
726 } else if (hash_location) {
727 const union tcp_md5_addr *addr;
728 int sdif = tcp_v4_sdif(skb);
729 int dif = inet_iif(skb);
730 int l3index;
731
732 /*
733 * active side is lost. Try to find listening socket through
734 * source port, and then find md5 key through listening socket.
735 * we are not loose security here:
736 * Incoming packet is checked with md5 hash with finding key,
737 * no RST generated if md5 hash doesn't match.
738 */
739 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
740 ip_hdr(skb)->saddr,
741 th->source, ip_hdr(skb)->daddr,
742 ntohs(th->source), dif, sdif);
743 /* don't send rst if it can't find key */
744 if (!sk1)
745 goto out;
746
747 /* sdif set, means packet ingressed via a device
748 * in an L3 domain and dif is set to it.
749 */
750 l3index = sdif ? dif : 0;
751 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
752 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
753 if (!key)
754 goto out;
755
756
/* Recompute the segment's MD5 with the key found and compare it with the
 * 16 bytes carried in the option; stay silent on any mismatch or error.
 */
757 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
758 if (genhash || memcmp(hash_location, newhash, 16) != 0)
759 goto out;
760
761 }
762
/* Sign the reply: opt[0] is NOP, NOP, MD5SIG kind, length; the digest is
 * written starting at opt[1].
 */
763 if (key) {
764 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
765 (TCPOPT_NOP << 16) |
766 (TCPOPT_MD5SIG << 8) |
767 TCPOLEN_MD5SIG);
768 /* Update length and the length the header thinks exists */
769 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
770 rep.th.doff = arg.iov[0].iov_len / 4;
771
772 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
773 key, ip_hdr(skb)->saddr,
774 ip_hdr(skb)->daddr, &rep.th);
775 }
776 #endif
777 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
778 ip_hdr(skb)->saddr, /* XXX */
779 arg.iov[0].iov_len, IPPROTO_TCP, 0);
780 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
781 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
782
783 /* When socket is gone, all binding information is lost.
784 * routing might fail in this case. No choice here, if we choose to force
785 * input interface, we will misroute in case of asymmetric route.
786 */
787 if (sk) {
788 arg.bound_dev_if = sk->sk_bound_dev_if;
789 if (sk_fullsock(sk))
790 trace_tcp_send_reset(sk, skb);
791 }
792
/* Compile-time check that the bound-device field sits at the same offset
 * in struct sock and struct inet_timewait_sock, so the sk_bound_dev_if
 * read above is also valid when sk is a TIME-WAIT socket.
 */
793 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
794 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
795
796 arg.tos = ip_hdr(skb)->tos;
797 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
/* With bottom halves disabled, borrow this CPU's control socket: point it
 * at net for the transmission and set it back to init_net afterwards.
 */
798 local_bh_disable();
799 ctl_sk = this_cpu_read(ipv4_tcp_sk);
800 sock_net_set(ctl_sk, net);
801 if (sk) {
802 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
803 inet_twsk(sk)->tw_mark : sk->sk_mark;
804 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
805 inet_twsk(sk)->tw_priority : sk->sk_priority;
806 transmit_time = tcp_transmit_time(sk);
807 xfrm_sk_clone_policy(ctl_sk, sk);
808 } else {
809 ctl_sk->sk_mark = 0;
810 ctl_sk->sk_priority = 0;
811 }
812 ip_send_unicast_reply(ctl_sk,
813 skb, &TCP_SKB_CB(skb)->header.h4.opt,
814 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
815 &arg, arg.iov[0].iov_len,
816 transmit_time);
817
818 xfrm_sk_free_policy(ctl_sk);
819 sock_net_set(ctl_sk, &init_net);
820 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
821 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
822 local_bh_enable();
823
/* The label exists only with MD5 support, where it pairs with the
 * rcu_read_lock() taken above.
 */
824 #ifdef CONFIG_TCP_MD5SIG
825 out:
826 rcu_read_unlock();
827 #endif
828 }
829
830 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
831 outside socket context is ugly, certainly. What can I do?
832 */
833
/*
 * tcp_v4_send_ack - build and send a bare ACK in reply to @skb through the
 * per-CPU control socket.
 * @sk: socket the reply is sent on behalf of. It is dereferenced
 * unconditionally and may be a TIME-WAIT socket (sk_state is tested to
 * pick mark and priority).
 * @skb: segment being answered; its addresses and ports are swapped.
 * @seq: sequence number for the reply.
 * @ack: acknowledgment number for the reply.
 * @win: window field value, truncated to 16 bits by htons().
 * @tsval: timestamp value; only used when @tsecr is non-zero.
 * @tsecr: timestamp echo; a non-zero value enables the timestamp option.
 * @oif: if non-zero, copied to arg.bound_dev_if.
 * @key: MD5 key or NULL; only looked at under CONFIG_TCP_MD5SIG.
 * @reply_flags: copied to arg.flags.
 * @tos: copied to arg.tos.
 */
tcp_v4_send_ack(const struct sock * sk,struct sk_buff * skb,u32 seq,u32 ack,u32 win,u32 tsval,u32 tsecr,int oif,struct tcp_md5sig_key * key,int reply_flags,u8 tos)834 static void tcp_v4_send_ack(const struct sock *sk,
835 struct sk_buff *skb, u32 seq, u32 ack,
836 u32 win, u32 tsval, u32 tsecr, int oif,
837 struct tcp_md5sig_key *key,
838 int reply_flags, u8 tos)
839 {
840 const struct tcphdr *th = tcp_hdr(skb);
841 struct {
842 struct tcphdr th;
843 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
844 #ifdef CONFIG_TCP_MD5SIG
845 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
846 #endif
847 ];
848 } rep;
849 struct net *net = sock_net(sk);
850 struct ip_reply_arg arg;
851 struct sock *ctl_sk;
852 u64 transmit_time;
853
/* Only the TCP header is cleared; option words are written explicitly
 * below for exactly the length that is sent.
 */
854 memset(&rep.th, 0, sizeof(struct tcphdr));
855 memset(&arg, 0, sizeof(arg));
856
857 arg.iov[0].iov_base = (unsigned char *)&rep;
858 arg.iov[0].iov_len = sizeof(rep.th);
/* Timestamp option occupies opt[0..2]: NOP, NOP, kind, length, then tsval
 * and tsecr.
 */
859 if (tsecr) {
860 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
861 (TCPOPT_TIMESTAMP << 8) |
862 TCPOLEN_TIMESTAMP);
863 rep.opt[1] = htonl(tsval);
864 rep.opt[2] = htonl(tsecr);
865 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
866 }
867
868 /* Swap the send and the receive. */
869 rep.th.dest = th->source;
870 rep.th.source = th->dest;
871 rep.th.doff = arg.iov[0].iov_len / 4;
872 rep.th.seq = htonl(seq);
873 rep.th.ack_seq = htonl(ack);
874 rep.th.ack = 1;
875 rep.th.window = htons(win);
876
877 #ifdef CONFIG_TCP_MD5SIG
878 if (key) {
/* The MD5 option follows the three timestamp words when those are present. */
879 int offset = (tsecr) ? 3 : 0;
880
881 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
882 (TCPOPT_NOP << 16) |
883 (TCPOPT_MD5SIG << 8) |
884 TCPOLEN_MD5SIG);
885 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
886 rep.th.doff = arg.iov[0].iov_len/4;
887
888 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
889 key, ip_hdr(skb)->saddr,
890 ip_hdr(skb)->daddr, &rep.th);
891 }
892 #endif
893 arg.flags = reply_flags;
894 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
895 ip_hdr(skb)->saddr, /* XXX */
896 arg.iov[0].iov_len, IPPROTO_TCP, 0);
897 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
898 if (oif)
899 arg.bound_dev_if = oif;
900 arg.tos = tos;
901 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
/* With bottom halves disabled, borrow this CPU's control socket: point it
 * at net for the transmission and set it back to init_net afterwards.
 */
902 local_bh_disable();
903 ctl_sk = this_cpu_read(ipv4_tcp_sk);
904 sock_net_set(ctl_sk, net);
905 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
906 inet_twsk(sk)->tw_mark : sk->sk_mark;
907 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
908 inet_twsk(sk)->tw_priority : sk->sk_priority;
909 transmit_time = tcp_transmit_time(sk);
910 ip_send_unicast_reply(ctl_sk,
911 skb, &TCP_SKB_CB(skb)->header.h4.opt,
912 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
913 &arg, arg.iov[0].iov_len,
914 transmit_time);
915
916 sock_net_set(ctl_sk, &init_net);
917 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
918 local_bh_enable();
919 }
920
tcp_v4_timewait_ack(struct sock * sk,struct sk_buff * skb)921 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
922 {
923 struct inet_timewait_sock *tw = inet_twsk(sk);
924 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
925
926 tcp_v4_send_ack(sk, skb,
927 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
928 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
929 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
930 tcptw->tw_ts_recent,
931 tw->tw_bound_dev_if,
932 tcp_twsk_md5_key(tcptw),
933 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
934 tw->tw_tos
935 );
936
937 inet_twsk_put(tw);
938 }
939
tcp_v4_reqsk_send_ack(const struct sock * sk,struct sk_buff * skb,struct request_sock * req)940 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
941 struct request_sock *req)
942 {
943 const union tcp_md5_addr *addr;
944 int l3index;
945
946 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
947 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
948 */
949 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
950 tcp_sk(sk)->snd_nxt;
951
952 /* RFC 7323 2.3
953 * The window field (SEG.WND) of every outgoing segment, with the
954 * exception of <SYN> segments, MUST be right-shifted by
955 * Rcv.Wind.Shift bits:
956 */
957 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
958 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
959 tcp_v4_send_ack(sk, skb, seq,
960 tcp_rsk(req)->rcv_nxt,
961 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
962 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
963 READ_ONCE(req->ts_recent),
964 0,
965 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
966 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
967 ip_hdr(skb)->tos);
968 }
969
970 /*
971 * Send a SYN-ACK after having received a SYN.
972 * This still operates on a request_sock only, not on a big
973 * socket.
974 */
tcp_v4_send_synack(const struct sock * sk,struct dst_entry * dst,struct flowi * fl,struct request_sock * req,struct tcp_fastopen_cookie * foc,enum tcp_synack_type synack_type,struct sk_buff * syn_skb)975 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
976 struct flowi *fl,
977 struct request_sock *req,
978 struct tcp_fastopen_cookie *foc,
979 enum tcp_synack_type synack_type,
980 struct sk_buff *syn_skb)
981 {
982 const struct inet_request_sock *ireq = inet_rsk(req);
983 struct flowi4 fl4;
984 int err = -1;
985 struct sk_buff *skb;
986 u8 tos;
987
988 /* First, grab a route. */
989 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
990 return -1;
991
992 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
993
994 if (skb) {
995 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
996
997 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
998 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
999 (inet_sk(sk)->tos & INET_ECN_MASK) :
1000 inet_sk(sk)->tos;
1001
1002 if (!INET_ECN_is_capable(tos) &&
1003 tcp_bpf_ca_needs_ecn((struct sock *)req))
1004 tos |= INET_ECN_ECT_0;
1005
1006 rcu_read_lock();
1007 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1008 ireq->ir_rmt_addr,
1009 rcu_dereference(ireq->ireq_opt),
1010 tos);
1011 rcu_read_unlock();
1012 err = net_xmit_eval(err);
1013 }
1014
1015 return err;
1016 }
1017
1018 /*
1019 * IPv4 request_sock destructor.
1020 */
tcp_v4_reqsk_destructor(struct request_sock * req)1021 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1022 {
1023 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1024 }
1025
1026 #ifdef CONFIG_TCP_MD5SIG
1027 /*
1028 * RFC2385 MD5 checksumming requires a mapping of
1029 * IP address->MD5 Key.
1030 * We need to maintain these in the sk structure.
1031 */
1032
/* Static branch key for TCP MD5 support; defined in the "false" state and
 * exported so that code outside this file can reference it.
 * NOTE(review): the place where the key gets enabled is not visible in this
 * part of the file - confirm before relying on it.
 */
DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);
1035
better_md5_match(struct tcp_md5sig_key * old,struct tcp_md5sig_key * new)1036 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1037 {
1038 if (!old)
1039 return true;
1040
1041 /* l3index always overrides non-l3index */
1042 if (old->l3index && new->l3index == 0)
1043 return false;
1044 if (old->l3index == 0 && new->l3index)
1045 return true;
1046
1047 return old->prefixlen < new->prefixlen;
1048 }
1049
1050 /* Find the Key structure for an address. */
__tcp_md5_do_lookup(const struct sock * sk,int l3index,const union tcp_md5_addr * addr,int family)1051 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1052 const union tcp_md5_addr *addr,
1053 int family)
1054 {
1055 const struct tcp_sock *tp = tcp_sk(sk);
1056 struct tcp_md5sig_key *key;
1057 const struct tcp_md5sig_info *md5sig;
1058 __be32 mask;
1059 struct tcp_md5sig_key *best_match = NULL;
1060 bool match;
1061
1062 /* caller either holds rcu_read_lock() or socket lock */
1063 md5sig = rcu_dereference_check(tp->md5sig_info,
1064 lockdep_sock_is_held(sk));
1065 if (!md5sig)
1066 return NULL;
1067
1068 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1069 lockdep_sock_is_held(sk)) {
1070 if (key->family != family)
1071 continue;
1072 if (key->l3index && key->l3index != l3index)
1073 continue;
1074 if (family == AF_INET) {
1075 mask = inet_make_mask(key->prefixlen);
1076 match = (key->addr.a4.s_addr & mask) ==
1077 (addr->a4.s_addr & mask);
1078 #if IS_ENABLED(CONFIG_IPV6)
1079 } else if (family == AF_INET6) {
1080 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1081 key->prefixlen);
1082 #endif
1083 } else {
1084 match = false;
1085 }
1086
1087 if (match && better_md5_match(best_match, key))
1088 best_match = key;
1089 }
1090 return best_match;
1091 }
1092 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1093
tcp_md5_do_lookup_exact(const struct sock * sk,const union tcp_md5_addr * addr,int family,u8 prefixlen,int l3index)1094 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1095 const union tcp_md5_addr *addr,
1096 int family, u8 prefixlen,
1097 int l3index)
1098 {
1099 const struct tcp_sock *tp = tcp_sk(sk);
1100 struct tcp_md5sig_key *key;
1101 unsigned int size = sizeof(struct in_addr);
1102 const struct tcp_md5sig_info *md5sig;
1103
1104 /* caller either holds rcu_read_lock() or socket lock */
1105 md5sig = rcu_dereference_check(tp->md5sig_info,
1106 lockdep_sock_is_held(sk));
1107 if (!md5sig)
1108 return NULL;
1109 #if IS_ENABLED(CONFIG_IPV6)
1110 if (family == AF_INET6)
1111 size = sizeof(struct in6_addr);
1112 #endif
1113 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1114 lockdep_sock_is_held(sk)) {
1115 if (key->family != family)
1116 continue;
1117 if (key->l3index != l3index)
1118 continue;
1119 if (!memcmp(&key->addr, addr, size) &&
1120 key->prefixlen == prefixlen)
1121 return key;
1122 }
1123 return NULL;
1124 }
1125
tcp_v4_md5_lookup(const struct sock * sk,const struct sock * addr_sk)1126 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1127 const struct sock *addr_sk)
1128 {
1129 const union tcp_md5_addr *addr;
1130 int l3index;
1131
1132 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1133 addr_sk->sk_bound_dev_if);
1134 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1135 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1136 }
1137 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1138
1139 /* This can be called on a newly created socket, from other files */
tcp_md5_do_add(struct sock * sk,const union tcp_md5_addr * addr,int family,u8 prefixlen,int l3index,const u8 * newkey,u8 newkeylen,gfp_t gfp)1140 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1141 int family, u8 prefixlen, int l3index,
1142 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1143 {
1144 /* Add Key to the list */
1145 struct tcp_md5sig_key *key;
1146 struct tcp_sock *tp = tcp_sk(sk);
1147 struct tcp_md5sig_info *md5sig;
1148
1149 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1150 if (key) {
1151 /* Pre-existing entry - just update that one.
1152 * Note that the key might be used concurrently.
1153 * data_race() is telling kcsan that we do not care of
1154 * key mismatches, since changing MD5 key on live flows
1155 * can lead to packet drops.
1156 */
1157 data_race(memcpy(key->key, newkey, newkeylen));
1158
1159 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1160 * Also note that a reader could catch new key->keylen value
1161 * but old key->key[], this is the reason we use __GFP_ZERO
1162 * at sock_kmalloc() time below these lines.
1163 */
1164 WRITE_ONCE(key->keylen, newkeylen);
1165
1166 return 0;
1167 }
1168
1169 md5sig = rcu_dereference_protected(tp->md5sig_info,
1170 lockdep_sock_is_held(sk));
1171 if (!md5sig) {
1172 md5sig = kmalloc(sizeof(*md5sig), gfp);
1173 if (!md5sig)
1174 return -ENOMEM;
1175
1176 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1177 INIT_HLIST_HEAD(&md5sig->head);
1178 rcu_assign_pointer(tp->md5sig_info, md5sig);
1179 }
1180
1181 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1182 if (!key)
1183 return -ENOMEM;
1184 if (!tcp_alloc_md5sig_pool()) {
1185 sock_kfree_s(sk, key, sizeof(*key));
1186 return -ENOMEM;
1187 }
1188
1189 memcpy(key->key, newkey, newkeylen);
1190 key->keylen = newkeylen;
1191 key->family = family;
1192 key->prefixlen = prefixlen;
1193 key->l3index = l3index;
1194 memcpy(&key->addr, addr,
1195 (family == AF_INET6) ? sizeof(struct in6_addr) :
1196 sizeof(struct in_addr));
1197 hlist_add_head_rcu(&key->node, &md5sig->head);
1198 return 0;
1199 }
1200 EXPORT_SYMBOL(tcp_md5_do_add);
1201
/* Remove the key that matches the arguments exactly.
 * Returns 0 on success, -ENOENT when no such key is configured.
 */
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index)
{
	struct tcp_md5sig_key *victim =
		tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);

	if (!victim)
		return -ENOENT;

	/* Unlink, give the bytes back to the socket's option memory
	 * accounting, and free after an RCU grace period.
	 */
	hlist_del_rcu(&victim->node);
	atomic_sub(sizeof(*victim), &sk->sk_omem_alloc);
	kfree_rcu(victim, rcu);

	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);
1216
tcp_clear_md5_list(struct sock * sk)1217 static void tcp_clear_md5_list(struct sock *sk)
1218 {
1219 struct tcp_sock *tp = tcp_sk(sk);
1220 struct tcp_md5sig_key *key;
1221 struct hlist_node *n;
1222 struct tcp_md5sig_info *md5sig;
1223
1224 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1225
1226 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1227 hlist_del_rcu(&key->node);
1228 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1229 kfree_rcu(key, rcu);
1230 }
1231 }
1232
/* Handle the TCP_MD5SIG / TCP_MD5SIG_EXT socket options for IPv4.
 *
 * Copies a struct tcp_md5sig from @optval, validates it, and then either
 * deletes the key (key length 0) or adds/updates it.
 * Returns -EINVAL for malformed input, -EFAULT when the copy fails, or
 * the result of tcp_md5_do_del() / tcp_md5_do_add().
 */
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	/* The flags are honoured only for the extended option. */
	if (optname == TCP_MD5SIG_EXT) {
		if (cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
			prefixlen = cmd.tcpm_prefixlen;
			if (prefixlen > 32)
				return -EINVAL;
		}

		if (cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
			struct net_device *dev;

			rcu_read_lock();
			dev = dev_get_by_index_rcu(sock_net(sk),
						   cmd.tcpm_ifindex);
			if (dev && netif_is_l3_master(dev))
				l3index = dev->ifindex;
			rcu_read_unlock();

			/* Only set/not-set is tested outside of RCU, which
			 * is fine; right now the device MUST be an L3 master.
			 */
			if (!dev || !l3index)
				return -EINVAL;
		}
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	/* An empty key is a request to delete. */
	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}
1287
/* Feed the IPv4 pseudo-header followed by a copy of the fixed TCP header
 * (checksum field zeroed) into the running hash.  Both are assembled in
 * the pool's scratch area.  @nbytes is stored as the pseudo-header length.
 * Returns the result of crypto_ahash_update().
 */
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *pseudo = hp->scratch;
	struct tcphdr *th_copy = (struct tcphdr *)(pseudo + 1);
	const unsigned int total = sizeof(*pseudo) + sizeof(*th);
	struct scatterlist sg;

	pseudo->saddr = saddr;
	pseudo->daddr = daddr;
	pseudo->pad = 0;
	pseudo->protocol = IPPROTO_TCP;
	pseudo->len = cpu_to_be16(nbytes);

	/* The TCP header is hashed with its checksum field cleared. */
	memcpy(th_copy, th, sizeof(*th));
	th_copy->check = 0;

	sg_init_one(&sg, pseudo, total);
	ahash_request_set_crypt(hp->md5_req, &sg, NULL, total);

	return crypto_ahash_update(hp->md5_req);
}
1312
/* Compute the MD5 signature over pseudo-header, TCP header and @key only
 * (no payload) into the 16 bytes at @md5_hash.
 * Returns 0 on success; on any failure @md5_hash is zeroed and 1 returned.
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	int failed = 1;

	hp = tcp_get_md5sig_pool();
	if (hp) {
		req = hp->md5_req;

		/* Each stage runs only when the previous one succeeded. */
		if (!crypto_ahash_init(req) &&
		    !tcp_v4_md5_hash_headers(hp, daddr, saddr, th,
					     th->doff << 2) &&
		    !tcp_md5_hash_key(hp, key)) {
			ahash_request_set_crypt(req, NULL, md5_hash, 0);
			if (!crypto_ahash_final(req))
				failed = 0;
		}

		tcp_put_md5sig_pool();
	}

	if (failed)
		memset(md5_hash, 0, 16);

	return failed;
}
1343
tcp_v4_md5_hash_skb(char * md5_hash,const struct tcp_md5sig_key * key,const struct sock * sk,const struct sk_buff * skb)1344 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1345 const struct sock *sk,
1346 const struct sk_buff *skb)
1347 {
1348 struct tcp_md5sig_pool *hp;
1349 struct ahash_request *req;
1350 const struct tcphdr *th = tcp_hdr(skb);
1351 __be32 saddr, daddr;
1352
1353 if (sk) { /* valid for establish/request sockets */
1354 saddr = sk->sk_rcv_saddr;
1355 daddr = sk->sk_daddr;
1356 } else {
1357 const struct iphdr *iph = ip_hdr(skb);
1358 saddr = iph->saddr;
1359 daddr = iph->daddr;
1360 }
1361
1362 hp = tcp_get_md5sig_pool();
1363 if (!hp)
1364 goto clear_hash_noput;
1365 req = hp->md5_req;
1366
1367 if (crypto_ahash_init(req))
1368 goto clear_hash;
1369
1370 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1371 goto clear_hash;
1372 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1373 goto clear_hash;
1374 if (tcp_md5_hash_key(hp, key))
1375 goto clear_hash;
1376 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1377 if (crypto_ahash_final(req))
1378 goto clear_hash;
1379
1380 tcp_put_md5sig_pool();
1381 return 0;
1382
1383 clear_hash:
1384 tcp_put_md5sig_pool();
1385 clear_hash_noput:
1386 memset(md5_hash, 0, 16);
1387 return 1;
1388 }
1389 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1390
1391 #endif
1392
1393 /* Called with rcu_read_lock() */
tcp_v4_inbound_md5_hash(const struct sock * sk,const struct sk_buff * skb,int dif,int sdif)1394 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1395 const struct sk_buff *skb,
1396 int dif, int sdif)
1397 {
1398 #ifdef CONFIG_TCP_MD5SIG
1399 /*
1400 * This gets called for each TCP segment that arrives
1401 * so we want to be efficient.
1402 * We have 3 drop cases:
1403 * o No MD5 hash and one expected.
1404 * o MD5 hash and we're not expecting one.
1405 * o MD5 hash and its wrong.
1406 */
1407 const __u8 *hash_location = NULL;
1408 struct tcp_md5sig_key *hash_expected;
1409 const struct iphdr *iph = ip_hdr(skb);
1410 const struct tcphdr *th = tcp_hdr(skb);
1411 const union tcp_md5_addr *addr;
1412 unsigned char newhash[16];
1413 int genhash, l3index;
1414
1415 /* sdif set, means packet ingressed via a device
1416 * in an L3 domain and dif is set to the l3mdev
1417 */
1418 l3index = sdif ? dif : 0;
1419
1420 addr = (union tcp_md5_addr *)&iph->saddr;
1421 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1422 hash_location = tcp_parse_md5sig_option(th);
1423
1424 /* We've parsed the options - do we have a hash? */
1425 if (!hash_expected && !hash_location)
1426 return false;
1427
1428 if (hash_expected && !hash_location) {
1429 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1430 return true;
1431 }
1432
1433 if (!hash_expected && hash_location) {
1434 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1435 return true;
1436 }
1437
1438 /* Okay, so this is hash_expected and hash_location -
1439 * so we need to calculate the checksum.
1440 */
1441 genhash = tcp_v4_md5_hash_skb(newhash,
1442 hash_expected,
1443 NULL, skb);
1444
1445 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1446 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1447 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1448 &iph->saddr, ntohs(th->source),
1449 &iph->daddr, ntohs(th->dest),
1450 genhash ? " tcp_v4_calc_md5_hash failed"
1451 : "", l3index);
1452 return true;
1453 }
1454 return false;
1455 #endif
1456 return false;
1457 }
1458
tcp_v4_init_req(struct request_sock * req,const struct sock * sk_listener,struct sk_buff * skb)1459 static void tcp_v4_init_req(struct request_sock *req,
1460 const struct sock *sk_listener,
1461 struct sk_buff *skb)
1462 {
1463 struct inet_request_sock *ireq = inet_rsk(req);
1464 struct net *net = sock_net(sk_listener);
1465
1466 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1467 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1468 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1469 }
1470
tcp_v4_route_req(const struct sock * sk,struct flowi * fl,const struct request_sock * req)1471 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1472 struct flowi *fl,
1473 const struct request_sock *req)
1474 {
1475 return inet_csk_route_req(sk, &fl->u.ip4, req);
1476 }
1477
1478 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1479 .family = PF_INET,
1480 .obj_size = sizeof(struct tcp_request_sock),
1481 .rtx_syn_ack = tcp_rtx_synack,
1482 .send_ack = tcp_v4_reqsk_send_ack,
1483 .destructor = tcp_v4_reqsk_destructor,
1484 .send_reset = tcp_v4_send_reset,
1485 .syn_ack_timeout = tcp_syn_ack_timeout,
1486 };
1487
1488 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1489 .mss_clamp = TCP_MSS_DEFAULT,
1490 #ifdef CONFIG_TCP_MD5SIG
1491 .req_md5_lookup = tcp_v4_md5_lookup,
1492 .calc_md5_hash = tcp_v4_md5_hash_skb,
1493 #endif
1494 .init_req = tcp_v4_init_req,
1495 #ifdef CONFIG_SYN_COOKIES
1496 .cookie_init_seq = cookie_v4_init_sequence,
1497 #endif
1498 .route_req = tcp_v4_route_req,
1499 .init_seq = tcp_v4_init_seq,
1500 .init_ts_off = tcp_v4_init_ts_off,
1501 .send_synack = tcp_v4_send_synack,
1502 };
1503
tcp_v4_conn_request(struct sock * sk,struct sk_buff * skb)1504 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1505 {
1506 /* Never answer to SYNs send to broadcast or multicast */
1507 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1508 goto drop;
1509
1510 return tcp_conn_request(&tcp_request_sock_ops,
1511 &tcp_request_sock_ipv4_ops, sk, skb);
1512
1513 drop:
1514 tcp_listendrop(sk);
1515 return 0;
1516 }
1517 EXPORT_SYMBOL(tcp_v4_conn_request);
1518
1519
1520 /*
1521 * The three way handshake has completed - we got a valid synack -
1522 * now create the new socket.
1523 */
tcp_v4_syn_recv_sock(const struct sock * sk,struct sk_buff * skb,struct request_sock * req,struct dst_entry * dst,struct request_sock * req_unhash,bool * own_req)1524 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1525 struct request_sock *req,
1526 struct dst_entry *dst,
1527 struct request_sock *req_unhash,
1528 bool *own_req)
1529 {
1530 struct inet_request_sock *ireq;
1531 bool found_dup_sk = false;
1532 struct inet_sock *newinet;
1533 struct tcp_sock *newtp;
1534 struct sock *newsk;
1535 #ifdef CONFIG_TCP_MD5SIG
1536 const union tcp_md5_addr *addr;
1537 struct tcp_md5sig_key *key;
1538 int l3index;
1539 #endif
1540 struct ip_options_rcu *inet_opt;
1541
1542 if (sk_acceptq_is_full(sk))
1543 goto exit_overflow;
1544
1545 newsk = tcp_create_openreq_child(sk, req, skb);
1546 if (!newsk)
1547 goto exit_nonewsk;
1548
1549 newsk->sk_gso_type = SKB_GSO_TCPV4;
1550 inet_sk_rx_dst_set(newsk, skb);
1551
1552 newtp = tcp_sk(newsk);
1553 newinet = inet_sk(newsk);
1554 ireq = inet_rsk(req);
1555 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1556 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1557 newsk->sk_bound_dev_if = ireq->ir_iif;
1558 newinet->inet_saddr = ireq->ir_loc_addr;
1559 inet_opt = rcu_dereference(ireq->ireq_opt);
1560 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1561 newinet->mc_index = inet_iif(skb);
1562 newinet->mc_ttl = ip_hdr(skb)->ttl;
1563 newinet->rcv_tos = ip_hdr(skb)->tos;
1564 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1565 if (inet_opt)
1566 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1567 newinet->inet_id = prandom_u32();
1568
1569 /* Set ToS of the new socket based upon the value of incoming SYN.
1570 * ECT bits are set later in tcp_init_transfer().
1571 */
1572 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1573 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1574
1575 if (!dst) {
1576 dst = inet_csk_route_child_sock(sk, newsk, req);
1577 if (!dst)
1578 goto put_and_exit;
1579 } else {
1580 /* syncookie case : see end of cookie_v4_check() */
1581 }
1582 sk_setup_caps(newsk, dst);
1583
1584 tcp_ca_openreq_child(newsk, dst);
1585
1586 tcp_sync_mss(newsk, dst_mtu(dst));
1587 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1588
1589 tcp_initialize_rcv_mss(newsk);
1590
1591 #ifdef CONFIG_TCP_MD5SIG
1592 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1593 /* Copy over the MD5 key from the original socket */
1594 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1595 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1596 if (key) {
1597 /*
1598 * We're using one, so create a matching key
1599 * on the newsk structure. If we fail to get
1600 * memory, then we end up not copying the key
1601 * across. Shucks.
1602 */
1603 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1604 key->key, key->keylen, GFP_ATOMIC);
1605 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1606 }
1607 #endif
1608
1609 if (__inet_inherit_port(sk, newsk) < 0)
1610 goto put_and_exit;
1611 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1612 &found_dup_sk);
1613 if (likely(*own_req)) {
1614 tcp_move_syn(newtp, req);
1615 ireq->ireq_opt = NULL;
1616 } else {
1617 newinet->inet_opt = NULL;
1618
1619 if (!req_unhash && found_dup_sk) {
1620 /* This code path should only be executed in the
1621 * syncookie case only
1622 */
1623 bh_unlock_sock(newsk);
1624 sock_put(newsk);
1625 newsk = NULL;
1626 }
1627 }
1628 return newsk;
1629
1630 exit_overflow:
1631 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1632 exit_nonewsk:
1633 dst_release(dst);
1634 exit:
1635 tcp_listendrop(sk);
1636 return NULL;
1637 put_and_exit:
1638 newinet->inet_opt = NULL;
1639 inet_csk_prepare_forced_close(newsk);
1640 tcp_done(newsk);
1641 goto exit;
1642 }
1643 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1644
/* With CONFIG_SYN_COOKIES, a segment without the SYN flag is passed to
 * cookie_v4_check() and its result returned.  In every other case @sk is
 * returned unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (tcp_hdr(skb)->syn)
		return sk;

	return cookie_v4_check(sk, skb);
#else
	return sk;
#endif
}
1655
tcp_v4_get_syncookie(struct sock * sk,struct iphdr * iph,struct tcphdr * th,u32 * cookie)1656 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1657 struct tcphdr *th, u32 *cookie)
1658 {
1659 u16 mss = 0;
1660 #ifdef CONFIG_SYN_COOKIES
1661 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1662 &tcp_request_sock_ipv4_ops, sk, th);
1663 if (mss) {
1664 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1665 tcp_synq_overflow(sk);
1666 }
1667 #endif
1668 return mss;
1669 }
1670
1671 /* The socket must have it's spinlock held when we get
1672 * here, unless it is a TCP_LISTEN socket.
1673 *
1674 * We have a potential double-lock case here, so even when
1675 * doing backlog processing we use the BH locking scheme.
1676 * This is because we cannot sleep with the original spinlock
1677 * held.
1678 */
tcp_v4_do_rcv(struct sock * sk,struct sk_buff * skb)1679 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1680 {
1681 struct sock *rsk;
1682
1683 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1684 struct dst_entry *dst;
1685
1686 dst = rcu_dereference_protected(sk->sk_rx_dst,
1687 lockdep_sock_is_held(sk));
1688
1689 sock_rps_save_rxhash(sk, skb);
1690 sk_mark_napi_id(sk, skb);
1691 if (dst) {
1692 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1693 !dst->ops->check(dst, 0)) {
1694 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1695 dst_release(dst);
1696 }
1697 }
1698 tcp_rcv_established(sk, skb);
1699 return 0;
1700 }
1701
1702 if (tcp_checksum_complete(skb))
1703 goto csum_err;
1704
1705 if (sk->sk_state == TCP_LISTEN) {
1706 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1707
1708 if (!nsk)
1709 goto discard;
1710 if (nsk != sk) {
1711 if (tcp_child_process(sk, nsk, skb)) {
1712 rsk = nsk;
1713 goto reset;
1714 }
1715 return 0;
1716 }
1717 } else
1718 sock_rps_save_rxhash(sk, skb);
1719
1720 if (tcp_rcv_state_process(sk, skb)) {
1721 rsk = sk;
1722 goto reset;
1723 }
1724 return 0;
1725
1726 reset:
1727 tcp_v4_send_reset(rsk, skb);
1728 discard:
1729 kfree_skb(skb);
1730 /* Be careful here. If this function gets more complicated and
1731 * gcc suffers from register pressure on the x86, sk (in %ebx)
1732 * might be destroyed here. This current version compiles correctly,
1733 * but you have been warned.
1734 */
1735 return 0;
1736
1737 csum_err:
1738 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1739 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1740 goto discard;
1741 }
1742 EXPORT_SYMBOL(tcp_v4_do_rcv);
1743
tcp_v4_early_demux(struct sk_buff * skb)1744 int tcp_v4_early_demux(struct sk_buff *skb)
1745 {
1746 const struct iphdr *iph;
1747 const struct tcphdr *th;
1748 struct sock *sk;
1749
1750 if (skb->pkt_type != PACKET_HOST)
1751 return 0;
1752
1753 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1754 return 0;
1755
1756 iph = ip_hdr(skb);
1757 th = tcp_hdr(skb);
1758
1759 if (th->doff < sizeof(struct tcphdr) / 4)
1760 return 0;
1761
1762 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1763 iph->saddr, th->source,
1764 iph->daddr, ntohs(th->dest),
1765 skb->skb_iif, inet_sdif(skb));
1766 if (sk) {
1767 skb->sk = sk;
1768 skb->destructor = sock_edemux;
1769 if (sk_fullsock(sk)) {
1770 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1771
1772 if (dst)
1773 dst = dst_check(dst, 0);
1774 if (dst &&
1775 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1776 skb_dst_set_noref(skb, dst);
1777 }
1778 }
1779 return 0;
1780 }
1781
tcp_add_backlog(struct sock * sk,struct sk_buff * skb)1782 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1783 {
1784 u32 limit, tail_gso_size, tail_gso_segs;
1785 struct skb_shared_info *shinfo;
1786 const struct tcphdr *th;
1787 struct tcphdr *thtail;
1788 struct sk_buff *tail;
1789 unsigned int hdrlen;
1790 bool fragstolen;
1791 u32 gso_segs;
1792 u32 gso_size;
1793 int delta;
1794
1795 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1796 * we can fix skb->truesize to its real value to avoid future drops.
1797 * This is valid because skb is not yet charged to the socket.
1798 * It has been noticed pure SACK packets were sometimes dropped
1799 * (if cooked by drivers without copybreak feature).
1800 */
1801 skb_condense(skb);
1802
1803 skb_dst_drop(skb);
1804
1805 if (unlikely(tcp_checksum_complete(skb))) {
1806 bh_unlock_sock(sk);
1807 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1808 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1809 return true;
1810 }
1811
1812 /* Attempt coalescing to last skb in backlog, even if we are
1813 * above the limits.
1814 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1815 */
1816 th = (const struct tcphdr *)skb->data;
1817 hdrlen = th->doff * 4;
1818
1819 tail = sk->sk_backlog.tail;
1820 if (!tail)
1821 goto no_coalesce;
1822 thtail = (struct tcphdr *)tail->data;
1823
1824 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1825 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1826 ((TCP_SKB_CB(tail)->tcp_flags |
1827 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1828 !((TCP_SKB_CB(tail)->tcp_flags &
1829 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1830 ((TCP_SKB_CB(tail)->tcp_flags ^
1831 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1832 #ifdef CONFIG_TLS_DEVICE
1833 tail->decrypted != skb->decrypted ||
1834 #endif
1835 thtail->doff != th->doff ||
1836 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1837 goto no_coalesce;
1838
1839 __skb_pull(skb, hdrlen);
1840
1841 shinfo = skb_shinfo(skb);
1842 gso_size = shinfo->gso_size ?: skb->len;
1843 gso_segs = shinfo->gso_segs ?: 1;
1844
1845 shinfo = skb_shinfo(tail);
1846 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1847 tail_gso_segs = shinfo->gso_segs ?: 1;
1848
1849 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1850 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1851
1852 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1853 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1854 thtail->window = th->window;
1855 }
1856
1857 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1858 * thtail->fin, so that the fast path in tcp_rcv_established()
1859 * is not entered if we append a packet with a FIN.
1860 * SYN, RST, URG are not present.
1861 * ACK is set on both packets.
1862 * PSH : we do not really care in TCP stack,
1863 * at least for 'GRO' packets.
1864 */
1865 thtail->fin |= th->fin;
1866 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1867
1868 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1869 TCP_SKB_CB(tail)->has_rxtstamp = true;
1870 tail->tstamp = skb->tstamp;
1871 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1872 }
1873
1874 /* Not as strict as GRO. We only need to carry mss max value */
1875 shinfo->gso_size = max(gso_size, tail_gso_size);
1876 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1877
1878 sk->sk_backlog.len += delta;
1879 __NET_INC_STATS(sock_net(sk),
1880 LINUX_MIB_TCPBACKLOGCOALESCE);
1881 kfree_skb_partial(skb, fragstolen);
1882 return false;
1883 }
1884 __skb_push(skb, hdrlen);
1885
1886 no_coalesce:
1887 limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1888
1889 /* Only socket owner can try to collapse/prune rx queues
1890 * to reduce memory overhead, so add a little headroom here.
1891 * Few sockets backlog are possibly concurrently non empty.
1892 */
1893 limit += 64 * 1024;
1894
1895 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1896 bh_unlock_sock(sk);
1897 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1898 return true;
1899 }
1900 return false;
1901 }
1902 EXPORT_SYMBOL(tcp_add_backlog);
1903
tcp_filter(struct sock * sk,struct sk_buff * skb)1904 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1905 {
1906 struct tcphdr *th = (struct tcphdr *)skb->data;
1907
1908 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1909 }
1910 EXPORT_SYMBOL(tcp_filter);
1911
tcp_v4_restore_cb(struct sk_buff * skb)1912 static void tcp_v4_restore_cb(struct sk_buff *skb)
1913 {
1914 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1915 sizeof(struct inet_skb_parm));
1916 }
1917
tcp_v4_fill_cb(struct sk_buff * skb,const struct iphdr * iph,const struct tcphdr * th)1918 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1919 const struct tcphdr *th)
1920 {
1921 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1922 * barrier() makes sure compiler wont play fool^Waliasing games.
1923 */
1924 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1925 sizeof(struct inet_skb_parm));
1926 barrier();
1927
1928 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1929 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1930 skb->len - th->doff * 4);
1931 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1932 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1933 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1934 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1935 TCP_SKB_CB(skb)->sacked = 0;
1936 TCP_SKB_CB(skb)->has_rxtstamp =
1937 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1938 }
1939
1940 /*
1941 * From tcp_input.c
1942 */
1943
tcp_v4_rcv(struct sk_buff * skb)1944 int tcp_v4_rcv(struct sk_buff *skb)
1945 {
1946 struct net *net = dev_net(skb->dev);
1947 struct sk_buff *skb_to_free;
1948 int sdif = inet_sdif(skb);
1949 int dif = inet_iif(skb);
1950 const struct iphdr *iph;
1951 const struct tcphdr *th;
1952 bool refcounted;
1953 struct sock *sk;
1954 int ret;
1955
1956 if (skb->pkt_type != PACKET_HOST)
1957 goto discard_it;
1958
1959 /* Count it even if it's bad */
1960 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1961
1962 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1963 goto discard_it;
1964
1965 th = (const struct tcphdr *)skb->data;
1966
1967 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1968 goto bad_packet;
1969 if (!pskb_may_pull(skb, th->doff * 4))
1970 goto discard_it;
1971
1972 /* An explanation is required here, I think.
1973 * Packet length and doff are validated by header prediction,
1974 * provided case of th->doff==0 is eliminated.
1975 * So, we defer the checks. */
1976
1977 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1978 goto csum_error;
1979
1980 th = (const struct tcphdr *)skb->data;
1981 iph = ip_hdr(skb);
1982 lookup:
1983 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1984 th->dest, sdif, &refcounted);
1985 if (!sk)
1986 goto no_tcp_socket;
1987
1988 process:
1989 if (sk->sk_state == TCP_TIME_WAIT)
1990 goto do_time_wait;
1991
1992 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1993 struct request_sock *req = inet_reqsk(sk);
1994 bool req_stolen = false;
1995 struct sock *nsk;
1996
1997 sk = req->rsk_listener;
1998 if (unlikely(!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb) ||
1999 tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
2000 sk_drops_add(sk, skb);
2001 reqsk_put(req);
2002 goto discard_it;
2003 }
2004 if (tcp_checksum_complete(skb)) {
2005 reqsk_put(req);
2006 goto csum_error;
2007 }
2008 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2009 inet_csk_reqsk_queue_drop_and_put(sk, req);
2010 goto lookup;
2011 }
2012 /* We own a reference on the listener, increase it again
2013 * as we might lose it too soon.
2014 */
2015 sock_hold(sk);
2016 refcounted = true;
2017 nsk = NULL;
2018 if (!tcp_filter(sk, skb)) {
2019 th = (const struct tcphdr *)skb->data;
2020 iph = ip_hdr(skb);
2021 tcp_v4_fill_cb(skb, iph, th);
2022 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2023 }
2024 if (!nsk) {
2025 reqsk_put(req);
2026 if (req_stolen) {
2027 /* Another cpu got exclusive access to req
2028 * and created a full blown socket.
2029 * Try to feed this packet to this socket
2030 * instead of discarding it.
2031 */
2032 tcp_v4_restore_cb(skb);
2033 sock_put(sk);
2034 goto lookup;
2035 }
2036 goto discard_and_relse;
2037 }
2038 nf_reset_ct(skb);
2039 if (nsk == sk) {
2040 reqsk_put(req);
2041 tcp_v4_restore_cb(skb);
2042 } else if (tcp_child_process(sk, nsk, skb)) {
2043 tcp_v4_send_reset(nsk, skb);
2044 goto discard_and_relse;
2045 } else {
2046 sock_put(sk);
2047 return 0;
2048 }
2049 }
2050 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2051 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2052 goto discard_and_relse;
2053 }
2054
2055 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2056 goto discard_and_relse;
2057
2058 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2059 goto discard_and_relse;
2060
2061 nf_reset_ct(skb);
2062
2063 if (tcp_filter(sk, skb))
2064 goto discard_and_relse;
2065 th = (const struct tcphdr *)skb->data;
2066 iph = ip_hdr(skb);
2067 tcp_v4_fill_cb(skb, iph, th);
2068
2069 skb->dev = NULL;
2070
2071 if (sk->sk_state == TCP_LISTEN) {
2072 ret = tcp_v4_do_rcv(sk, skb);
2073 goto put_and_return;
2074 }
2075
2076 sk_incoming_cpu_update(sk);
2077
2078 bh_lock_sock_nested(sk);
2079 tcp_segs_in(tcp_sk(sk), skb);
2080 ret = 0;
2081 if (!sock_owned_by_user(sk)) {
2082 skb_to_free = sk->sk_rx_skb_cache;
2083 sk->sk_rx_skb_cache = NULL;
2084 ret = tcp_v4_do_rcv(sk, skb);
2085 } else {
2086 if (tcp_add_backlog(sk, skb))
2087 goto discard_and_relse;
2088 skb_to_free = NULL;
2089 }
2090 bh_unlock_sock(sk);
2091 if (skb_to_free)
2092 __kfree_skb(skb_to_free);
2093
2094 put_and_return:
2095 if (refcounted)
2096 sock_put(sk);
2097
2098 return ret;
2099
2100 no_tcp_socket:
2101 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2102 goto discard_it;
2103
2104 tcp_v4_fill_cb(skb, iph, th);
2105
2106 if (tcp_checksum_complete(skb)) {
2107 csum_error:
2108 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2109 bad_packet:
2110 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2111 } else {
2112 tcp_v4_send_reset(NULL, skb);
2113 }
2114
2115 discard_it:
2116 /* Discard frame. */
2117 kfree_skb(skb);
2118 return 0;
2119
2120 discard_and_relse:
2121 sk_drops_add(sk, skb);
2122 if (refcounted)
2123 sock_put(sk);
2124 goto discard_it;
2125
2126 do_time_wait:
2127 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2128 inet_twsk_put(inet_twsk(sk));
2129 goto discard_it;
2130 }
2131
2132 tcp_v4_fill_cb(skb, iph, th);
2133
2134 if (tcp_checksum_complete(skb)) {
2135 inet_twsk_put(inet_twsk(sk));
2136 goto csum_error;
2137 }
2138 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2139 case TCP_TW_SYN: {
2140 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2141 &tcp_hashinfo, skb,
2142 __tcp_hdrlen(th),
2143 iph->saddr, th->source,
2144 iph->daddr, th->dest,
2145 inet_iif(skb),
2146 sdif);
2147 if (sk2) {
2148 inet_twsk_deschedule_put(inet_twsk(sk));
2149 sk = sk2;
2150 tcp_v4_restore_cb(skb);
2151 refcounted = false;
2152 goto process;
2153 }
2154 }
2155 /* to ACK */
2156 fallthrough;
2157 case TCP_TW_ACK:
2158 tcp_v4_timewait_ack(sk, skb);
2159 break;
2160 case TCP_TW_RST:
2161 tcp_v4_send_reset(sk, skb);
2162 inet_twsk_deschedule_put(inet_twsk(sk));
2163 goto discard_it;
2164 case TCP_TW_SUCCESS:;
2165 }
2166 goto discard_it;
2167 }
2168
2169 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2170 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2171 .twsk_unique = tcp_twsk_unique,
2172 .twsk_destructor= tcp_twsk_destructor,
2173 };
2174
inet_sk_rx_dst_set(struct sock * sk,const struct sk_buff * skb)2175 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2176 {
2177 struct dst_entry *dst = skb_dst(skb);
2178
2179 if (dst && dst_hold_safe(dst)) {
2180 rcu_assign_pointer(sk->sk_rx_dst, dst);
2181 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2182 }
2183 }
2184 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2185
2186 const struct inet_connection_sock_af_ops ipv4_specific = {
2187 .queue_xmit = ip_queue_xmit,
2188 .send_check = tcp_v4_send_check,
2189 .rebuild_header = inet_sk_rebuild_header,
2190 .sk_rx_dst_set = inet_sk_rx_dst_set,
2191 .conn_request = tcp_v4_conn_request,
2192 .syn_recv_sock = tcp_v4_syn_recv_sock,
2193 .net_header_len = sizeof(struct iphdr),
2194 .setsockopt = ip_setsockopt,
2195 .getsockopt = ip_getsockopt,
2196 .addr2sockaddr = inet_csk_addr2sockaddr,
2197 .sockaddr_len = sizeof(struct sockaddr_in),
2198 .mtu_reduced = tcp_v4_mtu_reduced,
2199 };
2200 EXPORT_SYMBOL(ipv4_specific);
2201
#ifdef CONFIG_TCP_MD5SIG
/* MD5 signature callbacks installed on IPv4 TCP sockets. */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_parse	= tcp_v4_parse_md5_keys,
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
};
#endif
2209
2210 /* NOTE: A lot of things set to zero explicitly by call to
2211 * sk_alloc() so need not be done here.
2212 */
tcp_v4_init_sock(struct sock * sk)2213 static int tcp_v4_init_sock(struct sock *sk)
2214 {
2215 struct inet_connection_sock *icsk = inet_csk(sk);
2216
2217 tcp_init_sock(sk);
2218
2219 icsk->icsk_af_ops = &ipv4_specific;
2220
2221 #ifdef CONFIG_TCP_MD5SIG
2222 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2223 #endif
2224
2225 return 0;
2226 }
2227
tcp_v4_destroy_sock(struct sock * sk)2228 void tcp_v4_destroy_sock(struct sock *sk)
2229 {
2230 struct tcp_sock *tp = tcp_sk(sk);
2231
2232 trace_tcp_destroy_sock(sk);
2233
2234 tcp_clear_xmit_timers(sk);
2235
2236 tcp_cleanup_congestion_control(sk);
2237
2238 tcp_cleanup_ulp(sk);
2239
2240 /* Cleanup up the write buffer. */
2241 tcp_write_queue_purge(sk);
2242
2243 /* Check if we want to disable active TFO */
2244 tcp_fastopen_active_disable_ofo_check(sk);
2245
2246 /* Cleans up our, hopefully empty, out_of_order_queue. */
2247 skb_rbtree_purge(&tp->out_of_order_queue);
2248
2249 #ifdef CONFIG_TCP_MD5SIG
2250 /* Clean up the MD5 key list, if any */
2251 if (tp->md5sig_info) {
2252 tcp_clear_md5_list(sk);
2253 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2254 tp->md5sig_info = NULL;
2255 }
2256 #endif
2257
2258 /* Clean up a referenced TCP bind bucket. */
2259 if (inet_csk(sk)->icsk_bind_hash)
2260 inet_put_port(sk);
2261
2262 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2263
2264 /* If socket is aborted during connect operation */
2265 tcp_free_fastopen_req(tp);
2266 tcp_fastopen_destroy_cipher(sk);
2267 tcp_saved_syn_free(tp);
2268
2269 sk_sockets_allocated_dec(sk);
2270 }
2271 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2272
2273 #ifdef CONFIG_PROC_FS
2274 /* Proc filesystem TCP sock list dumping. */
2275
2276 /*
2277 * Get next listener socket follow cur. If cur is NULL, get first socket
2278 * starting from bucket given in st->bucket; when st->bucket is zero the
2279 * very first socket in the hash table is returned.
2280 */
listening_get_next(struct seq_file * seq,void * cur)2281 static void *listening_get_next(struct seq_file *seq, void *cur)
2282 {
2283 struct tcp_seq_afinfo *afinfo;
2284 struct tcp_iter_state *st = seq->private;
2285 struct net *net = seq_file_net(seq);
2286 struct inet_listen_hashbucket *ilb;
2287 struct hlist_nulls_node *node;
2288 struct sock *sk = cur;
2289
2290 if (st->bpf_seq_afinfo)
2291 afinfo = st->bpf_seq_afinfo;
2292 else
2293 afinfo = PDE_DATA(file_inode(seq->file));
2294
2295 if (!sk) {
2296 get_head:
2297 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2298 spin_lock(&ilb->lock);
2299 sk = sk_nulls_head(&ilb->nulls_head);
2300 st->offset = 0;
2301 goto get_sk;
2302 }
2303 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2304 ++st->num;
2305 ++st->offset;
2306
2307 sk = sk_nulls_next(sk);
2308 get_sk:
2309 sk_nulls_for_each_from(sk, node) {
2310 if (!net_eq(sock_net(sk), net))
2311 continue;
2312 if (afinfo->family == AF_UNSPEC ||
2313 sk->sk_family == afinfo->family)
2314 return sk;
2315 }
2316 spin_unlock(&ilb->lock);
2317 st->offset = 0;
2318 if (++st->bucket < INET_LHTABLE_SIZE)
2319 goto get_head;
2320 return NULL;
2321 }
2322
/*
 * Walk the listening hash from its first bucket and step forward while
 * *@pos is non-zero, decrementing *@pos once per step.  Returns the
 * socket reached, or NULL if the listeners run out first; in that case
 * *@pos holds the number of steps still to be taken.
 */
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *sk;

	st->bucket = 0;
	st->offset = 0;

	for (sk = listening_get_next(seq, NULL); sk && *pos; --*pos)
		sk = listening_get_next(seq, sk);

	return sk;
}
2338
empty_bucket(const struct tcp_iter_state * st)2339 static inline bool empty_bucket(const struct tcp_iter_state *st)
2340 {
2341 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2342 }
2343
2344 /*
2345 * Get first established socket starting from bucket given in st->bucket.
2346 * If st->bucket is zero, the very first socket in the hash is returned.
2347 */
established_get_first(struct seq_file * seq)2348 static void *established_get_first(struct seq_file *seq)
2349 {
2350 struct tcp_seq_afinfo *afinfo;
2351 struct tcp_iter_state *st = seq->private;
2352 struct net *net = seq_file_net(seq);
2353 void *rc = NULL;
2354
2355 if (st->bpf_seq_afinfo)
2356 afinfo = st->bpf_seq_afinfo;
2357 else
2358 afinfo = PDE_DATA(file_inode(seq->file));
2359
2360 st->offset = 0;
2361 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2362 struct sock *sk;
2363 struct hlist_nulls_node *node;
2364 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2365
2366 /* Lockless fast path for the common case of empty buckets */
2367 if (empty_bucket(st))
2368 continue;
2369
2370 spin_lock_bh(lock);
2371 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2372 if ((afinfo->family != AF_UNSPEC &&
2373 sk->sk_family != afinfo->family) ||
2374 !net_eq(sock_net(sk), net)) {
2375 continue;
2376 }
2377 rc = sk;
2378 goto out;
2379 }
2380 spin_unlock_bh(lock);
2381 }
2382 out:
2383 return rc;
2384 }
2385
established_get_next(struct seq_file * seq,void * cur)2386 static void *established_get_next(struct seq_file *seq, void *cur)
2387 {
2388 struct tcp_seq_afinfo *afinfo;
2389 struct sock *sk = cur;
2390 struct hlist_nulls_node *node;
2391 struct tcp_iter_state *st = seq->private;
2392 struct net *net = seq_file_net(seq);
2393
2394 if (st->bpf_seq_afinfo)
2395 afinfo = st->bpf_seq_afinfo;
2396 else
2397 afinfo = PDE_DATA(file_inode(seq->file));
2398
2399 ++st->num;
2400 ++st->offset;
2401
2402 sk = sk_nulls_next(sk);
2403
2404 sk_nulls_for_each_from(sk, node) {
2405 if ((afinfo->family == AF_UNSPEC ||
2406 sk->sk_family == afinfo->family) &&
2407 net_eq(sock_net(sk), net))
2408 return sk;
2409 }
2410
2411 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2412 ++st->bucket;
2413 return established_get_first(seq);
2414 }
2415
/*
 * Return the established-hash socket found after @pos steps from the
 * first matching one, or NULL if the hash is exhausted before that.
 */
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *sk;

	st->bucket = 0;

	for (sk = established_get_first(seq); sk && pos; --pos)
		sk = established_get_next(seq, sk);

	return sk;
}
2430
/*
 * Locate the socket at index @pos, counting listening sockets first and
 * established-hash sockets after them.  st->state is set to the table
 * in which the lookup ended.
 */
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *sk;

	st->state = TCP_SEQ_STATE_LISTENING;
	sk = listening_get_idx(seq, &pos);
	if (sk)
		return sk;

	/* pos now holds what is left after walking all listeners. */
	st->state = TCP_SEQ_STATE_ESTABLISHED;
	return established_get_idx(seq, pos);
}
2446
tcp_seek_last_pos(struct seq_file * seq)2447 static void *tcp_seek_last_pos(struct seq_file *seq)
2448 {
2449 struct tcp_iter_state *st = seq->private;
2450 int bucket = st->bucket;
2451 int offset = st->offset;
2452 int orig_num = st->num;
2453 void *rc = NULL;
2454
2455 switch (st->state) {
2456 case TCP_SEQ_STATE_LISTENING:
2457 if (st->bucket >= INET_LHTABLE_SIZE)
2458 break;
2459 st->state = TCP_SEQ_STATE_LISTENING;
2460 rc = listening_get_next(seq, NULL);
2461 while (offset-- && rc && bucket == st->bucket)
2462 rc = listening_get_next(seq, rc);
2463 if (rc)
2464 break;
2465 st->bucket = 0;
2466 st->state = TCP_SEQ_STATE_ESTABLISHED;
2467 fallthrough;
2468 case TCP_SEQ_STATE_ESTABLISHED:
2469 if (st->bucket > tcp_hashinfo.ehash_mask)
2470 break;
2471 rc = established_get_first(seq);
2472 while (offset-- && rc && bucket == st->bucket)
2473 rc = established_get_next(seq, rc);
2474 }
2475
2476 st->num = orig_num;
2477
2478 return rc;
2479 }
2480
/*
 * seq_file ->start callback.  When *@pos is non-zero and equals the
 * position recorded by the previous call, resuming via
 * tcp_seek_last_pos() is attempted first.  Otherwise, or if that fails,
 * the iterator state is reset and the walk starts over: position 0
 * yields SEQ_START_TOKEN, any other position the socket at index
 * *@pos - 1.  The position is recorded in st->last_pos.
 */
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (*pos && *pos == st->last_pos)
		rc = tcp_seek_last_pos(seq);

	if (!rc) {
		st->state = TCP_SEQ_STATE_LISTENING;
		st->num = 0;
		st->bucket = 0;
		st->offset = 0;

		if (*pos)
			rc = tcp_get_idx(seq, *pos - 1);
		else
			rc = SEQ_START_TOKEN;
	}

	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);
2503
/*
 * seq_file ->next callback: return the entry following @v, or NULL at
 * the end.  After the start token comes the socket at index 0; once the
 * listening table is exhausted the walk switches to the established
 * table.  *@pos is incremented and recorded in st->last_pos.
 */
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
	} else if (st->state == TCP_SEQ_STATE_LISTENING) {
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
	} else if (st->state == TCP_SEQ_STATE_ESTABLISHED) {
		rc = established_get_next(seq, v);
	}

	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);
2534
tcp_seq_stop(struct seq_file * seq,void * v)2535 void tcp_seq_stop(struct seq_file *seq, void *v)
2536 {
2537 struct tcp_iter_state *st = seq->private;
2538
2539 switch (st->state) {
2540 case TCP_SEQ_STATE_LISTENING:
2541 if (v != SEQ_START_TOKEN)
2542 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2543 break;
2544 case TCP_SEQ_STATE_ESTABLISHED:
2545 if (v)
2546 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2547 break;
2548 }
2549 }
2550 EXPORT_SYMBOL(tcp_seq_stop);
2551
get_openreq4(const struct request_sock * req,struct seq_file * f,int i)2552 static void get_openreq4(const struct request_sock *req,
2553 struct seq_file *f, int i)
2554 {
2555 const struct inet_request_sock *ireq = inet_rsk(req);
2556 long delta = req->rsk_timer.expires - jiffies;
2557
2558 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2559 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2560 i,
2561 ireq->ir_loc_addr,
2562 ireq->ir_num,
2563 ireq->ir_rmt_addr,
2564 ntohs(ireq->ir_rmt_port),
2565 TCP_SYN_RECV,
2566 0, 0, /* could print option size, but that is af dependent. */
2567 1, /* timers active (only the expire timer) */
2568 jiffies_delta_to_clock_t(delta),
2569 req->num_timeout,
2570 from_kuid_munged(seq_user_ns(f),
2571 sock_i_uid(req->rsk_listener)),
2572 0, /* non standard timer */
2573 0, /* open_requests have no inode */
2574 0,
2575 req);
2576 }
2577
get_tcp4_sock(struct sock * sk,struct seq_file * f,int i)2578 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2579 {
2580 int timer_active;
2581 unsigned long timer_expires;
2582 const struct tcp_sock *tp = tcp_sk(sk);
2583 const struct inet_connection_sock *icsk = inet_csk(sk);
2584 const struct inet_sock *inet = inet_sk(sk);
2585 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2586 __be32 dest = inet->inet_daddr;
2587 __be32 src = inet->inet_rcv_saddr;
2588 __u16 destp = ntohs(inet->inet_dport);
2589 __u16 srcp = ntohs(inet->inet_sport);
2590 int rx_queue;
2591 int state;
2592
2593 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2594 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2595 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2596 timer_active = 1;
2597 timer_expires = icsk->icsk_timeout;
2598 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2599 timer_active = 4;
2600 timer_expires = icsk->icsk_timeout;
2601 } else if (timer_pending(&sk->sk_timer)) {
2602 timer_active = 2;
2603 timer_expires = sk->sk_timer.expires;
2604 } else {
2605 timer_active = 0;
2606 timer_expires = jiffies;
2607 }
2608
2609 state = inet_sk_state_load(sk);
2610 if (state == TCP_LISTEN)
2611 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2612 else
2613 /* Because we don't lock the socket,
2614 * we might find a transient negative value.
2615 */
2616 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2617 READ_ONCE(tp->copied_seq), 0);
2618
2619 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2620 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2621 i, src, srcp, dest, destp, state,
2622 READ_ONCE(tp->write_seq) - tp->snd_una,
2623 rx_queue,
2624 timer_active,
2625 jiffies_delta_to_clock_t(timer_expires - jiffies),
2626 icsk->icsk_retransmits,
2627 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2628 icsk->icsk_probes_out,
2629 sock_i_ino(sk),
2630 refcount_read(&sk->sk_refcnt), sk,
2631 jiffies_to_clock_t(icsk->icsk_rto),
2632 jiffies_to_clock_t(icsk->icsk_ack.ato),
2633 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2634 tp->snd_cwnd,
2635 state == TCP_LISTEN ?
2636 fastopenq->max_qlen :
2637 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2638 }
2639
get_timewait4_sock(const struct inet_timewait_sock * tw,struct seq_file * f,int i)2640 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2641 struct seq_file *f, int i)
2642 {
2643 long delta = tw->tw_timer.expires - jiffies;
2644 __be32 dest, src;
2645 __u16 destp, srcp;
2646
2647 dest = tw->tw_daddr;
2648 src = tw->tw_rcv_saddr;
2649 destp = ntohs(tw->tw_dport);
2650 srcp = ntohs(tw->tw_sport);
2651
2652 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2653 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2654 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2655 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2656 refcount_read(&tw->tw_refcnt), tw);
2657 }
2658
2659 #define TMPSZ 150
2660
tcp4_seq_show(struct seq_file * seq,void * v)2661 static int tcp4_seq_show(struct seq_file *seq, void *v)
2662 {
2663 struct tcp_iter_state *st;
2664 struct sock *sk = v;
2665
2666 seq_setwidth(seq, TMPSZ - 1);
2667 if (v == SEQ_START_TOKEN) {
2668 seq_puts(seq, " sl local_address rem_address st tx_queue "
2669 "rx_queue tr tm->when retrnsmt uid timeout "
2670 "inode");
2671 goto out;
2672 }
2673 st = seq->private;
2674
2675 if (sk->sk_state == TCP_TIME_WAIT)
2676 get_timewait4_sock(v, seq, st->num);
2677 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2678 get_openreq4(v, seq, st->num);
2679 else
2680 get_tcp4_sock(v, seq, st->num);
2681 out:
2682 seq_pad(seq, '\n');
2683 return 0;
2684 }
2685
2686 #ifdef CONFIG_BPF_SYSCALL
/*
 * Context handed to the BPF program by tcp_prog_seq_show(): the iterator
 * meta data, the socket being visited and the uid computed for it.
 */
struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};
2692
/*
 * Fill a struct bpf_iter__tcp from the arguments and run @prog on it.
 * meta->seq_num is decremented first so that the start token is not
 * counted.  Returns the result of bpf_iter_run_prog().
 */
static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			     struct sock_common *sk_common, uid_t uid)
{
	struct bpf_iter__tcp ctx;

	ctx.uid = uid;
	ctx.sk_common = sk_common;
	ctx.meta = meta;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */

	return bpf_iter_run_prog(prog, &ctx);
}
2704
bpf_iter_tcp_seq_show(struct seq_file * seq,void * v)2705 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2706 {
2707 struct bpf_iter_meta meta;
2708 struct bpf_prog *prog;
2709 struct sock *sk = v;
2710 uid_t uid;
2711
2712 if (v == SEQ_START_TOKEN)
2713 return 0;
2714
2715 if (sk->sk_state == TCP_TIME_WAIT) {
2716 uid = 0;
2717 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2718 const struct request_sock *req = v;
2719
2720 uid = from_kuid_munged(seq_user_ns(seq),
2721 sock_i_uid(req->rsk_listener));
2722 } else {
2723 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2724 }
2725
2726 meta.seq = seq;
2727 prog = bpf_iter_get_info(&meta, false);
2728 return tcp_prog_seq_show(prog, &meta, v, uid);
2729 }
2730
bpf_iter_tcp_seq_stop(struct seq_file * seq,void * v)2731 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2732 {
2733 struct bpf_iter_meta meta;
2734 struct bpf_prog *prog;
2735
2736 if (!v) {
2737 meta.seq = seq;
2738 prog = bpf_iter_get_info(&meta, true);
2739 if (prog)
2740 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2741 }
2742
2743 tcp_seq_stop(seq, v);
2744 }
2745
2746 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2747 .show = bpf_iter_tcp_seq_show,
2748 .start = tcp_seq_start,
2749 .next = tcp_seq_next,
2750 .stop = bpf_iter_tcp_seq_stop,
2751 };
2752 #endif
2753
2754 static const struct seq_operations tcp4_seq_ops = {
2755 .show = tcp4_seq_show,
2756 .start = tcp_seq_start,
2757 .next = tcp_seq_next,
2758 .stop = tcp_seq_stop,
2759 };
2760
/*
 * Address-family filter for the IPv4 "tcp" proc file; passed as the
 * entry's data pointer by tcp4_proc_init_net().
 */
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
2764
tcp4_proc_init_net(struct net * net)2765 static int __net_init tcp4_proc_init_net(struct net *net)
2766 {
2767 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2768 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2769 return -ENOMEM;
2770 return 0;
2771 }
2772
/* Per-netns exit: remove the "tcp" entry from net->proc_net. */
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}
2777
2778 static struct pernet_operations tcp4_net_ops = {
2779 .init = tcp4_proc_init_net,
2780 .exit = tcp4_proc_exit_net,
2781 };
2782
/*
 * Register tcp4_net_ops as a pernet subsystem; returns the result of
 * register_pernet_subsys().
 */
int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}
2787
/* Unregister the pernet subsystem registered by tcp4_proc_init(). */
void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
2792 #endif /* CONFIG_PROC_FS */
2793
2794 struct proto tcp_prot = {
2795 .name = "TCP",
2796 .owner = THIS_MODULE,
2797 .close = tcp_close,
2798 .pre_connect = tcp_v4_pre_connect,
2799 .connect = tcp_v4_connect,
2800 .disconnect = tcp_disconnect,
2801 .accept = inet_csk_accept,
2802 .ioctl = tcp_ioctl,
2803 .init = tcp_v4_init_sock,
2804 .destroy = tcp_v4_destroy_sock,
2805 .shutdown = tcp_shutdown,
2806 .setsockopt = tcp_setsockopt,
2807 .getsockopt = tcp_getsockopt,
2808 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
2809 .keepalive = tcp_set_keepalive,
2810 .recvmsg = tcp_recvmsg,
2811 .sendmsg = tcp_sendmsg,
2812 .sendpage = tcp_sendpage,
2813 .backlog_rcv = tcp_v4_do_rcv,
2814 .release_cb = tcp_release_cb,
2815 .hash = inet_hash,
2816 .unhash = inet_unhash,
2817 .get_port = inet_csk_get_port,
2818 .enter_memory_pressure = tcp_enter_memory_pressure,
2819 .leave_memory_pressure = tcp_leave_memory_pressure,
2820 .stream_memory_free = tcp_stream_memory_free,
2821 .sockets_allocated = &tcp_sockets_allocated,
2822 .orphan_count = &tcp_orphan_count,
2823 .memory_allocated = &tcp_memory_allocated,
2824 .memory_pressure = &tcp_memory_pressure,
2825 .sysctl_mem = sysctl_tcp_mem,
2826 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2827 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2828 .max_header = MAX_TCP_HEADER,
2829 .obj_size = sizeof(struct tcp_sock),
2830 .slab_flags = SLAB_TYPESAFE_BY_RCU,
2831 .twsk_prot = &tcp_timewait_sock_ops,
2832 .rsk_prot = &tcp_request_sock_ops,
2833 .h.hashinfo = &tcp_hashinfo,
2834 .no_autobind = true,
2835 .diag_destroy = tcp_abort,
2836 };
2837 EXPORT_SYMBOL(tcp_prot);
2838
tcp_sk_exit(struct net * net)2839 static void __net_exit tcp_sk_exit(struct net *net)
2840 {
2841 if (net->ipv4.tcp_congestion_control)
2842 bpf_module_put(net->ipv4.tcp_congestion_control,
2843 net->ipv4.tcp_congestion_control->owner);
2844 }
2845
tcp_sk_init(struct net * net)2846 static int __net_init tcp_sk_init(struct net *net)
2847 {
2848 int cnt;
2849
2850 net->ipv4.sysctl_tcp_ecn = 2;
2851 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2852
2853 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2854 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2855 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2856 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2857 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2858
2859 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2860 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2861 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2862
2863 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2864 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2865 net->ipv4.sysctl_tcp_syncookies = 1;
2866 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2867 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2868 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2869 net->ipv4.sysctl_tcp_orphan_retries = 0;
2870 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2871 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2872 net->ipv4.sysctl_tcp_tw_reuse = 2;
2873 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2874
2875 cnt = tcp_hashinfo.ehash_mask + 1;
2876 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2877 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2878
2879 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2880 net->ipv4.sysctl_tcp_sack = 1;
2881 net->ipv4.sysctl_tcp_window_scaling = 1;
2882 net->ipv4.sysctl_tcp_timestamps = 1;
2883 net->ipv4.sysctl_tcp_early_retrans = 3;
2884 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2885 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
2886 net->ipv4.sysctl_tcp_retrans_collapse = 1;
2887 net->ipv4.sysctl_tcp_max_reordering = 300;
2888 net->ipv4.sysctl_tcp_dsack = 1;
2889 net->ipv4.sysctl_tcp_app_win = 31;
2890 net->ipv4.sysctl_tcp_adv_win_scale = 1;
2891 net->ipv4.sysctl_tcp_frto = 2;
2892 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2893 /* This limits the percentage of the congestion window which we
2894 * will allow a single TSO frame to consume. Building TSO frames
2895 * which are too large can cause TCP streams to be bursty.
2896 */
2897 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2898 /* Default TSQ limit of 16 TSO segments */
2899 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2900 /* rfc5961 challenge ack rate limiting */
2901 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2902 net->ipv4.sysctl_tcp_min_tso_segs = 2;
2903 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2904 net->ipv4.sysctl_tcp_autocorking = 1;
2905 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2906 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2907 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2908 if (net != &init_net) {
2909 memcpy(net->ipv4.sysctl_tcp_rmem,
2910 init_net.ipv4.sysctl_tcp_rmem,
2911 sizeof(init_net.ipv4.sysctl_tcp_rmem));
2912 memcpy(net->ipv4.sysctl_tcp_wmem,
2913 init_net.ipv4.sysctl_tcp_wmem,
2914 sizeof(init_net.ipv4.sysctl_tcp_wmem));
2915 }
2916 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2917 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2918 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2919 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2920 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2921 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
2922 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2923
2924 /* Reno is always built in */
2925 if (!net_eq(net, &init_net) &&
2926 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2927 init_net.ipv4.tcp_congestion_control->owner))
2928 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2929 else
2930 net->ipv4.tcp_congestion_control = &tcp_reno;
2931
2932 return 0;
2933 }
2934
tcp_sk_exit_batch(struct list_head * net_exit_list)2935 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2936 {
2937 struct net *net;
2938
2939 inet_twsk_purge(&tcp_hashinfo, AF_INET);
2940
2941 list_for_each_entry(net, net_exit_list, exit_list)
2942 tcp_fastopen_ctx_destroy(net);
2943 }
2944
2945 static struct pernet_operations __net_initdata tcp_sk_ops = {
2946 .init = tcp_sk_init,
2947 .exit = tcp_sk_exit,
2948 .exit_batch = tcp_sk_exit_batch,
2949 };
2950
2951 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/*
 * Arguments mirror the members of struct bpf_iter__tcp.
 * NOTE(review): presumably this declares the "tcp" iterator hook that
 * BPF programs attach to - confirm against the macro definition.
 */
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)
2954
2955 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
2956 {
2957 struct tcp_iter_state *st = priv_data;
2958 struct tcp_seq_afinfo *afinfo;
2959 int ret;
2960
2961 afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
2962 if (!afinfo)
2963 return -ENOMEM;
2964
2965 afinfo->family = AF_UNSPEC;
2966 st->bpf_seq_afinfo = afinfo;
2967 ret = bpf_iter_init_seq_net(priv_data, aux);
2968 if (ret)
2969 kfree(afinfo);
2970 return ret;
2971 }
2972
/*
 * Release the private state of a BPF tcp iterator: free the afinfo
 * stored in the iterator state, then run bpf_iter_fini_seq_net().
 */
static void bpf_iter_fini_tcp(void *priv_data)
{
	struct tcp_iter_state *st = priv_data;

	kfree(st->bpf_seq_afinfo);
	bpf_iter_fini_seq_net(priv_data);
}
2980
2981 static const struct bpf_iter_seq_info tcp_seq_info = {
2982 .seq_ops = &bpf_iter_tcp_seq_ops,
2983 .init_seq_private = bpf_iter_init_tcp,
2984 .fini_seq_private = bpf_iter_fini_tcp,
2985 .seq_priv_size = sizeof(struct tcp_iter_state),
2986 };
2987
/*
 * Registration record for the "tcp" BPF iterator target.  Its single
 * ctx_arg_info entry refers to the sk_common member of struct
 * bpf_iter__tcp; the entry's btf_id is filled in by bpf_iter_register().
 */
static struct bpf_iter_reg tcp_reg_info = {
	.target			= "tcp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &tcp_seq_info,
};
2997
bpf_iter_register(void)2998 static void __init bpf_iter_register(void)
2999 {
3000 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3001 if (bpf_iter_reg_target(&tcp_reg_info))
3002 pr_warn("Warning: could not register bpf iterator tcp\n");
3003 }
3004
3005 #endif
3006
tcp_v4_init(void)3007 void __init tcp_v4_init(void)
3008 {
3009 int cpu, res;
3010
3011 for_each_possible_cpu(cpu) {
3012 struct sock *sk;
3013
3014 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3015 IPPROTO_TCP, &init_net);
3016 if (res)
3017 panic("Failed to create the TCP control socket.\n");
3018 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3019
3020 /* Please enforce IP_DF and IPID==0 for RST and
3021 * ACK sent in SYN-RECV and TIME-WAIT state.
3022 */
3023 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3024
3025 per_cpu(ipv4_tcp_sk, cpu) = sk;
3026 }
3027 if (register_pernet_subsys(&tcp_sk_ops))
3028 panic("Failed to create the TCP control socket.\n");
3029
3030 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3031 bpf_iter_register();
3032 #endif
3033 }
3034