1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19 /*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 *		Vitaly E. Lavrov	:	Transparent proxy revived after a
40 *					year-long coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 #ifdef CONFIG_TCP_NATA_URC
73 #include <net/nata.h>
74 #endif
75
76 #include <linux/inet.h>
77 #include <linux/ipv6.h>
78 #include <linux/stddef.h>
79 #include <linux/proc_fs.h>
80 #include <linux/seq_file.h>
81 #include <linux/inetdevice.h>
82 #include <linux/btf_ids.h>
83
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86
87 #include <trace/events/tcp.h>
88
89 #ifdef CONFIG_TCP_MD5SIG
90 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
91 __be32 daddr, __be32 saddr, const struct tcphdr *th);
92 #endif
93
94 struct inet_hashinfo tcp_hashinfo;
95 EXPORT_SYMBOL(tcp_hashinfo);
96
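/* Per-CPU control sockets; tcp_v4_send_reset() and tcp_v4_send_ack() below
 * use them to emit replies without a full socket context.
 */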
97 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
98
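/* The initial sequence number is derived from the connection 4-tuple via a
 * keyed hash; see secure_tcp_seq() for the details.
 */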
99 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
100 {
101 return secure_tcp_seq(ip_hdr(skb)->daddr,
102 ip_hdr(skb)->saddr,
103 tcp_hdr(skb)->dest,
104 tcp_hdr(skb)->source);
105 }
106
107 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
108 {
109 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
110 }
111
112 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
113 {
114 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
115 const struct inet_timewait_sock *tw = inet_twsk(sktw);
116 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
117 struct tcp_sock *tp = tcp_sk(sk);
118
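/* sysctl_tcp_tw_reuse: 0 = disabled, 1 = reuse when protocol-safe,
 * 2 = reuse only for connections that are loopback on at least one side
 * (verified below).
 */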
119 if (reuse == 2) {
120 /* Still does not detect *everything* that goes through
121 * lo, since we require a loopback src or dst address
122 * or direct binding to 'lo' interface.
123 */
124 bool loopback = false;
125 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
126 loopback = true;
127 #if IS_ENABLED(CONFIG_IPV6)
128 if (tw->tw_family == AF_INET6) {
129 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
130 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
131 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
132 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
133 loopback = true;
134 } else
135 #endif
136 {
137 if (ipv4_is_loopback(tw->tw_daddr) ||
138 ipv4_is_loopback(tw->tw_rcv_saddr))
139 loopback = true;
140 }
141 if (!loopback)
142 reuse = 0;
143 }
144
145 /* With PAWS, it is safe from the viewpoint
146 of data integrity. Even without PAWS it is safe provided sequence
147 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
148
149 Actually, the idea is close to VJ's one, only timestamp cache is
150 held not per host, but per port pair and TW bucket is used as state
151 holder.
152
153 If TW bucket has been already destroyed we fall back to VJ's scheme
154 and use initial timestamp retrieved from peer table.
155 */
156 if (tcptw->tw_ts_recent_stamp &&
157 (!twp || (reuse && time_after32(ktime_get_seconds(),
158 tcptw->tw_ts_recent_stamp)))) {
159 /* inet_twsk_hashdance() sets sk_refcnt after putting twsk
160 * and releasing the bucket lock.
161 */
162 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
163 return 0;
164
165 /* In case of repair and re-using TIME-WAIT sockets we still
166 * want to be sure that it is safe as above but honor the
167 * sequence numbers and time stamps set as part of the repair
168 * process.
169 *
170 * Without this check re-using a TIME-WAIT socket with TCP
171 * repair would accumulate a -1 on the repair assigned
172 * sequence number. The first time it is reused the sequence
173 * is -1, the second time -2, etc. This fixes that issue
174 * without appearing to create any others.
175 */
176 if (likely(!tp->repair)) {
177 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
178
179 if (!seq)
180 seq = 1;
181 WRITE_ONCE(tp->write_seq, seq);
182 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
183 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
184 }
185
186 return 1;
187 }
188
189 return 0;
190 }
191 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
192
193 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
194 int addr_len)
195 {
196 /* This check is replicated from tcp_v4_connect() and intended to
197 * prevent BPF program called below from accessing bytes that are out
198 * of the bound specified by user in addr_len.
199 */
200 if (addr_len < sizeof(struct sockaddr_in))
201 return -EINVAL;
202
203 sock_owned_by_me(sk);
204
205 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
206 }
207
208 /* This will initiate an outgoing connection. */
209 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
210 {
211 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
212 struct inet_sock *inet = inet_sk(sk);
213 struct tcp_sock *tp = tcp_sk(sk);
214 __be16 orig_sport, orig_dport;
215 __be32 daddr, nexthop;
216 struct flowi4 *fl4;
217 struct rtable *rt;
218 int err;
219 struct ip_options_rcu *inet_opt;
220 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
221
222 if (addr_len < sizeof(struct sockaddr_in))
223 return -EINVAL;
224
225 if (usin->sin_family != AF_INET)
226 return -EAFNOSUPPORT;
227
228 nexthop = daddr = usin->sin_addr.s_addr;
229 inet_opt = rcu_dereference_protected(inet->inet_opt,
230 lockdep_sock_is_held(sk));
231 if (inet_opt && inet_opt->opt.srr) {
232 if (!daddr)
233 return -EINVAL;
234 nexthop = inet_opt->opt.faddr;
235 }
236
237 orig_sport = inet->inet_sport;
238 orig_dport = usin->sin_port;
239 fl4 = &inet->cork.fl.u.ip4;
240 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
241 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
242 IPPROTO_TCP,
243 orig_sport, orig_dport, sk);
244 if (IS_ERR(rt)) {
245 err = PTR_ERR(rt);
246 if (err == -ENETUNREACH)
247 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
248 return err;
249 }
250
251 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
252 ip_rt_put(rt);
253 return -ENETUNREACH;
254 }
255
256 if (!inet_opt || !inet_opt->opt.srr)
257 daddr = fl4->daddr;
258
259 if (!inet->inet_saddr)
260 inet->inet_saddr = fl4->saddr;
261 sk_rcv_saddr_set(sk, inet->inet_saddr);
262
263 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
264 /* Reset inherited state */
265 tp->rx_opt.ts_recent = 0;
266 tp->rx_opt.ts_recent_stamp = 0;
267 if (likely(!tp->repair))
268 WRITE_ONCE(tp->write_seq, 0);
269 }
270
271 inet->inet_dport = usin->sin_port;
272 sk_daddr_set(sk, daddr);
273
274 inet_csk(sk)->icsk_ext_hdr_len = 0;
275 if (inet_opt)
276 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
277
278 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
279
280 /* Socket identity is still unknown (sport may be zero).
281 * However, we set the state to SYN-SENT and, without releasing the
282 * socket lock, select a source port, enter ourselves into the hash
283 * tables and complete initialization after this.
284 */
285 #ifdef CONFIG_TCP_NATA_URC
286 tcp_set_nata_push_urc(sk);
287 #endif /* CONFIG_TCP_NATA_URC */
288 tcp_set_state(sk, TCP_SYN_SENT);
289 err = inet_hash_connect(tcp_death_row, sk);
290 if (err)
291 goto failure;
292
293 sk_set_txhash(sk);
294
295 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
296 inet->inet_sport, inet->inet_dport, sk);
297 if (IS_ERR(rt)) {
298 err = PTR_ERR(rt);
299 rt = NULL;
300 goto failure;
301 }
302 /* OK, now commit destination to socket. */
303 sk->sk_gso_type = SKB_GSO_TCPV4;
304 sk_setup_caps(sk, &rt->dst);
305 rt = NULL;
306
307 if (likely(!tp->repair)) {
308 if (!tp->write_seq)
309 WRITE_ONCE(tp->write_seq,
310 secure_tcp_seq(inet->inet_saddr,
311 inet->inet_daddr,
312 inet->inet_sport,
313 usin->sin_port));
314 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
315 inet->inet_saddr,
316 inet->inet_daddr);
317 }
318
319 inet->inet_id = prandom_u32();
320
321 if (tcp_fastopen_defer_connect(sk, &err))
322 return err;
323 if (err)
324 goto failure;
325
326 err = tcp_connect(sk);
327
328 if (err)
329 goto failure;
330
331 return 0;
332
333 failure:
334 /*
335 * This unhashes the socket and releases the local port,
336 * if necessary.
337 */
338 tcp_set_state(sk, TCP_CLOSE);
339 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
340 inet_reset_saddr(sk);
341 ip_rt_put(rt);
342 sk->sk_route_caps = 0;
343 inet->inet_dport = 0;
344 return err;
345 }
346 EXPORT_SYMBOL(tcp_v4_connect);
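
/*
 * Illustrative userspace sketch (not kernel code) of the call that ends up in
 * tcp_v4_connect() via the socket layer; the address and port below are
 * hypothetical:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */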
347
348 /*
349 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
350 * It can be called through tcp_release_cb() if socket was owned by user
351 * at the time tcp_v4_err() was called to handle ICMP message.
352 */
353 void tcp_v4_mtu_reduced(struct sock *sk)
354 {
355 struct inet_sock *inet = inet_sk(sk);
356 struct dst_entry *dst;
357 u32 mtu;
358
359 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
360 return;
361 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
362 dst = inet_csk_update_pmtu(sk, mtu);
363 if (!dst)
364 return;
365
366 /* Something is about to go wrong... Remember the soft error
367 * in case this connection is not able to recover.
368 */
369 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
370 sk->sk_err_soft = EMSGSIZE;
371
372 mtu = dst_mtu(dst);
373
374 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
375 ip_sk_accept_pmtu(sk) &&
376 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
377 tcp_sync_mss(sk, mtu);
378
379 /* Resend the TCP packet because it's
380 * clear that the old packet has been
381 * dropped. This is the new "fast" path mtu
382 * discovery.
383 */
384 tcp_simple_retransmit(sk);
385 } /* else let the usual retransmit timer handle it */
386 }
387 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
388
389 static void do_redirect(struct sk_buff *skb, struct sock *sk)
390 {
391 struct dst_entry *dst = __sk_dst_check(sk, 0);
392
393 if (dst)
394 dst->ops->redirect(dst, sk, skb);
395 }
396
397
398 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
399 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
400 {
401 struct request_sock *req = inet_reqsk(sk);
402 struct net *net = sock_net(sk);
403
404 /* ICMPs are not backlogged, hence we cannot get
405 * an established socket here.
406 */
407 if (seq != tcp_rsk(req)->snt_isn) {
408 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
409 } else if (abort) {
410 /*
411 * Still in SYN_RECV, just remove it silently.
412 * There is no good way to pass the error to the newly
413 * created socket, and POSIX does not want network
414 * errors returned from accept().
415 */
416 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
417 tcp_listendrop(req->rsk_listener);
418 }
419 reqsk_put(req);
420 }
421 EXPORT_SYMBOL(tcp_req_err);
422
423 /* TCP-LD (RFC 6069) logic */
424 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
425 {
426 struct inet_connection_sock *icsk = inet_csk(sk);
427 struct tcp_sock *tp = tcp_sk(sk);
428 struct sk_buff *skb;
429 s32 remaining;
430 u32 delta_us;
431
432 if (sock_owned_by_user(sk))
433 return;
434
435 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
436 !icsk->icsk_backoff)
437 return;
438
439 skb = tcp_rtx_queue_head(sk);
440 if (WARN_ON_ONCE(!skb))
441 return;
442
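/* Undo one backoff step: the reverted RTO is min(base_rto << icsk_backoff,
 * TCP_RTO_MAX), where base_rto is recomputed from srtt when one is available.
 */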
443 icsk->icsk_backoff--;
444 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
445 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
446
447 tcp_mstamp_refresh(tp);
448 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
449 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
450
451 if (remaining > 0) {
452 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
453 remaining, TCP_RTO_MAX);
454 } else {
455 /* RTO revert clocked out retransmission.
456 * Will retransmit now.
457 */
458 tcp_retransmit_timer(sk);
459 }
460 }
461 EXPORT_SYMBOL(tcp_ld_RTO_revert);
462
463 /*
464 * This routine is called by the ICMP module when it gets some
465 * sort of error condition. If err < 0 then the socket should
466 * be closed and the error returned to the user. If err > 0
467 * it's just the icmp type << 8 | icmp code. After adjustment
468 * header points to the first 8 bytes of the tcp header. We need
469 * to find the appropriate port.
470 *
471 * The locking strategy used here is very "optimistic". When
472 * someone else accesses the socket the ICMP is just dropped
473 * and for some paths there is no check at all.
474 * A more general error queue to queue errors for later handling
475 * is probably better.
476 *
477 */
478
479 int tcp_v4_err(struct sk_buff *skb, u32 info)
480 {
481 const struct iphdr *iph = (const struct iphdr *)skb->data;
482 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
483 struct tcp_sock *tp;
484 struct inet_sock *inet;
485 const int type = icmp_hdr(skb)->type;
486 const int code = icmp_hdr(skb)->code;
487 struct sock *sk;
488 struct request_sock *fastopen;
489 u32 seq, snd_una;
490 int err;
491 struct net *net = dev_net(skb->dev);
492
493 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
494 th->dest, iph->saddr, ntohs(th->source),
495 inet_iif(skb), 0);
496 if (!sk) {
497 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
498 return -ENOENT;
499 }
500 if (sk->sk_state == TCP_TIME_WAIT) {
501 inet_twsk_put(inet_twsk(sk));
502 return 0;
503 }
504 seq = ntohl(th->seq);
505 if (sk->sk_state == TCP_NEW_SYN_RECV) {
506 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
507 type == ICMP_TIME_EXCEEDED ||
508 (type == ICMP_DEST_UNREACH &&
509 (code == ICMP_NET_UNREACH ||
510 code == ICMP_HOST_UNREACH)));
511 return 0;
512 }
513
514 bh_lock_sock(sk);
515 /* If too many ICMPs get dropped on busy
516 * servers this needs to be solved differently.
517 * We do take care of PMTU discovery (RFC1191) special case :
518 * we can receive locally generated ICMP messages while socket is held.
519 */
520 if (sock_owned_by_user(sk)) {
521 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
522 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
523 }
524 if (sk->sk_state == TCP_CLOSE)
525 goto out;
526
527 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
528 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
529 goto out;
530 }
531
532 tp = tcp_sk(sk);
533 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
534 fastopen = rcu_dereference(tp->fastopen_rsk);
535 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
536 if (sk->sk_state != TCP_LISTEN &&
537 !between(seq, snd_una, tp->snd_nxt)) {
538 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
539 goto out;
540 }
541
542 switch (type) {
543 case ICMP_REDIRECT:
544 if (!sock_owned_by_user(sk))
545 do_redirect(skb, sk);
546 goto out;
547 case ICMP_SOURCE_QUENCH:
548 /* Just silently ignore these. */
549 goto out;
550 case ICMP_PARAMETERPROB:
551 err = EPROTO;
552 break;
553 case ICMP_DEST_UNREACH:
554 if (code > NR_ICMP_UNREACH)
555 goto out;
556
557 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
558 /* We are not interested in TCP_LISTEN and open_requests
559 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
560 * they should go through unfragmented).
561 */
562 if (sk->sk_state == TCP_LISTEN)
563 goto out;
564
565 WRITE_ONCE(tp->mtu_info, info);
566 if (!sock_owned_by_user(sk)) {
567 tcp_v4_mtu_reduced(sk);
568 } else {
569 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
570 sock_hold(sk);
571 }
572 goto out;
573 }
574
575 err = icmp_err_convert[code].errno;
576 /* check if this ICMP message allows revert of backoff.
577 * (see RFC 6069)
578 */
579 if (!fastopen &&
580 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
581 tcp_ld_RTO_revert(sk, seq);
582 break;
583 case ICMP_TIME_EXCEEDED:
584 err = EHOSTUNREACH;
585 break;
586 default:
587 goto out;
588 }
589
590 switch (sk->sk_state) {
591 case TCP_SYN_SENT:
592 case TCP_SYN_RECV:
593 /* Only in fast or simultaneous open. If a fast open socket is
594 * already accepted it is treated as a connected one below.
595 */
596 if (fastopen && !fastopen->sk)
597 break;
598
599 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
600
601 if (!sock_owned_by_user(sk)) {
602 sk->sk_err = err;
603
604 sk->sk_error_report(sk);
605
606 tcp_done(sk);
607 } else {
608 sk->sk_err_soft = err;
609 }
610 goto out;
611 }
612
613 /* If we've already connected we will keep trying
614 * until we time out, or the user gives up.
615 *
616 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
617 * considered hard errors (well, FRAG_FAILED too,
618 * but it is obsoleted by PMTU discovery).
619 *
620 * Note that in the modern internet, where routing is unreliable
621 * and broken firewalls sit in every dark corner, sending random
622 * errors ordered by their masters, even these two messages finally lose
623 * their original sense (even Linux sends invalid PORT_UNREACHs).
624 *
625 * Now we are in compliance with RFCs.
626 * --ANK (980905)
627 */
628
629 inet = inet_sk(sk);
630 if (!sock_owned_by_user(sk) && inet->recverr) {
631 sk->sk_err = err;
632 sk->sk_error_report(sk);
633 } else { /* Only an error on timeout */
634 sk->sk_err_soft = err;
635 }
636
637 out:
638 bh_unlock_sock(sk);
639 sock_put(sk);
640 return 0;
641 }
642
643 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
644 {
645 struct tcphdr *th = tcp_hdr(skb);
646
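/* Only the pseudo-header sum is seeded here; the device (or the software
 * fallback) folds in the TCP header and payload starting at csum_start and
 * writes the result at csum_offset (CHECKSUM_PARTIAL).
 */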
647 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
648 skb->csum_start = skb_transport_header(skb) - skb->head;
649 skb->csum_offset = offsetof(struct tcphdr, check);
650 }
651
652 /* This routine computes an IPv4 TCP checksum. */
653 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
654 {
655 const struct inet_sock *inet = inet_sk(sk);
656
657 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
658 }
659 EXPORT_SYMBOL(tcp_v4_send_check);
660
661 /*
662 * This routine will send an RST to the other tcp.
663 *
664 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
665 * for the reset?
666 * Answer: if a packet caused an RST, it is not for a socket
667 * existing in our system; if it is matched to a socket,
668 * it is just a duplicate segment or a bug in the other side's TCP.
669 * So we build the reply based only on the parameters
670 * that arrived with the segment.
671 * Exception: precedence violation. We do not implement it in any case.
672 */
673
674 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
675 {
676 const struct tcphdr *th = tcp_hdr(skb);
677 struct {
678 struct tcphdr th;
679 #ifdef CONFIG_TCP_MD5SIG
680 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
681 #endif
682 } rep;
683 struct ip_reply_arg arg;
684 #ifdef CONFIG_TCP_MD5SIG
685 struct tcp_md5sig_key *key = NULL;
686 const __u8 *hash_location = NULL;
687 unsigned char newhash[16];
688 int genhash;
689 struct sock *sk1 = NULL;
690 #endif
691 u64 transmit_time = 0;
692 struct sock *ctl_sk;
693 struct net *net;
694
695 /* Never send a reset in response to a reset. */
696 if (th->rst)
697 return;
698
699 /* If sk is not NULL, it means we did a successful lookup and the incoming
700 * route had to be correct. prequeue might have dropped our dst.
701 */
702 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
703 return;
704
705 /* Swap the send and the receive. */
706 memset(&rep, 0, sizeof(rep));
707 rep.th.dest = th->source;
708 rep.th.source = th->dest;
709 rep.th.doff = sizeof(struct tcphdr) / 4;
710 rep.th.rst = 1;
711
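/* Per RFC 793 reset generation: if the offending segment carried an ACK,
 * reply with SEQ = SEG.ACK and no ACK flag; otherwise reply with SEQ = 0,
 * ACK set and ACK_SEQ = SEG.SEQ + SEG.LEN (SYN/FIN each count as one).
 */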
712 if (th->ack) {
713 rep.th.seq = th->ack_seq;
714 } else {
715 rep.th.ack = 1;
716 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
717 skb->len - (th->doff << 2));
718 }
719
720 memset(&arg, 0, sizeof(arg));
721 arg.iov[0].iov_base = (unsigned char *)&rep;
722 arg.iov[0].iov_len = sizeof(rep.th);
723
724 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
725 #ifdef CONFIG_TCP_MD5SIG
726 rcu_read_lock();
727 hash_location = tcp_parse_md5sig_option(th);
728 if (sk && sk_fullsock(sk)) {
729 const union tcp_md5_addr *addr;
730 int l3index;
731
732 /* sdif set, means packet ingressed via a device
733 * in an L3 domain and inet_iif is set to it.
734 */
735 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
736 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
737 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
738 } else if (hash_location) {
739 const union tcp_md5_addr *addr;
740 int sdif = tcp_v4_sdif(skb);
741 int dif = inet_iif(skb);
742 int l3index;
743
744 /*
745 * The active side is lost. Try to find the listening socket through
746 * the source port, and then find the MD5 key through the listening socket.
747 * We do not lose security here:
748 * the incoming packet is checked against the MD5 hash of the key we find,
749 * and no RST is generated if the MD5 hash doesn't match.
750 */
751 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
752 ip_hdr(skb)->saddr,
753 th->source, ip_hdr(skb)->daddr,
754 ntohs(th->source), dif, sdif);
755 /* don't send rst if it can't find key */
756 if (!sk1)
757 goto out;
758
759 /* sdif set, means packet ingressed via a device
760 * in an L3 domain and dif is set to it.
761 */
762 l3index = sdif ? dif : 0;
763 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
764 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
765 if (!key)
766 goto out;
767
768
769 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
770 if (genhash || memcmp(hash_location, newhash, 16) != 0)
771 goto out;
772
773 }
774
775 if (key) {
776 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
777 (TCPOPT_NOP << 16) |
778 (TCPOPT_MD5SIG << 8) |
779 TCPOLEN_MD5SIG);
780 /* Update length and the length the header thinks exists */
781 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
782 rep.th.doff = arg.iov[0].iov_len / 4;
783
784 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
785 key, ip_hdr(skb)->saddr,
786 ip_hdr(skb)->daddr, &rep.th);
787 }
788 #endif
789 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
790 ip_hdr(skb)->saddr, /* XXX */
791 arg.iov[0].iov_len, IPPROTO_TCP, 0);
792 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
793 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
794
795 /* When the socket is gone, all binding information is lost and
796 * routing might fail in this case. No good choice here: if we force the
797 * input interface, we will misroute in the case of an asymmetric route.
798 */
799 if (sk) {
800 arg.bound_dev_if = sk->sk_bound_dev_if;
801 if (sk_fullsock(sk))
802 trace_tcp_send_reset(sk, skb);
803 }
804
805 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
806 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
807
808 arg.tos = ip_hdr(skb)->tos;
809 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
810 local_bh_disable();
811 ctl_sk = this_cpu_read(ipv4_tcp_sk);
812 sock_net_set(ctl_sk, net);
813 if (sk) {
814 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
815 inet_twsk(sk)->tw_mark : sk->sk_mark;
816 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
817 inet_twsk(sk)->tw_priority : sk->sk_priority;
818 transmit_time = tcp_transmit_time(sk);
819 xfrm_sk_clone_policy(ctl_sk, sk);
820 } else {
821 ctl_sk->sk_mark = 0;
822 ctl_sk->sk_priority = 0;
823 }
824 ip_send_unicast_reply(ctl_sk,
825 skb, &TCP_SKB_CB(skb)->header.h4.opt,
826 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
827 &arg, arg.iov[0].iov_len,
828 transmit_time);
829
830 xfrm_sk_free_policy(ctl_sk);
831 sock_net_set(ctl_sk, &init_net);
832 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
833 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
834 local_bh_enable();
835
836 #ifdef CONFIG_TCP_MD5SIG
837 out:
838 rcu_read_unlock();
839 #endif
840 }
841
842 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
843 outside socket context, is certainly ugly. What can I do?
844 */
845
846 static void tcp_v4_send_ack(const struct sock *sk,
847 struct sk_buff *skb, u32 seq, u32 ack,
848 u32 win, u32 tsval, u32 tsecr, int oif,
849 struct tcp_md5sig_key *key,
850 int reply_flags, u8 tos)
851 {
852 const struct tcphdr *th = tcp_hdr(skb);
853 struct {
854 struct tcphdr th;
855 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
856 #ifdef CONFIG_TCP_MD5SIG
857 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
858 #endif
859 ];
860 } rep;
861 struct net *net = sock_net(sk);
862 struct ip_reply_arg arg;
863 struct sock *ctl_sk;
864 u64 transmit_time;
865
866 memset(&rep.th, 0, sizeof(struct tcphdr));
867 memset(&arg, 0, sizeof(arg));
868
869 arg.iov[0].iov_base = (unsigned char *)&rep;
870 arg.iov[0].iov_len = sizeof(rep.th);
871 if (tsecr) {
872 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
873 (TCPOPT_TIMESTAMP << 8) |
874 TCPOLEN_TIMESTAMP);
875 rep.opt[1] = htonl(tsval);
876 rep.opt[2] = htonl(tsecr);
877 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
878 }
879
880 /* Swap the send and the receive. */
881 rep.th.dest = th->source;
882 rep.th.source = th->dest;
883 rep.th.doff = arg.iov[0].iov_len / 4;
884 rep.th.seq = htonl(seq);
885 rep.th.ack_seq = htonl(ack);
886 rep.th.ack = 1;
887 rep.th.window = htons(win);
888
889 #ifdef CONFIG_TCP_MD5SIG
890 if (key) {
891 int offset = (tsecr) ? 3 : 0;
892
893 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
894 (TCPOPT_NOP << 16) |
895 (TCPOPT_MD5SIG << 8) |
896 TCPOLEN_MD5SIG);
897 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
898 rep.th.doff = arg.iov[0].iov_len/4;
899
900 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
901 key, ip_hdr(skb)->saddr,
902 ip_hdr(skb)->daddr, &rep.th);
903 }
904 #endif
905 arg.flags = reply_flags;
906 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
907 ip_hdr(skb)->saddr, /* XXX */
908 arg.iov[0].iov_len, IPPROTO_TCP, 0);
909 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
910 if (oif)
911 arg.bound_dev_if = oif;
912 arg.tos = tos;
913 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
914 local_bh_disable();
915 ctl_sk = this_cpu_read(ipv4_tcp_sk);
916 sock_net_set(ctl_sk, net);
917 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
918 inet_twsk(sk)->tw_mark : sk->sk_mark;
919 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
920 inet_twsk(sk)->tw_priority : sk->sk_priority;
921 transmit_time = tcp_transmit_time(sk);
922 ip_send_unicast_reply(ctl_sk,
923 skb, &TCP_SKB_CB(skb)->header.h4.opt,
924 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
925 &arg, arg.iov[0].iov_len,
926 transmit_time);
927
928 sock_net_set(ctl_sk, &init_net);
929 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
930 local_bh_enable();
931 }
932
933 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
934 {
935 struct inet_timewait_sock *tw = inet_twsk(sk);
936 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
937
938 tcp_v4_send_ack(sk, skb,
939 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
940 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
941 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
942 tcptw->tw_ts_recent,
943 tw->tw_bound_dev_if,
944 tcp_twsk_md5_key(tcptw),
945 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
946 tw->tw_tos
947 );
948
949 inet_twsk_put(tw);
950 }
951
952 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
953 struct request_sock *req)
954 {
955 const union tcp_md5_addr *addr;
956 int l3index;
957
958 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
959 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
960 */
961 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
962 tcp_sk(sk)->snd_nxt;
963
964 /* RFC 7323 2.3
965 * The window field (SEG.WND) of every outgoing segment, with the
966 * exception of <SYN> segments, MUST be right-shifted by
967 * Rcv.Wind.Shift bits:
968 */
969 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
970 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
971 tcp_v4_send_ack(sk, skb, seq,
972 tcp_rsk(req)->rcv_nxt,
973 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
974 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
975 READ_ONCE(req->ts_recent),
976 0,
977 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
978 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
979 ip_hdr(skb)->tos);
980 }
981
982 /*
983 * Send a SYN-ACK after having received a SYN.
984 * This still operates on a request_sock only, not on a big
985 * socket.
986 */
987 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
988 struct flowi *fl,
989 struct request_sock *req,
990 struct tcp_fastopen_cookie *foc,
991 enum tcp_synack_type synack_type,
992 struct sk_buff *syn_skb)
993 {
994 const struct inet_request_sock *ireq = inet_rsk(req);
995 struct flowi4 fl4;
996 int err = -1;
997 struct sk_buff *skb;
998 u8 tos;
999
1000 /* First, grab a route. */
1001 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1002 return -1;
1003
1004 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1005
1006 if (skb) {
1007 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1008
1009 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1010 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1011 (inet_sk(sk)->tos & INET_ECN_MASK) :
1012 inet_sk(sk)->tos;
1013
1014 if (!INET_ECN_is_capable(tos) &&
1015 tcp_bpf_ca_needs_ecn((struct sock *)req))
1016 tos |= INET_ECN_ECT_0;
1017
1018 rcu_read_lock();
1019 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1020 ireq->ir_rmt_addr,
1021 rcu_dereference(ireq->ireq_opt),
1022 tos);
1023 rcu_read_unlock();
1024 err = net_xmit_eval(err);
1025 }
1026
1027 return err;
1028 }
1029
1030 /*
1031 * IPv4 request_sock destructor.
1032 */
1033 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1034 {
1035 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1036 }
1037
1038 #ifdef CONFIG_TCP_MD5SIG
1039 /*
1040 * RFC2385 MD5 checksumming requires a mapping of
1041 * IP address->MD5 Key.
1042 * We need to maintain these in the sk structure.
1043 */
1044
1045 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1046 EXPORT_SYMBOL(tcp_md5_needed);
1047
1048 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1049 {
1050 if (!old)
1051 return true;
1052
1053 /* l3index always overrides non-l3index */
1054 if (old->l3index && new->l3index == 0)
1055 return false;
1056 if (old->l3index == 0 && new->l3index)
1057 return true;
1058
1059 return old->prefixlen < new->prefixlen;
1060 }
1061
1062 /* Find the best-matching key for an address (L3-bound keys win over unbound ones, then longer prefixes). */
1063 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1064 const union tcp_md5_addr *addr,
1065 int family)
1066 {
1067 const struct tcp_sock *tp = tcp_sk(sk);
1068 struct tcp_md5sig_key *key;
1069 const struct tcp_md5sig_info *md5sig;
1070 __be32 mask;
1071 struct tcp_md5sig_key *best_match = NULL;
1072 bool match;
1073
1074 /* caller either holds rcu_read_lock() or socket lock */
1075 md5sig = rcu_dereference_check(tp->md5sig_info,
1076 lockdep_sock_is_held(sk));
1077 if (!md5sig)
1078 return NULL;
1079
1080 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1081 lockdep_sock_is_held(sk)) {
1082 if (key->family != family)
1083 continue;
1084 if (key->l3index && key->l3index != l3index)
1085 continue;
1086 if (family == AF_INET) {
1087 mask = inet_make_mask(key->prefixlen);
1088 match = (key->addr.a4.s_addr & mask) ==
1089 (addr->a4.s_addr & mask);
1090 #if IS_ENABLED(CONFIG_IPV6)
1091 } else if (family == AF_INET6) {
1092 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1093 key->prefixlen);
1094 #endif
1095 } else {
1096 match = false;
1097 }
1098
1099 if (match && better_md5_match(best_match, key))
1100 best_match = key;
1101 }
1102 return best_match;
1103 }
1104 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1105
1106 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1107 const union tcp_md5_addr *addr,
1108 int family, u8 prefixlen,
1109 int l3index)
1110 {
1111 const struct tcp_sock *tp = tcp_sk(sk);
1112 struct tcp_md5sig_key *key;
1113 unsigned int size = sizeof(struct in_addr);
1114 const struct tcp_md5sig_info *md5sig;
1115
1116 /* caller either holds rcu_read_lock() or socket lock */
1117 md5sig = rcu_dereference_check(tp->md5sig_info,
1118 lockdep_sock_is_held(sk));
1119 if (!md5sig)
1120 return NULL;
1121 #if IS_ENABLED(CONFIG_IPV6)
1122 if (family == AF_INET6)
1123 size = sizeof(struct in6_addr);
1124 #endif
1125 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1126 lockdep_sock_is_held(sk)) {
1127 if (key->family != family)
1128 continue;
1129 if (key->l3index != l3index)
1130 continue;
1131 if (!memcmp(&key->addr, addr, size) &&
1132 key->prefixlen == prefixlen)
1133 return key;
1134 }
1135 return NULL;
1136 }
1137
1138 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1139 const struct sock *addr_sk)
1140 {
1141 const union tcp_md5_addr *addr;
1142 int l3index;
1143
1144 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1145 addr_sk->sk_bound_dev_if);
1146 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1147 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1148 }
1149 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1150
1151 /* This can be called on a newly created socket, from other files */
1152 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1153 int family, u8 prefixlen, int l3index,
1154 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1155 {
1156 /* Add Key to the list */
1157 struct tcp_md5sig_key *key;
1158 struct tcp_sock *tp = tcp_sk(sk);
1159 struct tcp_md5sig_info *md5sig;
1160
1161 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1162 if (key) {
1163 /* Pre-existing entry - just update that one.
1164 * Note that the key might be used concurrently.
1165 * data_race() tells KCSAN that we do not care about
1166 * key mismatches, since changing an MD5 key on live flows
1167 * can lead to packet drops.
1168 */
1169 data_race(memcpy(key->key, newkey, newkeylen));
1170
1171 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1172 * Also note that a reader could catch new key->keylen value
1173 * but old key->key[], this is the reason we use __GFP_ZERO
1174 * at sock_kmalloc() time below these lines.
1175 */
1176 WRITE_ONCE(key->keylen, newkeylen);
1177
1178 return 0;
1179 }
1180
1181 md5sig = rcu_dereference_protected(tp->md5sig_info,
1182 lockdep_sock_is_held(sk));
1183 if (!md5sig) {
1184 md5sig = kmalloc(sizeof(*md5sig), gfp);
1185 if (!md5sig)
1186 return -ENOMEM;
1187
1188 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1189 INIT_HLIST_HEAD(&md5sig->head);
1190 rcu_assign_pointer(tp->md5sig_info, md5sig);
1191 }
1192
1193 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1194 if (!key)
1195 return -ENOMEM;
1196 if (!tcp_alloc_md5sig_pool()) {
1197 sock_kfree_s(sk, key, sizeof(*key));
1198 return -ENOMEM;
1199 }
1200
1201 memcpy(key->key, newkey, newkeylen);
1202 key->keylen = newkeylen;
1203 key->family = family;
1204 key->prefixlen = prefixlen;
1205 key->l3index = l3index;
1206 memcpy(&key->addr, addr,
1207 (family == AF_INET6) ? sizeof(struct in6_addr) :
1208 sizeof(struct in_addr));
1209 hlist_add_head_rcu(&key->node, &md5sig->head);
1210 return 0;
1211 }
1212 EXPORT_SYMBOL(tcp_md5_do_add);
1213
1214 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1215 u8 prefixlen, int l3index)
1216 {
1217 struct tcp_md5sig_key *key;
1218
1219 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1220 if (!key)
1221 return -ENOENT;
1222 hlist_del_rcu(&key->node);
1223 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1224 kfree_rcu(key, rcu);
1225 return 0;
1226 }
1227 EXPORT_SYMBOL(tcp_md5_do_del);
1228
1229 static void tcp_clear_md5_list(struct sock *sk)
1230 {
1231 struct tcp_sock *tp = tcp_sk(sk);
1232 struct tcp_md5sig_key *key;
1233 struct hlist_node *n;
1234 struct tcp_md5sig_info *md5sig;
1235
1236 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1237
1238 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1239 hlist_del_rcu(&key->node);
1240 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1241 kfree_rcu(key, rcu);
1242 }
1243 }
1244
1245 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1246 sockptr_t optval, int optlen)
1247 {
1248 struct tcp_md5sig cmd;
1249 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1250 const union tcp_md5_addr *addr;
1251 u8 prefixlen = 32;
1252 int l3index = 0;
1253
1254 if (optlen < sizeof(cmd))
1255 return -EINVAL;
1256
1257 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1258 return -EFAULT;
1259
1260 if (sin->sin_family != AF_INET)
1261 return -EINVAL;
1262
1263 if (optname == TCP_MD5SIG_EXT &&
1264 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1265 prefixlen = cmd.tcpm_prefixlen;
1266 if (prefixlen > 32)
1267 return -EINVAL;
1268 }
1269
1270 if (optname == TCP_MD5SIG_EXT &&
1271 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1272 struct net_device *dev;
1273
1274 rcu_read_lock();
1275 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1276 if (dev && netif_is_l3_master(dev))
1277 l3index = dev->ifindex;
1278
1279 rcu_read_unlock();
1280
1281 /* ok to reference set/not set outside of rcu;
1282 * right now device MUST be an L3 master
1283 */
1284 if (!dev || !l3index)
1285 return -EINVAL;
1286 }
1287
1288 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1289
1290 if (!cmd.tcpm_keylen)
1291 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1292
1293 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1294 return -EINVAL;
1295
1296 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1297 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1298 }
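
/*
 * Illustrative userspace sketch (not kernel code) of how a key reaches
 * tcp_v4_parse_md5_keys(); the peer address and key below are hypothetical:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *a = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	a->sin_family = AF_INET;
 *	inet_pton(AF_INET, "198.51.100.7", &a->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */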
1299
1300 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1301 __be32 daddr, __be32 saddr,
1302 const struct tcphdr *th, int nbytes)
1303 {
1304 struct tcp4_pseudohdr *bp;
1305 struct scatterlist sg;
1306 struct tcphdr *_th;
1307
1308 bp = hp->scratch;
1309 bp->saddr = saddr;
1310 bp->daddr = daddr;
1311 bp->pad = 0;
1312 bp->protocol = IPPROTO_TCP;
1313 bp->len = cpu_to_be16(nbytes);
1314
1315 _th = (struct tcphdr *)(bp + 1);
1316 memcpy(_th, th, sizeof(*th));
1317 _th->check = 0;
1318
1319 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1320 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1321 sizeof(*bp) + sizeof(*th));
1322 return crypto_ahash_update(hp->md5_req);
1323 }
1324
1325 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1326 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1327 {
1328 struct tcp_md5sig_pool *hp;
1329 struct ahash_request *req;
1330
1331 hp = tcp_get_md5sig_pool();
1332 if (!hp)
1333 goto clear_hash_noput;
1334 req = hp->md5_req;
1335
1336 if (crypto_ahash_init(req))
1337 goto clear_hash;
1338 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1339 goto clear_hash;
1340 if (tcp_md5_hash_key(hp, key))
1341 goto clear_hash;
1342 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1343 if (crypto_ahash_final(req))
1344 goto clear_hash;
1345
1346 tcp_put_md5sig_pool();
1347 return 0;
1348
1349 clear_hash:
1350 tcp_put_md5sig_pool();
1351 clear_hash_noput:
1352 memset(md5_hash, 0, 16);
1353 return 1;
1354 }
1355
1356 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1357 const struct sock *sk,
1358 const struct sk_buff *skb)
1359 {
1360 struct tcp_md5sig_pool *hp;
1361 struct ahash_request *req;
1362 const struct tcphdr *th = tcp_hdr(skb);
1363 __be32 saddr, daddr;
1364
1365 if (sk) { /* valid for establish/request sockets */
1366 saddr = sk->sk_rcv_saddr;
1367 daddr = sk->sk_daddr;
1368 } else {
1369 const struct iphdr *iph = ip_hdr(skb);
1370 saddr = iph->saddr;
1371 daddr = iph->daddr;
1372 }
1373
1374 hp = tcp_get_md5sig_pool();
1375 if (!hp)
1376 goto clear_hash_noput;
1377 req = hp->md5_req;
1378
1379 if (crypto_ahash_init(req))
1380 goto clear_hash;
1381
1382 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1383 goto clear_hash;
1384 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1385 goto clear_hash;
1386 if (tcp_md5_hash_key(hp, key))
1387 goto clear_hash;
1388 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1389 if (crypto_ahash_final(req))
1390 goto clear_hash;
1391
1392 tcp_put_md5sig_pool();
1393 return 0;
1394
1395 clear_hash:
1396 tcp_put_md5sig_pool();
1397 clear_hash_noput:
1398 memset(md5_hash, 0, 16);
1399 return 1;
1400 }
1401 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1402
1403 #endif
1404
1405 /* Called with rcu_read_lock() */
1406 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1407 const struct sk_buff *skb,
1408 int dif, int sdif)
1409 {
1410 #ifdef CONFIG_TCP_MD5SIG
1411 /*
1412 * This gets called for each TCP segment that arrives
1413 * so we want to be efficient.
1414 * We have 3 drop cases:
1415 * o No MD5 hash and one expected.
1416 * o MD5 hash and we're not expecting one.
1417 * o MD5 hash and it's wrong.
1418 */
1419 const __u8 *hash_location = NULL;
1420 struct tcp_md5sig_key *hash_expected;
1421 const struct iphdr *iph = ip_hdr(skb);
1422 const struct tcphdr *th = tcp_hdr(skb);
1423 const union tcp_md5_addr *addr;
1424 unsigned char newhash[16];
1425 int genhash, l3index;
1426
1427 /* sdif set, means packet ingressed via a device
1428 * in an L3 domain and dif is set to the l3mdev
1429 */
1430 l3index = sdif ? dif : 0;
1431
1432 addr = (union tcp_md5_addr *)&iph->saddr;
1433 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1434 hash_location = tcp_parse_md5sig_option(th);
1435
1436 /* We've parsed the options - do we have a hash? */
1437 if (!hash_expected && !hash_location)
1438 return false;
1439
1440 if (hash_expected && !hash_location) {
1441 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1442 return true;
1443 }
1444
1445 if (!hash_expected && hash_location) {
1446 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1447 return true;
1448 }
1449
1450 /* Okay, so this is hash_expected and hash_location -
1451 * so we need to calculate the checksum.
1452 */
1453 genhash = tcp_v4_md5_hash_skb(newhash,
1454 hash_expected,
1455 NULL, skb);
1456
1457 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1458 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1459 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1460 &iph->saddr, ntohs(th->source),
1461 &iph->daddr, ntohs(th->dest),
1462 genhash ? " tcp_v4_calc_md5_hash failed"
1463 : "", l3index);
1464 return true;
1465 }
1466 return false;
1467 #endif
1468 return false;
1469 }
1470
1471 static void tcp_v4_init_req(struct request_sock *req,
1472 const struct sock *sk_listener,
1473 struct sk_buff *skb)
1474 {
1475 struct inet_request_sock *ireq = inet_rsk(req);
1476 struct net *net = sock_net(sk_listener);
1477
1478 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1479 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1480 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1481 }
1482
1483 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1484 struct flowi *fl,
1485 const struct request_sock *req)
1486 {
1487 return inet_csk_route_req(sk, &fl->u.ip4, req);
1488 }
1489
1490 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1491 .family = PF_INET,
1492 .obj_size = sizeof(struct tcp_request_sock),
1493 .rtx_syn_ack = tcp_rtx_synack,
1494 .send_ack = tcp_v4_reqsk_send_ack,
1495 .destructor = tcp_v4_reqsk_destructor,
1496 .send_reset = tcp_v4_send_reset,
1497 .syn_ack_timeout = tcp_syn_ack_timeout,
1498 };
1499
1500 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1501 .mss_clamp = TCP_MSS_DEFAULT,
1502 #ifdef CONFIG_TCP_MD5SIG
1503 .req_md5_lookup = tcp_v4_md5_lookup,
1504 .calc_md5_hash = tcp_v4_md5_hash_skb,
1505 #endif
1506 .init_req = tcp_v4_init_req,
1507 #ifdef CONFIG_SYN_COOKIES
1508 .cookie_init_seq = cookie_v4_init_sequence,
1509 #endif
1510 .route_req = tcp_v4_route_req,
1511 .init_seq = tcp_v4_init_seq,
1512 .init_ts_off = tcp_v4_init_ts_off,
1513 .send_synack = tcp_v4_send_synack,
1514 };
1515
1516 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1517 {
1518 /* Never answer SYNs sent to broadcast or multicast addresses */
1519 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1520 goto drop;
1521
1522 return tcp_conn_request(&tcp_request_sock_ops,
1523 &tcp_request_sock_ipv4_ops, sk, skb);
1524
1525 drop:
1526 tcp_listendrop(sk);
1527 return 0;
1528 }
1529 EXPORT_SYMBOL(tcp_v4_conn_request);
1530
1531
1532 /*
1533 * The three way handshake has completed - we got a valid synack -
1534 * now create the new socket.
1535 */
1536 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1537 struct request_sock *req,
1538 struct dst_entry *dst,
1539 struct request_sock *req_unhash,
1540 bool *own_req)
1541 {
1542 struct inet_request_sock *ireq;
1543 bool found_dup_sk = false;
1544 struct inet_sock *newinet;
1545 struct tcp_sock *newtp;
1546 struct sock *newsk;
1547 #ifdef CONFIG_TCP_MD5SIG
1548 const union tcp_md5_addr *addr;
1549 struct tcp_md5sig_key *key;
1550 int l3index;
1551 #endif
1552 struct ip_options_rcu *inet_opt;
1553
1554 if (sk_acceptq_is_full(sk))
1555 goto exit_overflow;
1556
1557 newsk = tcp_create_openreq_child(sk, req, skb);
1558 if (!newsk)
1559 goto exit_nonewsk;
1560
1561 newsk->sk_gso_type = SKB_GSO_TCPV4;
1562 inet_sk_rx_dst_set(newsk, skb);
1563
1564 newtp = tcp_sk(newsk);
1565 newinet = inet_sk(newsk);
1566 ireq = inet_rsk(req);
1567 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1568 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1569 newsk->sk_bound_dev_if = ireq->ir_iif;
1570 newinet->inet_saddr = ireq->ir_loc_addr;
1571 inet_opt = rcu_dereference(ireq->ireq_opt);
1572 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1573 newinet->mc_index = inet_iif(skb);
1574 newinet->mc_ttl = ip_hdr(skb)->ttl;
1575 newinet->rcv_tos = ip_hdr(skb)->tos;
1576 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1577 if (inet_opt)
1578 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1579 newinet->inet_id = prandom_u32();
1580
1581 /* Set ToS of the new socket based upon the value of incoming SYN.
1582 * ECT bits are set later in tcp_init_transfer().
1583 */
1584 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1585 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1586
1587 if (!dst) {
1588 dst = inet_csk_route_child_sock(sk, newsk, req);
1589 if (!dst)
1590 goto put_and_exit;
1591 } else {
1592 /* syncookie case : see end of cookie_v4_check() */
1593 }
1594 sk_setup_caps(newsk, dst);
1595
1596 tcp_ca_openreq_child(newsk, dst);
1597
1598 tcp_sync_mss(newsk, dst_mtu(dst));
1599 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1600
1601 tcp_initialize_rcv_mss(newsk);
1602
1603 #ifdef CONFIG_TCP_MD5SIG
1604 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1605 /* Copy over the MD5 key from the original socket */
1606 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1607 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1608 if (key) {
1609 /*
1610 * We're using one, so create a matching key
1611 * on the newsk structure. If we fail to get
1612 * memory, then we end up not copying the key
1613 * across. Shucks.
1614 */
1615 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1616 key->key, key->keylen, GFP_ATOMIC);
1617 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1618 }
1619 #endif
1620
1621 if (__inet_inherit_port(sk, newsk) < 0)
1622 goto put_and_exit;
1623 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1624 &found_dup_sk);
1625 if (likely(*own_req)) {
1626 tcp_move_syn(newtp, req);
1627 ireq->ireq_opt = NULL;
1628 } else {
1629 newinet->inet_opt = NULL;
1630
1631 if (!req_unhash && found_dup_sk) {
1632 /* This code path should only be executed in the
1633 * syncookie case
1634 */
1635 bh_unlock_sock(newsk);
1636 sock_put(newsk);
1637 newsk = NULL;
1638 }
1639 }
1640 return newsk;
1641
1642 exit_overflow:
1643 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1644 exit_nonewsk:
1645 dst_release(dst);
1646 exit:
1647 tcp_listendrop(sk);
1648 return NULL;
1649 put_and_exit:
1650 newinet->inet_opt = NULL;
1651 inet_csk_prepare_forced_close(newsk);
1652 tcp_done(newsk);
1653 goto exit;
1654 }
1655 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1656
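/* On a listener whose SYN queue overflowed, a non-SYN segment may carry a SYN
 * cookie; cookie_v4_check() rebuilds the connection state from it when the
 * cookie validates, otherwise the listener sk is returned unchanged.
 */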
1657 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1658 {
1659 #ifdef CONFIG_SYN_COOKIES
1660 const struct tcphdr *th = tcp_hdr(skb);
1661
1662 if (!th->syn)
1663 sk = cookie_v4_check(sk, skb);
1664 #endif
1665 return sk;
1666 }
1667
1668 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1669 struct tcphdr *th, u32 *cookie)
1670 {
1671 u16 mss = 0;
1672 #ifdef CONFIG_SYN_COOKIES
1673 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1674 &tcp_request_sock_ipv4_ops, sk, th);
1675 if (mss) {
1676 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1677 tcp_synq_overflow(sk);
1678 }
1679 #endif
1680 return mss;
1681 }
1682
1683 /* The socket must have its spinlock held when we get
1684 * here, unless it is a TCP_LISTEN socket.
1685 *
1686 * We have a potential double-lock case here, so even when
1687 * doing backlog processing we use the BH locking scheme.
1688 * This is because we cannot sleep with the original spinlock
1689 * held.
1690 */
1691 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1692 {
1693 struct sock *rsk;
1694
1695 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1696 struct dst_entry *dst;
1697
1698 dst = rcu_dereference_protected(sk->sk_rx_dst,
1699 lockdep_sock_is_held(sk));
1700
1701 sock_rps_save_rxhash(sk, skb);
1702 sk_mark_napi_id(sk, skb);
1703 if (dst) {
1704 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1705 !dst->ops->check(dst, 0)) {
1706 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1707 dst_release(dst);
1708 }
1709 }
1710 tcp_rcv_established(sk, skb);
1711 return 0;
1712 }
1713
1714 if (tcp_checksum_complete(skb))
1715 goto csum_err;
1716
1717 if (sk->sk_state == TCP_LISTEN) {
1718 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1719
1720 if (!nsk)
1721 goto discard;
1722 if (nsk != sk) {
1723 if (tcp_child_process(sk, nsk, skb)) {
1724 rsk = nsk;
1725 goto reset;
1726 }
1727 return 0;
1728 }
1729 } else
1730 sock_rps_save_rxhash(sk, skb);
1731
1732 if (tcp_rcv_state_process(sk, skb)) {
1733 rsk = sk;
1734 goto reset;
1735 }
1736 return 0;
1737
1738 reset:
1739 tcp_v4_send_reset(rsk, skb);
1740 discard:
1741 kfree_skb(skb);
1742 /* Be careful here. If this function gets more complicated and
1743 * gcc suffers from register pressure on the x86, sk (in %ebx)
1744 * might be destroyed here. This current version compiles correctly,
1745 * but you have been warned.
1746 */
1747 return 0;
1748
1749 csum_err:
1750 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1751 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1752 goto discard;
1753 }
1754 EXPORT_SYMBOL(tcp_v4_do_rcv);
1755
1756 int tcp_v4_early_demux(struct sk_buff *skb)
1757 {
1758 const struct iphdr *iph;
1759 const struct tcphdr *th;
1760 struct sock *sk;
1761
1762 if (skb->pkt_type != PACKET_HOST)
1763 return 0;
1764
1765 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1766 return 0;
1767
1768 iph = ip_hdr(skb);
1769 th = tcp_hdr(skb);
1770
1771 if (th->doff < sizeof(struct tcphdr) / 4)
1772 return 0;
1773
1774 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1775 iph->saddr, th->source,
1776 iph->daddr, ntohs(th->dest),
1777 skb->skb_iif, inet_sdif(skb));
1778 if (sk) {
1779 skb->sk = sk;
1780 skb->destructor = sock_edemux;
1781 if (sk_fullsock(sk)) {
1782 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1783
1784 if (dst)
1785 dst = dst_check(dst, 0);
1786 if (dst &&
1787 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1788 skb_dst_set_noref(skb, dst);
1789 }
1790 }
1791 return 0;
1792 }
1793
1794 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1795 {
1796 u32 limit, tail_gso_size, tail_gso_segs;
1797 struct skb_shared_info *shinfo;
1798 const struct tcphdr *th;
1799 struct tcphdr *thtail;
1800 struct sk_buff *tail;
1801 unsigned int hdrlen;
1802 bool fragstolen;
1803 u32 gso_segs;
1804 u32 gso_size;
1805 int delta;
1806
1807 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1808 * we can fix skb->truesize to its real value to avoid future drops.
1809 * This is valid because skb is not yet charged to the socket.
1810 * It has been noticed that pure SACK packets were sometimes dropped
1811 * (if cooked by drivers without the copybreak feature).
1812 */
1813 skb_condense(skb);
1814
1815 skb_dst_drop(skb);
1816
1817 if (unlikely(tcp_checksum_complete(skb))) {
1818 bh_unlock_sock(sk);
1819 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1820 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1821 return true;
1822 }
1823
1824 /* Attempt coalescing to last skb in backlog, even if we are
1825 * above the limits.
1826 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1827 */
1828 th = (const struct tcphdr *)skb->data;
1829 hdrlen = th->doff * 4;
1830
1831 tail = sk->sk_backlog.tail;
1832 if (!tail)
1833 goto no_coalesce;
1834 thtail = (struct tcphdr *)tail->data;
1835
1836 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1837 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1838 ((TCP_SKB_CB(tail)->tcp_flags |
1839 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1840 !((TCP_SKB_CB(tail)->tcp_flags &
1841 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1842 ((TCP_SKB_CB(tail)->tcp_flags ^
1843 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1844 #ifdef CONFIG_TLS_DEVICE
1845 tail->decrypted != skb->decrypted ||
1846 #endif
1847 !mptcp_skb_can_collapse(tail, skb) ||
1848 thtail->doff != th->doff ||
1849 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1850 goto no_coalesce;
1851
1852 __skb_pull(skb, hdrlen);
1853
1854 shinfo = skb_shinfo(skb);
1855 gso_size = shinfo->gso_size ?: skb->len;
1856 gso_segs = shinfo->gso_segs ?: 1;
1857
1858 shinfo = skb_shinfo(tail);
1859 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1860 tail_gso_segs = shinfo->gso_segs ?: 1;
1861
1862 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1863 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1864
1865 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1866 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1867 thtail->window = th->window;
1868 }
1869
1870 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1871 * thtail->fin, so that the fast path in tcp_rcv_established()
1872 * is not entered if we append a packet with a FIN.
1873 * SYN, RST, URG are not present.
1874 * ACK is set on both packets.
1875 * PSH : we do not really care in TCP stack,
1876 * at least for 'GRO' packets.
1877 */
1878 thtail->fin |= th->fin;
1879 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1880
1881 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1882 TCP_SKB_CB(tail)->has_rxtstamp = true;
1883 tail->tstamp = skb->tstamp;
1884 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1885 }
1886
1887 /* Not as strict as GRO. We only need to carry the max mss value */
1888 shinfo->gso_size = max(gso_size, tail_gso_size);
1889 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1890
1891 sk->sk_backlog.len += delta;
1892 __NET_INC_STATS(sock_net(sk),
1893 LINUX_MIB_TCPBACKLOGCOALESCE);
1894 kfree_skb_partial(skb, fragstolen);
1895 return false;
1896 }
1897 __skb_push(skb, hdrlen);
1898
1899 no_coalesce:
1900 limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1901
1902 /* Only the socket owner can try to collapse/prune rx queues
1903 * to reduce memory overhead, so add a little headroom here.
1904 * Only a few socket backlogs are likely to be non-empty concurrently.
1905 */
1906 limit += 64 * 1024;
1907
1908 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1909 bh_unlock_sock(sk);
1910 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1911 return true;
1912 }
1913 return false;
1914 }
1915 EXPORT_SYMBOL(tcp_add_backlog);
1916
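/* Run the socket's attached filter (if any) on this segment.
 * sk_filter_trim_cap() makes sure the skb cannot be trimmed below the
 * TCP header (th->doff * 4 bytes).
 */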
1917 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1918 {
1919 struct tcphdr *th = (struct tcphdr *)skb->data;
1920
1921 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1922 }
1923 EXPORT_SYMBOL(tcp_filter);
1924
1925 static void tcp_v4_restore_cb(struct sk_buff *skb)
1926 {
1927 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1928 sizeof(struct inet_skb_parm));
1929 }
1930
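/* Populate TCP_SKB_CB() from the IP and TCP headers before the segment
 * enters the TCP state machine. Note that end_seq accounts for SYN and
 * FIN, which each consume one sequence number.
 */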
1931 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1932 const struct tcphdr *th)
1933 {
1934 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1935 * barrier() makes sure the compiler won't play aliasing games.
1936 */
1937 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1938 sizeof(struct inet_skb_parm));
1939 barrier();
1940
1941 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1942 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1943 skb->len - th->doff * 4);
1944 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1945 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1946 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1947 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1948 TCP_SKB_CB(skb)->sacked = 0;
1949 TCP_SKB_CB(skb)->has_rxtstamp =
1950 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1951 }
1952
1953 /*
1954 * From tcp_input.c
1955 */
1956
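/* Main IPv4 TCP receive routine, called from the IP layer for every
 * segment addressed to this host:
 *   - validate the header and checksum,
 *   - look up the owning socket (handling TIME_WAIT and NEW_SYN_RECV
 *     minisockets specially),
 *   - then either process the segment directly, or queue it on the
 *     socket backlog if the socket is currently owned by user context.
 */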
1957 int tcp_v4_rcv(struct sk_buff *skb)
1958 {
1959 struct net *net = dev_net(skb->dev);
1960 struct sk_buff *skb_to_free;
1961 int sdif = inet_sdif(skb);
1962 int dif = inet_iif(skb);
1963 const struct iphdr *iph;
1964 const struct tcphdr *th;
1965 bool refcounted;
1966 struct sock *sk;
1967 int ret;
1968
1969 if (skb->pkt_type != PACKET_HOST)
1970 goto discard_it;
1971
1972 /* Count it even if it's bad */
1973 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1974
1975 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1976 goto discard_it;
1977
1978 th = (const struct tcphdr *)skb->data;
1979
1980 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1981 goto bad_packet;
1982 if (!pskb_may_pull(skb, th->doff * 4))
1983 goto discard_it;
1984
1985 /* An explanation is required here, I think.
1986 * Packet length and doff are validated by header prediction,
1987 * provided the case of th->doff==0 is eliminated.
1988 * So, we defer the checks. */
1989
1990 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1991 goto csum_error;
1992
1993 th = (const struct tcphdr *)skb->data;
1994 iph = ip_hdr(skb);
1995 lookup:
1996 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1997 th->dest, sdif, &refcounted);
1998 if (!sk)
1999 goto no_tcp_socket;
2000
2001 process:
2002 if (sk->sk_state == TCP_TIME_WAIT)
2003 goto do_time_wait;
2004
2005 if (sk->sk_state == TCP_NEW_SYN_RECV) {
2006 struct request_sock *req = inet_reqsk(sk);
2007 bool req_stolen = false;
2008 struct sock *nsk;
2009
2010 sk = req->rsk_listener;
2011 if (unlikely(!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb) ||
2012 tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
2013 sk_drops_add(sk, skb);
2014 reqsk_put(req);
2015 goto discard_it;
2016 }
2017 if (tcp_checksum_complete(skb)) {
2018 reqsk_put(req);
2019 goto csum_error;
2020 }
2021 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2022 inet_csk_reqsk_queue_drop_and_put(sk, req);
2023 goto lookup;
2024 }
2025 /* We own a reference on the listener, increase it again
2026 * as we might lose it too soon.
2027 */
2028 sock_hold(sk);
2029 refcounted = true;
2030 nsk = NULL;
2031 if (!tcp_filter(sk, skb)) {
2032 th = (const struct tcphdr *)skb->data;
2033 iph = ip_hdr(skb);
2034 tcp_v4_fill_cb(skb, iph, th);
2035 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2036 }
2037 if (!nsk) {
2038 reqsk_put(req);
2039 if (req_stolen) {
2040 /* Another cpu got exclusive access to req
2041 * and created a full blown socket.
2042 * Try to feed this packet to this socket
2043 * instead of discarding it.
2044 */
2045 tcp_v4_restore_cb(skb);
2046 sock_put(sk);
2047 goto lookup;
2048 }
2049 goto discard_and_relse;
2050 }
2051 nf_reset_ct(skb);
2052 if (nsk == sk) {
2053 reqsk_put(req);
2054 tcp_v4_restore_cb(skb);
2055 } else if (tcp_child_process(sk, nsk, skb)) {
2056 tcp_v4_send_reset(nsk, skb);
2057 goto discard_and_relse;
2058 } else {
2059 sock_put(sk);
2060 return 0;
2061 }
2062 }
2063 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2064 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2065 goto discard_and_relse;
2066 }
2067
2068 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2069 goto discard_and_relse;
2070
2071 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2072 goto discard_and_relse;
2073
2074 nf_reset_ct(skb);
2075
2076 if (tcp_filter(sk, skb))
2077 goto discard_and_relse;
2078 th = (const struct tcphdr *)skb->data;
2079 iph = ip_hdr(skb);
2080 tcp_v4_fill_cb(skb, iph, th);
2081
2082 skb->dev = NULL;
2083
2084 if (sk->sk_state == TCP_LISTEN) {
2085 ret = tcp_v4_do_rcv(sk, skb);
2086 goto put_and_return;
2087 }
2088
2089 sk_incoming_cpu_update(sk);
2090
2091 bh_lock_sock_nested(sk);
2092 tcp_segs_in(tcp_sk(sk), skb);
2093 ret = 0;
2094 if (!sock_owned_by_user(sk)) {
2095 skb_to_free = sk->sk_rx_skb_cache;
2096 sk->sk_rx_skb_cache = NULL;
2097 ret = tcp_v4_do_rcv(sk, skb);
2098 } else {
2099 if (tcp_add_backlog(sk, skb))
2100 goto discard_and_relse;
2101 skb_to_free = NULL;
2102 }
2103 bh_unlock_sock(sk);
2104 if (skb_to_free)
2105 __kfree_skb(skb_to_free);
2106
2107 put_and_return:
2108 if (refcounted)
2109 sock_put(sk);
2110
2111 return ret;
2112
2113 no_tcp_socket:
2114 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2115 goto discard_it;
2116
2117 tcp_v4_fill_cb(skb, iph, th);
2118
2119 if (tcp_checksum_complete(skb)) {
2120 csum_error:
2121 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2122 bad_packet:
2123 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2124 } else {
2125 tcp_v4_send_reset(NULL, skb);
2126 }
2127
2128 discard_it:
2129 /* Discard frame. */
2130 kfree_skb(skb);
2131 return 0;
2132
2133 discard_and_relse:
2134 sk_drops_add(sk, skb);
2135 if (refcounted)
2136 sock_put(sk);
2137 goto discard_it;
2138
2139 do_time_wait:
2140 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2141 inet_twsk_put(inet_twsk(sk));
2142 goto discard_it;
2143 }
2144
2145 tcp_v4_fill_cb(skb, iph, th);
2146
2147 if (tcp_checksum_complete(skb)) {
2148 inet_twsk_put(inet_twsk(sk));
2149 goto csum_error;
2150 }
2151 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2152 case TCP_TW_SYN: {
2153 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2154 &tcp_hashinfo, skb,
2155 __tcp_hdrlen(th),
2156 iph->saddr, th->source,
2157 iph->daddr, th->dest,
2158 inet_iif(skb),
2159 sdif);
2160 if (sk2) {
2161 inet_twsk_deschedule_put(inet_twsk(sk));
2162 sk = sk2;
2163 tcp_v4_restore_cb(skb);
2164 refcounted = false;
2165 goto process;
2166 }
2167 }
2168 /* to ACK */
2169 fallthrough;
2170 case TCP_TW_ACK:
2171 tcp_v4_timewait_ack(sk, skb);
2172 break;
2173 case TCP_TW_RST:
2174 tcp_v4_send_reset(sk, skb);
2175 inet_twsk_deschedule_put(inet_twsk(sk));
2176 goto discard_it;
2177 case TCP_TW_SUCCESS:;
2178 }
2179 goto discard_it;
2180 }
2181
2182 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2183 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2184 .twsk_unique = tcp_twsk_unique,
2185 .twsk_destructor= tcp_twsk_destructor,
2186 };
2187
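/* Cache the input route of this skb on the socket so that the early
 * demux / established fast path can reuse it for subsequent segments
 * arriving on the same interface.
 */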
2188 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2189 {
2190 struct dst_entry *dst = skb_dst(skb);
2191
2192 if (dst && dst_hold_safe(dst)) {
2193 rcu_assign_pointer(sk->sk_rx_dst, dst);
2194 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2195 }
2196 }
2197 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2198
2199 const struct inet_connection_sock_af_ops ipv4_specific = {
2200 .queue_xmit = ip_queue_xmit,
2201 .send_check = tcp_v4_send_check,
2202 .rebuild_header = inet_sk_rebuild_header,
2203 .sk_rx_dst_set = inet_sk_rx_dst_set,
2204 .conn_request = tcp_v4_conn_request,
2205 .syn_recv_sock = tcp_v4_syn_recv_sock,
2206 .net_header_len = sizeof(struct iphdr),
2207 .setsockopt = ip_setsockopt,
2208 .getsockopt = ip_getsockopt,
2209 .addr2sockaddr = inet_csk_addr2sockaddr,
2210 .sockaddr_len = sizeof(struct sockaddr_in),
2211 .mtu_reduced = tcp_v4_mtu_reduced,
2212 };
2213 EXPORT_SYMBOL(ipv4_specific);
2214
2215 #ifdef CONFIG_TCP_MD5SIG
2216 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2217 .md5_lookup = tcp_v4_md5_lookup,
2218 .calc_md5_hash = tcp_v4_md5_hash_skb,
2219 .md5_parse = tcp_v4_parse_md5_keys,
2220 };
2221 #endif
2222
2223 /* NOTE: A lot of things are set to zero explicitly by the call to
2224 * sk_alloc(), so they need not be done here.
2225 */
2226 static int tcp_v4_init_sock(struct sock *sk)
2227 {
2228 struct inet_connection_sock *icsk = inet_csk(sk);
2229
2230 tcp_init_sock(sk);
2231
2232 icsk->icsk_af_ops = &ipv4_specific;
2233
2234 #ifdef CONFIG_TCP_MD5SIG
2235 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2236 #endif
2237
2238 return 0;
2239 }
2240
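/* Release all TCP-specific state when a socket is destroyed: timers,
 * congestion control and ULP state, the write and out-of-order queues,
 * MD5 keys, the bound port and any TCP fastopen state.
 */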
2241 void tcp_v4_destroy_sock(struct sock *sk)
2242 {
2243 struct tcp_sock *tp = tcp_sk(sk);
2244
2245 trace_tcp_destroy_sock(sk);
2246
2247 tcp_clear_xmit_timers(sk);
2248
2249 tcp_cleanup_congestion_control(sk);
2250
2251 tcp_cleanup_ulp(sk);
2252
2253 /* Clean up the write buffer. */
2254 tcp_write_queue_purge(sk);
2255
2256 /* Check if we want to disable active TFO */
2257 tcp_fastopen_active_disable_ofo_check(sk);
2258
2259 /* Cleans up our, hopefully empty, out_of_order_queue. */
2260 skb_rbtree_purge(&tp->out_of_order_queue);
2261
2262 #ifdef CONFIG_TCP_MD5SIG
2263 /* Clean up the MD5 key list, if any */
2264 if (tp->md5sig_info) {
2265 tcp_clear_md5_list(sk);
2266 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2267 tp->md5sig_info = NULL;
2268 }
2269 #endif
2270
2271 /* Clean up a referenced TCP bind bucket. */
2272 if (inet_csk(sk)->icsk_bind_hash)
2273 inet_put_port(sk);
2274
2275 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2276
2277 /* If socket is aborted during connect operation */
2278 tcp_free_fastopen_req(tp);
2279 tcp_fastopen_destroy_cipher(sk);
2280 tcp_saved_syn_free(tp);
2281
2282 sk_sockets_allocated_dec(sk);
2283 }
2284 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2285
2286 #ifdef CONFIG_PROC_FS
2287 /* Proc filesystem TCP sock list dumping. */
2288
2289 /*
2290 * Get the next listener socket following cur. If cur is NULL, get the
2291 * first socket starting from the bucket given in st->bucket; when
2292 * st->bucket is zero the very first socket in the hash table is returned.
2293 */
2294 static void *listening_get_next(struct seq_file *seq, void *cur)
2295 {
2296 struct tcp_seq_afinfo *afinfo;
2297 struct tcp_iter_state *st = seq->private;
2298 struct net *net = seq_file_net(seq);
2299 struct inet_listen_hashbucket *ilb;
2300 struct hlist_nulls_node *node;
2301 struct sock *sk = cur;
2302
2303 if (st->bpf_seq_afinfo)
2304 afinfo = st->bpf_seq_afinfo;
2305 else
2306 afinfo = PDE_DATA(file_inode(seq->file));
2307
2308 if (!sk) {
2309 get_head:
2310 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2311 spin_lock(&ilb->lock);
2312 sk = sk_nulls_head(&ilb->nulls_head);
2313 st->offset = 0;
2314 goto get_sk;
2315 }
2316 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2317 ++st->num;
2318 ++st->offset;
2319
2320 sk = sk_nulls_next(sk);
2321 get_sk:
2322 sk_nulls_for_each_from(sk, node) {
2323 if (!net_eq(sock_net(sk), net))
2324 continue;
2325 if (afinfo->family == AF_UNSPEC ||
2326 sk->sk_family == afinfo->family)
2327 return sk;
2328 }
2329 spin_unlock(&ilb->lock);
2330 st->offset = 0;
2331 if (++st->bucket < INET_LHTABLE_SIZE)
2332 goto get_head;
2333 return NULL;
2334 }
2335
2336 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2337 {
2338 struct tcp_iter_state *st = seq->private;
2339 void *rc;
2340
2341 st->bucket = 0;
2342 st->offset = 0;
2343 rc = listening_get_next(seq, NULL);
2344
2345 while (rc && *pos) {
2346 rc = listening_get_next(seq, rc);
2347 --*pos;
2348 }
2349 return rc;
2350 }
2351
2352 static inline bool empty_bucket(const struct tcp_iter_state *st)
2353 {
2354 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2355 }
2356
2357 /*
2358 * Get the first established socket starting from the bucket given in st->bucket.
2359 * If st->bucket is zero, the very first socket in the hash is returned.
2360 */
2361 static void *established_get_first(struct seq_file *seq)
2362 {
2363 struct tcp_seq_afinfo *afinfo;
2364 struct tcp_iter_state *st = seq->private;
2365 struct net *net = seq_file_net(seq);
2366 void *rc = NULL;
2367
2368 if (st->bpf_seq_afinfo)
2369 afinfo = st->bpf_seq_afinfo;
2370 else
2371 afinfo = PDE_DATA(file_inode(seq->file));
2372
2373 st->offset = 0;
2374 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2375 struct sock *sk;
2376 struct hlist_nulls_node *node;
2377 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2378
2379 /* Lockless fast path for the common case of empty buckets */
2380 if (empty_bucket(st))
2381 continue;
2382
2383 spin_lock_bh(lock);
2384 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2385 if ((afinfo->family != AF_UNSPEC &&
2386 sk->sk_family != afinfo->family) ||
2387 !net_eq(sock_net(sk), net)) {
2388 continue;
2389 }
2390 rc = sk;
2391 goto out;
2392 }
2393 spin_unlock_bh(lock);
2394 }
2395 out:
2396 return rc;
2397 }
2398
2399 static void *established_get_next(struct seq_file *seq, void *cur)
2400 {
2401 struct tcp_seq_afinfo *afinfo;
2402 struct sock *sk = cur;
2403 struct hlist_nulls_node *node;
2404 struct tcp_iter_state *st = seq->private;
2405 struct net *net = seq_file_net(seq);
2406
2407 if (st->bpf_seq_afinfo)
2408 afinfo = st->bpf_seq_afinfo;
2409 else
2410 afinfo = PDE_DATA(file_inode(seq->file));
2411
2412 ++st->num;
2413 ++st->offset;
2414
2415 sk = sk_nulls_next(sk);
2416
2417 sk_nulls_for_each_from(sk, node) {
2418 if ((afinfo->family == AF_UNSPEC ||
2419 sk->sk_family == afinfo->family) &&
2420 net_eq(sock_net(sk), net))
2421 return sk;
2422 }
2423
2424 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2425 ++st->bucket;
2426 return established_get_first(seq);
2427 }
2428
2429 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2430 {
2431 struct tcp_iter_state *st = seq->private;
2432 void *rc;
2433
2434 st->bucket = 0;
2435 rc = established_get_first(seq);
2436
2437 while (rc && pos) {
2438 rc = established_get_next(seq, rc);
2439 --pos;
2440 }
2441 return rc;
2442 }
2443
2444 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2445 {
2446 void *rc;
2447 struct tcp_iter_state *st = seq->private;
2448
2449 st->state = TCP_SEQ_STATE_LISTENING;
2450 rc = listening_get_idx(seq, &pos);
2451
2452 if (!rc) {
2453 st->state = TCP_SEQ_STATE_ESTABLISHED;
2454 rc = established_get_idx(seq, pos);
2455 }
2456
2457 return rc;
2458 }
2459
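/* Try to resume the /proc iteration at the bucket/offset saved by the
 * previous read instead of rescanning the hash tables from the start.
 * This is best effort: if sockets were added or removed since the last
 * read, the position may not be exact, and a NULL return makes the
 * caller fall back to a full walk.
 */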
2460 static void *tcp_seek_last_pos(struct seq_file *seq)
2461 {
2462 struct tcp_iter_state *st = seq->private;
2463 int bucket = st->bucket;
2464 int offset = st->offset;
2465 int orig_num = st->num;
2466 void *rc = NULL;
2467
2468 switch (st->state) {
2469 case TCP_SEQ_STATE_LISTENING:
2470 if (st->bucket >= INET_LHTABLE_SIZE)
2471 break;
2472 st->state = TCP_SEQ_STATE_LISTENING;
2473 rc = listening_get_next(seq, NULL);
2474 while (offset-- && rc && bucket == st->bucket)
2475 rc = listening_get_next(seq, rc);
2476 if (rc)
2477 break;
2478 st->bucket = 0;
2479 st->state = TCP_SEQ_STATE_ESTABLISHED;
2480 fallthrough;
2481 case TCP_SEQ_STATE_ESTABLISHED:
2482 if (st->bucket > tcp_hashinfo.ehash_mask)
2483 break;
2484 rc = established_get_first(seq);
2485 while (offset-- && rc && bucket == st->bucket)
2486 rc = established_get_next(seq, rc);
2487 }
2488
2489 st->num = orig_num;
2490
2491 return rc;
2492 }
2493
2494 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2495 {
2496 struct tcp_iter_state *st = seq->private;
2497 void *rc;
2498
2499 if (*pos && *pos == st->last_pos) {
2500 rc = tcp_seek_last_pos(seq);
2501 if (rc)
2502 goto out;
2503 }
2504
2505 st->state = TCP_SEQ_STATE_LISTENING;
2506 st->num = 0;
2507 st->bucket = 0;
2508 st->offset = 0;
2509 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2510
2511 out:
2512 st->last_pos = *pos;
2513 return rc;
2514 }
2515 EXPORT_SYMBOL(tcp_seq_start);
2516
2517 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2518 {
2519 struct tcp_iter_state *st = seq->private;
2520 void *rc = NULL;
2521
2522 if (v == SEQ_START_TOKEN) {
2523 rc = tcp_get_idx(seq, 0);
2524 goto out;
2525 }
2526
2527 switch (st->state) {
2528 case TCP_SEQ_STATE_LISTENING:
2529 rc = listening_get_next(seq, v);
2530 if (!rc) {
2531 st->state = TCP_SEQ_STATE_ESTABLISHED;
2532 st->bucket = 0;
2533 st->offset = 0;
2534 rc = established_get_first(seq);
2535 }
2536 break;
2537 case TCP_SEQ_STATE_ESTABLISHED:
2538 rc = established_get_next(seq, v);
2539 break;
2540 }
2541 out:
2542 ++*pos;
2543 st->last_pos = *pos;
2544 return rc;
2545 }
2546 EXPORT_SYMBOL(tcp_seq_next);
2547
2548 void tcp_seq_stop(struct seq_file *seq, void *v)
2549 {
2550 struct tcp_iter_state *st = seq->private;
2551
2552 switch (st->state) {
2553 case TCP_SEQ_STATE_LISTENING:
2554 if (v != SEQ_START_TOKEN)
2555 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2556 break;
2557 case TCP_SEQ_STATE_ESTABLISHED:
2558 if (v)
2559 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2560 break;
2561 }
2562 }
2563 EXPORT_SYMBOL(tcp_seq_stop);
2564
2565 static void get_openreq4(const struct request_sock *req,
2566 struct seq_file *f, int i)
2567 {
2568 const struct inet_request_sock *ireq = inet_rsk(req);
2569 long delta = req->rsk_timer.expires - jiffies;
2570
2571 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2572 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2573 i,
2574 ireq->ir_loc_addr,
2575 ireq->ir_num,
2576 ireq->ir_rmt_addr,
2577 ntohs(ireq->ir_rmt_port),
2578 TCP_SYN_RECV,
2579 0, 0, /* could print option size, but that is af dependent. */
2580 1, /* timers active (only the expire timer) */
2581 jiffies_delta_to_clock_t(delta),
2582 req->num_timeout,
2583 from_kuid_munged(seq_user_ns(f),
2584 sock_i_uid(req->rsk_listener)),
2585 0, /* non standard timer */
2586 0, /* open_requests have no inode */
2587 0,
2588 req);
2589 }
2590
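/* Format one /proc/net/tcp line for a full socket: hex local/remote
 * address and port, state, tx/rx queue sizes, retransmit timer info,
 * uid, inode and a few TCP internals (rto, ato, snd_cwnd, ssthresh).
 */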
2591 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2592 {
2593 int timer_active;
2594 unsigned long timer_expires;
2595 const struct tcp_sock *tp = tcp_sk(sk);
2596 const struct inet_connection_sock *icsk = inet_csk(sk);
2597 const struct inet_sock *inet = inet_sk(sk);
2598 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2599 __be32 dest = inet->inet_daddr;
2600 __be32 src = inet->inet_rcv_saddr;
2601 __u16 destp = ntohs(inet->inet_dport);
2602 __u16 srcp = ntohs(inet->inet_sport);
2603 int rx_queue;
2604 int state;
2605
2606 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2607 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2608 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2609 timer_active = 1;
2610 timer_expires = icsk->icsk_timeout;
2611 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2612 timer_active = 4;
2613 timer_expires = icsk->icsk_timeout;
2614 } else if (timer_pending(&sk->sk_timer)) {
2615 timer_active = 2;
2616 timer_expires = sk->sk_timer.expires;
2617 } else {
2618 timer_active = 0;
2619 timer_expires = jiffies;
2620 }
2621
2622 state = inet_sk_state_load(sk);
2623 if (state == TCP_LISTEN)
2624 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2625 else
2626 /* Because we don't lock the socket,
2627 * we might find a transient negative value.
2628 */
2629 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2630 READ_ONCE(tp->copied_seq), 0);
2631
2632 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2633 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2634 i, src, srcp, dest, destp, state,
2635 READ_ONCE(tp->write_seq) - tp->snd_una,
2636 rx_queue,
2637 timer_active,
2638 jiffies_delta_to_clock_t(timer_expires - jiffies),
2639 icsk->icsk_retransmits,
2640 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2641 icsk->icsk_probes_out,
2642 sock_i_ino(sk),
2643 refcount_read(&sk->sk_refcnt), sk,
2644 jiffies_to_clock_t(icsk->icsk_rto),
2645 jiffies_to_clock_t(icsk->icsk_ack.ato),
2646 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2647 tp->snd_cwnd,
2648 state == TCP_LISTEN ?
2649 fastopenq->max_qlen :
2650 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2651 }
2652
2653 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2654 struct seq_file *f, int i)
2655 {
2656 long delta = tw->tw_timer.expires - jiffies;
2657 __be32 dest, src;
2658 __u16 destp, srcp;
2659
2660 dest = tw->tw_daddr;
2661 src = tw->tw_rcv_saddr;
2662 destp = ntohs(tw->tw_dport);
2663 srcp = ntohs(tw->tw_sport);
2664
2665 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2666 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2667 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2668 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2669 refcount_read(&tw->tw_refcnt), tw);
2670 }
2671
2672 #define TMPSZ 150
2673
2674 static int tcp4_seq_show(struct seq_file *seq, void *v)
2675 {
2676 struct tcp_iter_state *st;
2677 struct sock *sk = v;
2678
2679 seq_setwidth(seq, TMPSZ - 1);
2680 if (v == SEQ_START_TOKEN) {
2681 seq_puts(seq, " sl local_address rem_address st tx_queue "
2682 "rx_queue tr tm->when retrnsmt uid timeout "
2683 "inode");
2684 goto out;
2685 }
2686 st = seq->private;
2687
2688 if (sk->sk_state == TCP_TIME_WAIT)
2689 get_timewait4_sock(v, seq, st->num);
2690 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2691 get_openreq4(v, seq, st->num);
2692 else
2693 get_tcp4_sock(v, seq, st->num);
2694 out:
2695 seq_pad(seq, '\n');
2696 return 0;
2697 }
2698
2699 #ifdef CONFIG_BPF_SYSCALL
2700 struct bpf_iter__tcp {
2701 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2702 __bpf_md_ptr(struct sock_common *, sk_common);
2703 uid_t uid __aligned(8);
2704 };
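
/* Run the attached BPF iterator program for one socket. seq_num is
 * decremented so that SEQ_START_TOKEN is not counted as an element
 * seen by the program.
 */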
2705
2706 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2707 struct sock_common *sk_common, uid_t uid)
2708 {
2709 struct bpf_iter__tcp ctx;
2710
2711 meta->seq_num--; /* skip SEQ_START_TOKEN */
2712 ctx.meta = meta;
2713 ctx.sk_common = sk_common;
2714 ctx.uid = uid;
2715 return bpf_iter_run_prog(prog, &ctx);
2716 }
2717
2718 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2719 {
2720 struct bpf_iter_meta meta;
2721 struct bpf_prog *prog;
2722 struct sock *sk = v;
2723 uid_t uid;
2724
2725 if (v == SEQ_START_TOKEN)
2726 return 0;
2727
2728 if (sk->sk_state == TCP_TIME_WAIT) {
2729 uid = 0;
2730 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2731 const struct request_sock *req = v;
2732
2733 uid = from_kuid_munged(seq_user_ns(seq),
2734 sock_i_uid(req->rsk_listener));
2735 } else {
2736 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2737 }
2738
2739 meta.seq = seq;
2740 prog = bpf_iter_get_info(&meta, false);
2741 return tcp_prog_seq_show(prog, &meta, v, uid);
2742 }
2743
2744 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2745 {
2746 struct bpf_iter_meta meta;
2747 struct bpf_prog *prog;
2748
2749 if (!v) {
2750 meta.seq = seq;
2751 prog = bpf_iter_get_info(&meta, true);
2752 if (prog)
2753 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2754 }
2755
2756 tcp_seq_stop(seq, v);
2757 }
2758
2759 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2760 .show = bpf_iter_tcp_seq_show,
2761 .start = tcp_seq_start,
2762 .next = tcp_seq_next,
2763 .stop = bpf_iter_tcp_seq_stop,
2764 };
2765 #endif
2766
2767 static const struct seq_operations tcp4_seq_ops = {
2768 .show = tcp4_seq_show,
2769 .start = tcp_seq_start,
2770 .next = tcp_seq_next,
2771 .stop = tcp_seq_stop,
2772 };
2773
2774 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2775 .family = AF_INET,
2776 };
2777
2778 static int __net_init tcp4_proc_init_net(struct net *net)
2779 {
2780 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2781 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2782 return -ENOMEM;
2783 return 0;
2784 }
2785
2786 static void __net_exit tcp4_proc_exit_net(struct net *net)
2787 {
2788 remove_proc_entry("tcp", net->proc_net);
2789 }
2790
2791 static struct pernet_operations tcp4_net_ops = {
2792 .init = tcp4_proc_init_net,
2793 .exit = tcp4_proc_exit_net,
2794 };
2795
2796 int __init tcp4_proc_init(void)
2797 {
2798 return register_pernet_subsys(&tcp4_net_ops);
2799 }
2800
2801 void tcp4_proc_exit(void)
2802 {
2803 unregister_pernet_subsys(&tcp4_net_ops);
2804 }
2805 #endif /* CONFIG_PROC_FS */
2806
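/* struct proto operations exposing this TCP implementation to the
 * generic AF_INET socket layer.
 */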
2807 struct proto tcp_prot = {
2808 .name = "TCP",
2809 .owner = THIS_MODULE,
2810 .close = tcp_close,
2811 .pre_connect = tcp_v4_pre_connect,
2812 .connect = tcp_v4_connect,
2813 .disconnect = tcp_disconnect,
2814 .accept = inet_csk_accept,
2815 .ioctl = tcp_ioctl,
2816 .init = tcp_v4_init_sock,
2817 .destroy = tcp_v4_destroy_sock,
2818 .shutdown = tcp_shutdown,
2819 .setsockopt = tcp_setsockopt,
2820 .getsockopt = tcp_getsockopt,
2821 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
2822 .keepalive = tcp_set_keepalive,
2823 .recvmsg = tcp_recvmsg,
2824 .sendmsg = tcp_sendmsg,
2825 .sendpage = tcp_sendpage,
2826 .backlog_rcv = tcp_v4_do_rcv,
2827 .release_cb = tcp_release_cb,
2828 .hash = inet_hash,
2829 .unhash = inet_unhash,
2830 .get_port = inet_csk_get_port,
2831 .enter_memory_pressure = tcp_enter_memory_pressure,
2832 .leave_memory_pressure = tcp_leave_memory_pressure,
2833 .stream_memory_free = tcp_stream_memory_free,
2834 .sockets_allocated = &tcp_sockets_allocated,
2835 .orphan_count = &tcp_orphan_count,
2836 .memory_allocated = &tcp_memory_allocated,
2837 .memory_pressure = &tcp_memory_pressure,
2838 .sysctl_mem = sysctl_tcp_mem,
2839 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2840 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2841 .max_header = MAX_TCP_HEADER,
2842 .obj_size = sizeof(struct tcp_sock),
2843 .slab_flags = SLAB_TYPESAFE_BY_RCU,
2844 .twsk_prot = &tcp_timewait_sock_ops,
2845 .rsk_prot = &tcp_request_sock_ops,
2846 .h.hashinfo = &tcp_hashinfo,
2847 .no_autobind = true,
2848 .diag_destroy = tcp_abort,
2849 };
2850 EXPORT_SYMBOL(tcp_prot);
2851
2852 static void __net_exit tcp_sk_exit(struct net *net)
2853 {
2854 if (net->ipv4.tcp_congestion_control)
2855 bpf_module_put(net->ipv4.tcp_congestion_control,
2856 net->ipv4.tcp_congestion_control->owner);
2857 }
2858
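/* Per-network-namespace initialization: set the default values for the
 * TCP sysctls and inherit the congestion control module from init_net
 * (falling back to Reno, which is always built in).
 */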
2859 static int __net_init tcp_sk_init(struct net *net)
2860 {
2861 int cnt;
2862
2863 net->ipv4.sysctl_tcp_ecn = 2;
2864 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2865
2866 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2867 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2868 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2869 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2870 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2871
2872 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2873 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2874 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2875
2876 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2877 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2878 net->ipv4.sysctl_tcp_syncookies = 1;
2879 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2880 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2881 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2882 net->ipv4.sysctl_tcp_orphan_retries = 0;
2883 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2884 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2885 net->ipv4.sysctl_tcp_tw_reuse = 2;
2886 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2887
2888 cnt = tcp_hashinfo.ehash_mask + 1;
2889 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2890 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2891
2892 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2893 net->ipv4.sysctl_tcp_sack = 1;
2894 net->ipv4.sysctl_tcp_window_scaling = 1;
2895 net->ipv4.sysctl_tcp_timestamps = 1;
2896 net->ipv4.sysctl_tcp_early_retrans = 3;
2897 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2898 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
2899 net->ipv4.sysctl_tcp_retrans_collapse = 1;
2900 net->ipv4.sysctl_tcp_max_reordering = 300;
2901 net->ipv4.sysctl_tcp_dsack = 1;
2902 net->ipv4.sysctl_tcp_app_win = 31;
2903 net->ipv4.sysctl_tcp_adv_win_scale = 1;
2904 net->ipv4.sysctl_tcp_frto = 2;
2905 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2906 /* This limits the percentage of the congestion window which we
2907 * will allow a single TSO frame to consume. Building TSO frames
2908 * which are too large can cause TCP streams to be bursty.
2909 */
2910 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2911 /* Default TSQ limit of 16 TSO segments */
2912 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2913 /* rfc5961 challenge ack rate limiting */
2914 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2915 net->ipv4.sysctl_tcp_min_tso_segs = 2;
2916 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2917 net->ipv4.sysctl_tcp_autocorking = 1;
2918 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2919 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2920 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2921 if (net != &init_net) {
2922 memcpy(net->ipv4.sysctl_tcp_rmem,
2923 init_net.ipv4.sysctl_tcp_rmem,
2924 sizeof(init_net.ipv4.sysctl_tcp_rmem));
2925 memcpy(net->ipv4.sysctl_tcp_wmem,
2926 init_net.ipv4.sysctl_tcp_wmem,
2927 sizeof(init_net.ipv4.sysctl_tcp_wmem));
2928 }
2929 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2930 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2931 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2932 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2933 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2934 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
2935 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2936
2937 /* Reno is always built in */
2938 if (!net_eq(net, &init_net) &&
2939 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2940 init_net.ipv4.tcp_congestion_control->owner))
2941 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2942 else
2943 net->ipv4.tcp_congestion_control = &tcp_reno;
2944
2945 return 0;
2946 }
2947
2948 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2949 {
2950 struct net *net;
2951
2952 inet_twsk_purge(&tcp_hashinfo, AF_INET);
2953
2954 list_for_each_entry(net, net_exit_list, exit_list)
2955 tcp_fastopen_ctx_destroy(net);
2956 }
2957
2958 static struct pernet_operations __net_initdata tcp_sk_ops = {
2959 .init = tcp_sk_init,
2960 .exit = tcp_sk_exit,
2961 .exit_batch = tcp_sk_exit_batch,
2962 };
2963
2964 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
2965 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
2966 struct sock_common *sk_common, uid_t uid)
2967
2968 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
2969 {
2970 struct tcp_iter_state *st = priv_data;
2971 struct tcp_seq_afinfo *afinfo;
2972 int ret;
2973
2974 afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
2975 if (!afinfo)
2976 return -ENOMEM;
2977
2978 afinfo->family = AF_UNSPEC;
2979 st->bpf_seq_afinfo = afinfo;
2980 ret = bpf_iter_init_seq_net(priv_data, aux);
2981 if (ret)
2982 kfree(afinfo);
2983 return ret;
2984 }
2985
2986 static void bpf_iter_fini_tcp(void *priv_data)
2987 {
2988 struct tcp_iter_state *st = priv_data;
2989
2990 kfree(st->bpf_seq_afinfo);
2991 bpf_iter_fini_seq_net(priv_data);
2992 }
2993
2994 static const struct bpf_iter_seq_info tcp_seq_info = {
2995 .seq_ops = &bpf_iter_tcp_seq_ops,
2996 .init_seq_private = bpf_iter_init_tcp,
2997 .fini_seq_private = bpf_iter_fini_tcp,
2998 .seq_priv_size = sizeof(struct tcp_iter_state),
2999 };
3000
3001 static struct bpf_iter_reg tcp_reg_info = {
3002 .target = "tcp",
3003 .ctx_arg_info_size = 1,
3004 .ctx_arg_info = {
3005 { offsetof(struct bpf_iter__tcp, sk_common),
3006 PTR_TO_BTF_ID_OR_NULL },
3007 },
3008 .seq_info = &tcp_seq_info,
3009 };
3010
3011 static void __init bpf_iter_register(void)
3012 {
3013 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3014 if (bpf_iter_reg_target(&tcp_reg_info))
3015 pr_warn("Warning: could not register bpf iterator tcp\n");
3016 }
3017
3018 #endif
3019
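/* Boot-time initialization: create one control socket per possible CPU
 * (used to send RSTs and ACKs on behalf of non-full sockets), register
 * the per-netns operations and, when enabled, the BPF "tcp" iterator.
 */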
3020 void __init tcp_v4_init(void)
3021 {
3022 int cpu, res;
3023
3024 for_each_possible_cpu(cpu) {
3025 struct sock *sk;
3026
3027 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3028 IPPROTO_TCP, &init_net);
3029 if (res)
3030 panic("Failed to create the TCP control socket.\n");
3031 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3032
3033 /* Please enforce IP_DF and IPID==0 for RST and
3034 * ACK sent in SYN-RECV and TIME-WAIT state.
3035 */
3036 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3037
3038 per_cpu(ipv4_tcp_sk, cpu) = sk;
3039 }
3040 if (register_pernet_subsys(&tcp_sk_ops))
3041 panic("Failed to create the TCP control socket.\n");
3042
3043 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3044 bpf_iter_register();
3045 #endif
3046 }
3047