1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19 /*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96 return secure_tcp_seq(ip_hdr(skb)->daddr,
97 ip_hdr(skb)->saddr,
98 tcp_hdr(skb)->dest,
99 tcp_hdr(skb)->source);
100 }
101
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109 const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 struct tcp_sock *tp = tcp_sk(sk);
112 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113
114 if (reuse == 2) {
115 /* Still does not detect *everything* that goes through
116 * lo, since we require a loopback src or dst address
117 * or direct binding to 'lo' interface.
118 */
119 bool loopback = false;
120 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121 loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123 if (tw->tw_family == AF_INET6) {
124 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128 loopback = true;
129 } else
130 #endif
131 {
132 if (ipv4_is_loopback(tw->tw_daddr) ||
133 ipv4_is_loopback(tw->tw_rcv_saddr))
134 loopback = true;
135 }
136 if (!loopback)
137 reuse = 0;
138 }
139
140 /* With PAWS, it is safe from the viewpoint
141 of data integrity. Even without PAWS it is safe provided sequence
142 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143
144 Actually, the idea is close to VJ's one, only timestamp cache is
145 held not per host, but per port pair and TW bucket is used as state
146 holder.
147
148 If TW bucket has been already destroyed we fall back to VJ's scheme
149 and use initial timestamp retrieved from peer table.
150 */
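/* Reuse is considered only when the TIME-WAIT socket saw a peer
 * timestamp and either the caller passed no twp or tcp_tw_reuse
 * applies and at least one second has elapsed since the peer's last
 * timestamp, so PAWS keeps old duplicates out of the new connection.
 */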
151 if (tcptw->tw_ts_recent_stamp &&
152 (!twp || (reuse && time_after32(ktime_get_seconds(),
153 tcptw->tw_ts_recent_stamp)))) {
154 /* In case of repair and re-using TIME-WAIT sockets we still
155 * want to be sure that it is safe as above but honor the
156 * sequence numbers and time stamps set as part of the repair
157 * process.
158 *
159 * Without this check re-using a TIME-WAIT socket with TCP
160 * repair would accumulate a -1 on the repair assigned
161 * sequence number. The first time it is reused the sequence
162 * is -1, the second time -2, etc. This fixes that issue
163 * without appearing to create any others.
164 */
165 if (likely(!tp->repair)) {
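/* Start the new connection's sequence space past the old one's send
 * window so stray segments from the previous incarnation cannot be
 * mistaken for new data; avoid 0, which tcp_v4_connect() treats as
 * "sequence not yet chosen".
 */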
166 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167
168 if (!seq)
169 seq = 1;
170 WRITE_ONCE(tp->write_seq, seq);
171 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
172 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173 }
174 sock_hold(sktw);
175 return 1;
176 }
177
178 return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183 int addr_len)
184 {
185 /* This check is replicated from tcp_v4_connect() and intended to
186 * prevent the BPF program called below from accessing bytes that are out
187 * of the bounds specified by the user in addr_len.
188 */
189 if (addr_len < sizeof(struct sockaddr_in))
190 return -EINVAL;
191
192 sock_owned_by_me(sk);
193
194 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 {
200 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201 struct inet_sock *inet = inet_sk(sk);
202 struct tcp_sock *tp = tcp_sk(sk);
203 __be16 orig_sport, orig_dport;
204 __be32 daddr, nexthop;
205 struct flowi4 *fl4;
206 struct rtable *rt;
207 int err;
208 struct ip_options_rcu *inet_opt;
209 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210
211 if (addr_len < sizeof(struct sockaddr_in))
212 return -EINVAL;
213
214 if (usin->sin_family != AF_INET)
215 return -EAFNOSUPPORT;
216
217 nexthop = daddr = usin->sin_addr.s_addr;
218 inet_opt = rcu_dereference_protected(inet->inet_opt,
219 lockdep_sock_is_held(sk));
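/* With an IP source-route option the connection is routed towards the
 * first hop of the route (faddr) rather than the final destination.
 */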
220 if (inet_opt && inet_opt->opt.srr) {
221 if (!daddr)
222 return -EINVAL;
223 nexthop = inet_opt->opt.faddr;
224 }
225
226 orig_sport = inet->inet_sport;
227 orig_dport = usin->sin_port;
228 fl4 = &inet->cork.fl.u.ip4;
229 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231 IPPROTO_TCP,
232 orig_sport, orig_dport, sk);
233 if (IS_ERR(rt)) {
234 err = PTR_ERR(rt);
235 if (err == -ENETUNREACH)
236 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237 return err;
238 }
239
240 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241 ip_rt_put(rt);
242 return -ENETUNREACH;
243 }
244
245 if (!inet_opt || !inet_opt->opt.srr)
246 daddr = fl4->daddr;
247
248 if (!inet->inet_saddr)
249 inet->inet_saddr = fl4->saddr;
250 sk_rcv_saddr_set(sk, inet->inet_saddr);
251
252 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253 /* Reset inherited state */
254 tp->rx_opt.ts_recent = 0;
255 tp->rx_opt.ts_recent_stamp = 0;
256 if (likely(!tp->repair))
257 WRITE_ONCE(tp->write_seq, 0);
258 }
259
260 inet->inet_dport = usin->sin_port;
261 sk_daddr_set(sk, daddr);
262
263 inet_csk(sk)->icsk_ext_hdr_len = 0;
264 if (inet_opt)
265 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266
267 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268
269 /* Socket identity is still unknown (sport may be zero).
270 * However we set the state to SYN-SENT and, without releasing the socket
271 * lock, select a source port, enter ourselves into the hash tables and
272 * complete initialization after this.
273 */
274 tcp_set_state(sk, TCP_SYN_SENT);
275 err = inet_hash_connect(tcp_death_row, sk);
276 if (err)
277 goto failure;
278
279 sk_set_txhash(sk);
280
281 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282 inet->inet_sport, inet->inet_dport, sk);
283 if (IS_ERR(rt)) {
284 err = PTR_ERR(rt);
285 rt = NULL;
286 goto failure;
287 }
288 /* OK, now commit destination to socket. */
289 sk->sk_gso_type = SKB_GSO_TCPV4;
290 sk_setup_caps(sk, &rt->dst);
291 rt = NULL;
292
293 if (likely(!tp->repair)) {
294 if (!tp->write_seq)
295 WRITE_ONCE(tp->write_seq,
296 secure_tcp_seq(inet->inet_saddr,
297 inet->inet_daddr,
298 inet->inet_sport,
299 usin->sin_port));
300 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301 inet->inet_saddr,
302 inet->inet_daddr);
303 }
304
305 inet->inet_id = prandom_u32();
306
307 if (tcp_fastopen_defer_connect(sk, &err))
308 return err;
309 if (err)
310 goto failure;
311
312 err = tcp_connect(sk);
313
314 if (err)
315 goto failure;
316
317 return 0;
318
319 failure:
320 /*
321 * This unhashes the socket and releases the local port,
322 * if necessary.
323 */
324 tcp_set_state(sk, TCP_CLOSE);
325 ip_rt_put(rt);
326 sk->sk_route_caps = 0;
327 inet->inet_dport = 0;
328 return err;
329 }
330 EXPORT_SYMBOL(tcp_v4_connect);
331
332 /*
333 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334 * It can be called through tcp_release_cb() if socket was owned by user
335 * at the time tcp_v4_err() was called to handle ICMP message.
336 */
337 void tcp_v4_mtu_reduced(struct sock *sk)
338 {
339 struct inet_sock *inet = inet_sk(sk);
340 struct dst_entry *dst;
341 u32 mtu;
342
343 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344 return;
345 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
346 dst = inet_csk_update_pmtu(sk, mtu);
347 if (!dst)
348 return;
349
350 /* Something is about to go wrong... Remember the soft error
351 * in case this connection is not able to recover.
352 */
353 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354 sk->sk_err_soft = EMSGSIZE;
355
356 mtu = dst_mtu(dst);
357
358 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359 ip_sk_accept_pmtu(sk) &&
360 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361 tcp_sync_mss(sk, mtu);
362
363 /* Resend the TCP packet because it's
364 * clear that the old packet has been
365 * dropped. This is the new "fast" path mtu
366 * discovery.
367 */
368 tcp_simple_retransmit(sk);
369 } /* else let the usual retransmit timer handle it */
370 }
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 {
375 struct dst_entry *dst = __sk_dst_check(sk, 0);
376
377 if (dst)
378 dst->ops->redirect(dst, sk, skb);
379 }
380
381
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 {
385 struct request_sock *req = inet_reqsk(sk);
386 struct net *net = sock_net(sk);
387
388 /* ICMPs are not backlogged, hence we cannot get
389 * an established socket here.
390 */
391 if (seq != tcp_rsk(req)->snt_isn) {
392 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393 } else if (abort) {
394 /*
395 * Still in SYN_RECV, just remove it silently.
396 * There is no good way to pass the error to the newly
397 * created socket, and POSIX does not want network
398 * errors returned from accept().
399 */
400 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401 tcp_listendrop(req->rsk_listener);
402 }
403 reqsk_put(req);
404 }
405 EXPORT_SYMBOL(tcp_req_err);
406
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409 {
410 struct inet_connection_sock *icsk = inet_csk(sk);
411 struct tcp_sock *tp = tcp_sk(sk);
412 struct sk_buff *skb;
413 s32 remaining;
414 u32 delta_us;
415
416 if (sock_owned_by_user(sk))
417 return;
418
419 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
420 !icsk->icsk_backoff)
421 return;
422
423 skb = tcp_rtx_queue_head(sk);
424 if (WARN_ON_ONCE(!skb))
425 return;
426
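/* RFC 6069 (TCP-LD): treat the ICMP error as a hint that the
 * retransmissions were lost to a temporary routing failure rather than
 * congestion, so undo one backoff step, recompute the RTO from the
 * current SRTT and either re-arm the timer or retransmit right away.
 */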
427 icsk->icsk_backoff--;
428 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430
431 tcp_mstamp_refresh(tp);
432 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434
435 if (remaining > 0) {
436 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437 remaining, TCP_RTO_MAX);
438 } else {
439 /* RTO revert clocked out retransmission.
440 * Will retransmit now.
441 */
442 tcp_retransmit_timer(sk);
443 }
444 }
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
446
447 /*
448 * This routine is called by the ICMP module when it gets some
449 * sort of error condition. If err < 0 then the socket should
450 * be closed and the error returned to the user. If err > 0
451 * it's just the icmp type << 8 | icmp code. After adjustment
452 * header points to the first 8 bytes of the tcp header. We need
453 * to find the appropriate port.
454 *
455 * The locking strategy used here is very "optimistic". When
456 * someone else accesses the socket the ICMP is just dropped
457 * and for some paths there is no check at all.
458 * A more general error queue to queue errors for later handling
459 * is probably better.
460 *
461 */
462
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
464 {
465 const struct iphdr *iph = (const struct iphdr *)skb->data;
466 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467 struct tcp_sock *tp;
468 struct inet_sock *inet;
469 const int type = icmp_hdr(skb)->type;
470 const int code = icmp_hdr(skb)->code;
471 struct sock *sk;
472 struct request_sock *fastopen;
473 u32 seq, snd_una;
474 int err;
475 struct net *net = dev_net(skb->dev);
476
477 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478 th->dest, iph->saddr, ntohs(th->source),
479 inet_iif(skb), 0);
480 if (!sk) {
481 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
482 return -ENOENT;
483 }
484 if (sk->sk_state == TCP_TIME_WAIT) {
485 inet_twsk_put(inet_twsk(sk));
486 return 0;
487 }
488 seq = ntohl(th->seq);
489 if (sk->sk_state == TCP_NEW_SYN_RECV) {
490 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491 type == ICMP_TIME_EXCEEDED ||
492 (type == ICMP_DEST_UNREACH &&
493 (code == ICMP_NET_UNREACH ||
494 code == ICMP_HOST_UNREACH)));
495 return 0;
496 }
497
498 bh_lock_sock(sk);
499 /* If too many ICMPs get dropped on busy
500 * servers this needs to be solved differently.
501 * We do take care of PMTU discovery (RFC1191) special case :
502 * we can receive locally generated ICMP messages while socket is held.
503 */
504 if (sock_owned_by_user(sk)) {
505 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507 }
508 if (sk->sk_state == TCP_CLOSE)
509 goto out;
510
511 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
512 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
513 goto out;
514 }
515
516 tp = tcp_sk(sk);
517 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
518 fastopen = rcu_dereference(tp->fastopen_rsk);
519 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
520 if (sk->sk_state != TCP_LISTEN &&
521 !between(seq, snd_una, tp->snd_nxt)) {
522 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
523 goto out;
524 }
525
526 switch (type) {
527 case ICMP_REDIRECT:
528 if (!sock_owned_by_user(sk))
529 do_redirect(skb, sk);
530 goto out;
531 case ICMP_SOURCE_QUENCH:
532 /* Just silently ignore these. */
533 goto out;
534 case ICMP_PARAMETERPROB:
535 err = EPROTO;
536 break;
537 case ICMP_DEST_UNREACH:
538 if (code > NR_ICMP_UNREACH)
539 goto out;
540
541 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
542 /* We are not interested in TCP_LISTEN and open_requests
543 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
544 * they should go through unfragmented).
545 */
546 if (sk->sk_state == TCP_LISTEN)
547 goto out;
548
549 WRITE_ONCE(tp->mtu_info, info);
550 if (!sock_owned_by_user(sk)) {
551 tcp_v4_mtu_reduced(sk);
552 } else {
553 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
554 sock_hold(sk);
555 }
556 goto out;
557 }
558
559 err = icmp_err_convert[code].errno;
560 /* check if this ICMP message allows revert of backoff.
561 * (see RFC 6069)
562 */
563 if (!fastopen &&
564 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565 tcp_ld_RTO_revert(sk, seq);
566 break;
567 case ICMP_TIME_EXCEEDED:
568 err = EHOSTUNREACH;
569 break;
570 default:
571 goto out;
572 }
573
574 switch (sk->sk_state) {
575 case TCP_SYN_SENT:
576 case TCP_SYN_RECV:
577 /* Only in fast or simultaneous open. If a fast open socket is
578 * already accepted it is treated as a connected one below.
579 */
580 if (fastopen && !fastopen->sk)
581 break;
582
583 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
584
585 if (!sock_owned_by_user(sk)) {
586 sk->sk_err = err;
587
588 sk->sk_error_report(sk);
589
590 tcp_done(sk);
591 } else {
592 sk->sk_err_soft = err;
593 }
594 goto out;
595 }
596
597 /* If we've already connected we will keep trying
598 * until we time out, or the user gives up.
599 *
600 * rfc1122 4.2.3.9 allows us to consider as hard errors
601 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
602 * but it is obsoleted by pmtu discovery).
603 *
604 * Note that in the modern internet, where routing is unreliable
605 * and broken firewalls sit in every dark corner, sending random
606 * errors ordered by their masters, even these two messages finally lose
607 * their original sense (even Linux sends invalid PORT_UNREACHs)
608 *
609 * Now we are in compliance with RFCs.
610 * --ANK (980905)
611 */
612
613 inet = inet_sk(sk);
614 if (!sock_owned_by_user(sk) && inet->recverr) {
615 sk->sk_err = err;
616 sk->sk_error_report(sk);
617 } else { /* Only an error on timeout */
618 sk->sk_err_soft = err;
619 }
620
621 out:
622 bh_unlock_sock(sk);
623 sock_put(sk);
624 return 0;
625 }
626
627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
628 {
629 struct tcphdr *th = tcp_hdr(skb);
630
631 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632 skb->csum_start = skb_transport_header(skb) - skb->head;
633 skb->csum_offset = offsetof(struct tcphdr, check);
634 }
635
636 /* This routine computes an IPv4 TCP checksum. */
637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
638 {
639 const struct inet_sock *inet = inet_sk(sk);
640
641 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
642 }
643 EXPORT_SYMBOL(tcp_v4_send_check);
644
645 /*
646 * This routine will send an RST to the other tcp.
647 *
648 * Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
649 * for the reset?
650 * Answer: if a packet caused an RST, it is not for a socket
651 * existing in our system; if it is matched to a socket,
652 * it is just a duplicate segment or a bug in the other side's TCP.
653 * So we build the reply based only on the parameters that
654 * arrived with the segment.
655 * Exception: precedence violation. We do not implement it in any case.
656 */
657
658 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
659 {
660 const struct tcphdr *th = tcp_hdr(skb);
661 struct {
662 struct tcphdr th;
663 #ifdef CONFIG_TCP_MD5SIG
664 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
665 #endif
666 } rep;
667 struct ip_reply_arg arg;
668 #ifdef CONFIG_TCP_MD5SIG
669 struct tcp_md5sig_key *key = NULL;
670 const __u8 *hash_location = NULL;
671 unsigned char newhash[16];
672 int genhash;
673 struct sock *sk1 = NULL;
674 #endif
675 u64 transmit_time = 0;
676 struct sock *ctl_sk;
677 struct net *net;
678
679 /* Never send a reset in response to a reset. */
680 if (th->rst)
681 return;
682
683 /* If sk is not NULL, it means we did a successful lookup and the incoming
684 * route had to be correct. prequeue might have dropped our dst.
685 */
686 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
687 return;
688
689 /* Swap the send and the receive. */
690 memset(&rep, 0, sizeof(rep));
691 rep.th.dest = th->source;
692 rep.th.source = th->dest;
693 rep.th.doff = sizeof(struct tcphdr) / 4;
694 rep.th.rst = 1;
695
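/* Per RFC 793 reset generation: if the offending segment carried an
 * ACK, the RST takes its sequence number from that ACK field;
 * otherwise send SEQ=0 and acknowledge everything the segment
 * occupied (data plus SYN/FIN).
 */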
696 if (th->ack) {
697 rep.th.seq = th->ack_seq;
698 } else {
699 rep.th.ack = 1;
700 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
701 skb->len - (th->doff << 2));
702 }
703
704 memset(&arg, 0, sizeof(arg));
705 arg.iov[0].iov_base = (unsigned char *)&rep;
706 arg.iov[0].iov_len = sizeof(rep.th);
707
708 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
709 #ifdef CONFIG_TCP_MD5SIG
710 rcu_read_lock();
711 hash_location = tcp_parse_md5sig_option(th);
712 if (sk && sk_fullsock(sk)) {
713 const union tcp_md5_addr *addr;
714 int l3index;
715
716 /* sdif set, means packet ingressed via a device
717 * in an L3 domain and inet_iif is set to it.
718 */
719 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
720 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
721 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
722 } else if (hash_location) {
723 const union tcp_md5_addr *addr;
724 int sdif = tcp_v4_sdif(skb);
725 int dif = inet_iif(skb);
726 int l3index;
727
728 /*
729 * The active side is lost. Try to find the listening socket through
730 * the source port, and then find the md5 key through the listening socket.
731 * We do not loosen security here:
732 * the incoming packet is checked with the md5 hash of the found key;
733 * no RST is generated if the md5 hash doesn't match.
734 */
735 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
736 ip_hdr(skb)->saddr,
737 th->source, ip_hdr(skb)->daddr,
738 ntohs(th->source), dif, sdif);
739 /* don't send rst if it can't find key */
740 if (!sk1)
741 goto out;
742
743 /* sdif set, means packet ingressed via a device
744 * in an L3 domain and dif is set to it.
745 */
746 l3index = sdif ? dif : 0;
747 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
748 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
749 if (!key)
750 goto out;
751
752
753 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
754 if (genhash || memcmp(hash_location, newhash, 16) != 0)
755 goto out;
756
757 }
758
759 if (key) {
760 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
761 (TCPOPT_NOP << 16) |
762 (TCPOPT_MD5SIG << 8) |
763 TCPOLEN_MD5SIG);
764 /* Update length and the length the header thinks exists */
765 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
766 rep.th.doff = arg.iov[0].iov_len / 4;
767
768 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
769 key, ip_hdr(skb)->saddr,
770 ip_hdr(skb)->daddr, &rep.th);
771 }
772 #endif
773 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
774 ip_hdr(skb)->saddr, /* XXX */
775 arg.iov[0].iov_len, IPPROTO_TCP, 0);
776 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
777 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
778
779 /* When the socket is gone, all binding information is lost.
780 * Routing might fail in this case. No choice here: if we choose to force
781 * the input interface, we will misroute in case of an asymmetric route.
782 */
783 if (sk) {
784 arg.bound_dev_if = sk->sk_bound_dev_if;
785 if (sk_fullsock(sk))
786 trace_tcp_send_reset(sk, skb);
787 }
788
789 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
790 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
791
792 arg.tos = ip_hdr(skb)->tos;
793 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
794 local_bh_disable();
795 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
796 if (sk) {
797 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
798 inet_twsk(sk)->tw_mark : sk->sk_mark;
799 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
800 inet_twsk(sk)->tw_priority : sk->sk_priority;
801 transmit_time = tcp_transmit_time(sk);
802 }
803 ip_send_unicast_reply(ctl_sk,
804 skb, &TCP_SKB_CB(skb)->header.h4.opt,
805 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
806 &arg, arg.iov[0].iov_len,
807 transmit_time);
808
809 ctl_sk->sk_mark = 0;
810 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
811 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
812 local_bh_enable();
813
814 #ifdef CONFIG_TCP_MD5SIG
815 out:
816 rcu_read_unlock();
817 #endif
818 }
819
820 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
821 outside socket context, is ugly, certainly. What can I do?
822 */
823
824 static void tcp_v4_send_ack(const struct sock *sk,
825 struct sk_buff *skb, u32 seq, u32 ack,
826 u32 win, u32 tsval, u32 tsecr, int oif,
827 struct tcp_md5sig_key *key,
828 int reply_flags, u8 tos)
829 {
830 const struct tcphdr *th = tcp_hdr(skb);
831 struct {
832 struct tcphdr th;
833 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
834 #ifdef CONFIG_TCP_MD5SIG
835 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
836 #endif
837 ];
838 } rep;
839 struct net *net = sock_net(sk);
840 struct ip_reply_arg arg;
841 struct sock *ctl_sk;
842 u64 transmit_time;
843
844 memset(&rep.th, 0, sizeof(struct tcphdr));
845 memset(&arg, 0, sizeof(arg));
846
847 arg.iov[0].iov_base = (unsigned char *)&rep;
848 arg.iov[0].iov_len = sizeof(rep.th);
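/* When echoing a timestamp, pad the 10-byte option with two leading
 * NOPs so it stays 32-bit aligned, followed by TSval and TSecr.
 */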
849 if (tsecr) {
850 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
851 (TCPOPT_TIMESTAMP << 8) |
852 TCPOLEN_TIMESTAMP);
853 rep.opt[1] = htonl(tsval);
854 rep.opt[2] = htonl(tsecr);
855 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
856 }
857
858 /* Swap the send and the receive. */
859 rep.th.dest = th->source;
860 rep.th.source = th->dest;
861 rep.th.doff = arg.iov[0].iov_len / 4;
862 rep.th.seq = htonl(seq);
863 rep.th.ack_seq = htonl(ack);
864 rep.th.ack = 1;
865 rep.th.window = htons(win);
866
867 #ifdef CONFIG_TCP_MD5SIG
868 if (key) {
869 int offset = (tsecr) ? 3 : 0;
870
871 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
872 (TCPOPT_NOP << 16) |
873 (TCPOPT_MD5SIG << 8) |
874 TCPOLEN_MD5SIG);
875 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
876 rep.th.doff = arg.iov[0].iov_len/4;
877
878 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
879 key, ip_hdr(skb)->saddr,
880 ip_hdr(skb)->daddr, &rep.th);
881 }
882 #endif
883 arg.flags = reply_flags;
884 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
885 ip_hdr(skb)->saddr, /* XXX */
886 arg.iov[0].iov_len, IPPROTO_TCP, 0);
887 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
888 if (oif)
889 arg.bound_dev_if = oif;
890 arg.tos = tos;
891 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
892 local_bh_disable();
893 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
894 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
895 inet_twsk(sk)->tw_mark : sk->sk_mark;
896 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
897 inet_twsk(sk)->tw_priority : sk->sk_priority;
898 transmit_time = tcp_transmit_time(sk);
899 ip_send_unicast_reply(ctl_sk,
900 skb, &TCP_SKB_CB(skb)->header.h4.opt,
901 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
902 &arg, arg.iov[0].iov_len,
903 transmit_time);
904
905 ctl_sk->sk_mark = 0;
906 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
907 local_bh_enable();
908 }
909
910 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
911 {
912 struct inet_timewait_sock *tw = inet_twsk(sk);
913 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
914
915 tcp_v4_send_ack(sk, skb,
916 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
917 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
918 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
919 tcptw->tw_ts_recent,
920 tw->tw_bound_dev_if,
921 tcp_twsk_md5_key(tcptw),
922 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
923 tw->tw_tos
924 );
925
926 inet_twsk_put(tw);
927 }
928
929 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
930 struct request_sock *req)
931 {
932 const union tcp_md5_addr *addr;
933 int l3index;
934
935 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
936 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
937 */
938 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
939 tcp_sk(sk)->snd_nxt;
940
941 /* RFC 7323 2.3
942 * The window field (SEG.WND) of every outgoing segment, with the
943 * exception of <SYN> segments, MUST be right-shifted by
944 * Rcv.Wind.Shift bits:
945 */
946 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
947 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
948 tcp_v4_send_ack(sk, skb, seq,
949 tcp_rsk(req)->rcv_nxt,
950 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
951 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
952 req->ts_recent,
953 0,
954 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
955 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
956 ip_hdr(skb)->tos);
957 }
958
959 /*
960 * Send a SYN-ACK after having received a SYN.
961 * This still operates on a request_sock only, not on a big
962 * socket.
963 */
964 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
965 struct flowi *fl,
966 struct request_sock *req,
967 struct tcp_fastopen_cookie *foc,
968 enum tcp_synack_type synack_type,
969 struct sk_buff *syn_skb)
970 {
971 const struct inet_request_sock *ireq = inet_rsk(req);
972 struct flowi4 fl4;
973 int err = -1;
974 struct sk_buff *skb;
975 u8 tos;
976
977 /* First, grab a route. */
978 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
979 return -1;
980
981 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
982
983 if (skb) {
984 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
985
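/* With sysctl_tcp_reflect_tos the SYN-ACK echoes the DSCP of the
 * incoming SYN while keeping the listener's ECN bits; otherwise the
 * listener's TOS is used as-is.
 */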
986 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
987 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
988 (inet_sk(sk)->tos & INET_ECN_MASK) :
989 inet_sk(sk)->tos;
990
991 if (!INET_ECN_is_capable(tos) &&
992 tcp_bpf_ca_needs_ecn((struct sock *)req))
993 tos |= INET_ECN_ECT_0;
994
995 rcu_read_lock();
996 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
997 ireq->ir_rmt_addr,
998 rcu_dereference(ireq->ireq_opt),
999 tos);
1000 rcu_read_unlock();
1001 err = net_xmit_eval(err);
1002 }
1003
1004 return err;
1005 }
1006
1007 /*
1008 * IPv4 request_sock destructor.
1009 */
1010 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1011 {
1012 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1013 }
1014
1015 #ifdef CONFIG_TCP_MD5SIG
1016 /*
1017 * RFC2385 MD5 checksumming requires a mapping of
1018 * IP address->MD5 Key.
1019 * We need to maintain these in the sk structure.
1020 */
1021
1022 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1023 EXPORT_SYMBOL(tcp_md5_needed);
1024
1025 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1026 {
1027 if (!old)
1028 return true;
1029
1030 /* l3index always overrides non-l3index */
1031 if (old->l3index && new->l3index == 0)
1032 return false;
1033 if (old->l3index == 0 && new->l3index)
1034 return true;
1035
1036 return old->prefixlen < new->prefixlen;
1037 }
1038
1039 /* Find the Key structure for an address. */
1040 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1041 const union tcp_md5_addr *addr,
1042 int family)
1043 {
1044 const struct tcp_sock *tp = tcp_sk(sk);
1045 struct tcp_md5sig_key *key;
1046 const struct tcp_md5sig_info *md5sig;
1047 __be32 mask;
1048 struct tcp_md5sig_key *best_match = NULL;
1049 bool match;
1050
1051 /* caller either holds rcu_read_lock() or socket lock */
1052 md5sig = rcu_dereference_check(tp->md5sig_info,
1053 lockdep_sock_is_held(sk));
1054 if (!md5sig)
1055 return NULL;
1056
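/* Scan all configured keys: the family must match, a key bound to an
 * L3 device only matches that l3index, and better_md5_match() prefers
 * device-bound keys and then the longest matching prefix.
 */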
1057 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1058 lockdep_sock_is_held(sk)) {
1059 if (key->family != family)
1060 continue;
1061 if (key->l3index && key->l3index != l3index)
1062 continue;
1063 if (family == AF_INET) {
1064 mask = inet_make_mask(key->prefixlen);
1065 match = (key->addr.a4.s_addr & mask) ==
1066 (addr->a4.s_addr & mask);
1067 #if IS_ENABLED(CONFIG_IPV6)
1068 } else if (family == AF_INET6) {
1069 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1070 key->prefixlen);
1071 #endif
1072 } else {
1073 match = false;
1074 }
1075
1076 if (match && better_md5_match(best_match, key))
1077 best_match = key;
1078 }
1079 return best_match;
1080 }
1081 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1082
1083 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1084 const union tcp_md5_addr *addr,
1085 int family, u8 prefixlen,
1086 int l3index)
1087 {
1088 const struct tcp_sock *tp = tcp_sk(sk);
1089 struct tcp_md5sig_key *key;
1090 unsigned int size = sizeof(struct in_addr);
1091 const struct tcp_md5sig_info *md5sig;
1092
1093 /* caller either holds rcu_read_lock() or socket lock */
1094 md5sig = rcu_dereference_check(tp->md5sig_info,
1095 lockdep_sock_is_held(sk));
1096 if (!md5sig)
1097 return NULL;
1098 #if IS_ENABLED(CONFIG_IPV6)
1099 if (family == AF_INET6)
1100 size = sizeof(struct in6_addr);
1101 #endif
1102 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1103 lockdep_sock_is_held(sk)) {
1104 if (key->family != family)
1105 continue;
1106 if (key->l3index != l3index)
1107 continue;
1108 if (!memcmp(&key->addr, addr, size) &&
1109 key->prefixlen == prefixlen)
1110 return key;
1111 }
1112 return NULL;
1113 }
1114
1115 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1116 const struct sock *addr_sk)
1117 {
1118 const union tcp_md5_addr *addr;
1119 int l3index;
1120
1121 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1122 addr_sk->sk_bound_dev_if);
1123 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1124 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1125 }
1126 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1127
1128 /* This can be called on a newly created socket, from other files */
1129 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1130 int family, u8 prefixlen, int l3index,
1131 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1132 {
1133 /* Add Key to the list */
1134 struct tcp_md5sig_key *key;
1135 struct tcp_sock *tp = tcp_sk(sk);
1136 struct tcp_md5sig_info *md5sig;
1137
1138 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1139 if (key) {
1140 /* Pre-existing entry - just update that one.
1141 * Note that the key might be used concurrently.
1142 * data_race() is telling kcsan that we do not care about
1143 * key mismatches, since changing MD5 key on live flows
1144 * can lead to packet drops.
1145 */
1146 data_race(memcpy(key->key, newkey, newkeylen));
1147
1148 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1149 * Also note that a reader could catch the new key->keylen value
1150 * but the old key->key[]; this is the reason we use __GFP_ZERO
1151 * at sock_kmalloc() time below these lines.
1152 */
1153 WRITE_ONCE(key->keylen, newkeylen);
1154
1155 return 0;
1156 }
1157
1158 md5sig = rcu_dereference_protected(tp->md5sig_info,
1159 lockdep_sock_is_held(sk));
1160 if (!md5sig) {
1161 md5sig = kmalloc(sizeof(*md5sig), gfp);
1162 if (!md5sig)
1163 return -ENOMEM;
1164
1165 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1166 INIT_HLIST_HEAD(&md5sig->head);
1167 rcu_assign_pointer(tp->md5sig_info, md5sig);
1168 }
1169
1170 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1171 if (!key)
1172 return -ENOMEM;
1173 if (!tcp_alloc_md5sig_pool()) {
1174 sock_kfree_s(sk, key, sizeof(*key));
1175 return -ENOMEM;
1176 }
1177
1178 memcpy(key->key, newkey, newkeylen);
1179 key->keylen = newkeylen;
1180 key->family = family;
1181 key->prefixlen = prefixlen;
1182 key->l3index = l3index;
1183 memcpy(&key->addr, addr,
1184 (family == AF_INET6) ? sizeof(struct in6_addr) :
1185 sizeof(struct in_addr));
1186 hlist_add_head_rcu(&key->node, &md5sig->head);
1187 return 0;
1188 }
1189 EXPORT_SYMBOL(tcp_md5_do_add);
1190
1191 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1192 u8 prefixlen, int l3index)
1193 {
1194 struct tcp_md5sig_key *key;
1195
1196 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1197 if (!key)
1198 return -ENOENT;
1199 hlist_del_rcu(&key->node);
1200 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1201 kfree_rcu(key, rcu);
1202 return 0;
1203 }
1204 EXPORT_SYMBOL(tcp_md5_do_del);
1205
1206 static void tcp_clear_md5_list(struct sock *sk)
1207 {
1208 struct tcp_sock *tp = tcp_sk(sk);
1209 struct tcp_md5sig_key *key;
1210 struct hlist_node *n;
1211 struct tcp_md5sig_info *md5sig;
1212
1213 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1214
1215 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1216 hlist_del_rcu(&key->node);
1217 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1218 kfree_rcu(key, rcu);
1219 }
1220 }
1221
1222 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1223 sockptr_t optval, int optlen)
1224 {
1225 struct tcp_md5sig cmd;
1226 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1227 const union tcp_md5_addr *addr;
1228 u8 prefixlen = 32;
1229 int l3index = 0;
1230
1231 if (optlen < sizeof(cmd))
1232 return -EINVAL;
1233
1234 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1235 return -EFAULT;
1236
1237 if (sin->sin_family != AF_INET)
1238 return -EINVAL;
1239
1240 if (optname == TCP_MD5SIG_EXT &&
1241 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1242 prefixlen = cmd.tcpm_prefixlen;
1243 if (prefixlen > 32)
1244 return -EINVAL;
1245 }
1246
1247 if (optname == TCP_MD5SIG_EXT &&
1248 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1249 struct net_device *dev;
1250
1251 rcu_read_lock();
1252 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1253 if (dev && netif_is_l3_master(dev))
1254 l3index = dev->ifindex;
1255
1256 rcu_read_unlock();
1257
1258 /* ok to reference set/not set outside of rcu;
1259 * right now device MUST be an L3 master
1260 */
1261 if (!dev || !l3index)
1262 return -EINVAL;
1263 }
1264
1265 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1266
1267 if (!cmd.tcpm_keylen)
1268 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1269
1270 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1271 return -EINVAL;
1272
1273 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1274 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1275 }
1276
1277 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1278 __be32 daddr, __be32 saddr,
1279 const struct tcphdr *th, int nbytes)
1280 {
1281 struct tcp4_pseudohdr *bp;
1282 struct scatterlist sg;
1283 struct tcphdr *_th;
1284
1285 bp = hp->scratch;
1286 bp->saddr = saddr;
1287 bp->daddr = daddr;
1288 bp->pad = 0;
1289 bp->protocol = IPPROTO_TCP;
1290 bp->len = cpu_to_be16(nbytes);
1291
1292 _th = (struct tcphdr *)(bp + 1);
1293 memcpy(_th, th, sizeof(*th));
1294 _th->check = 0;
1295
1296 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1297 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1298 sizeof(*bp) + sizeof(*th));
1299 return crypto_ahash_update(hp->md5_req);
1300 }
1301
1302 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1303 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1304 {
1305 struct tcp_md5sig_pool *hp;
1306 struct ahash_request *req;
1307
1308 hp = tcp_get_md5sig_pool();
1309 if (!hp)
1310 goto clear_hash_noput;
1311 req = hp->md5_req;
1312
1313 if (crypto_ahash_init(req))
1314 goto clear_hash;
1315 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1316 goto clear_hash;
1317 if (tcp_md5_hash_key(hp, key))
1318 goto clear_hash;
1319 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1320 if (crypto_ahash_final(req))
1321 goto clear_hash;
1322
1323 tcp_put_md5sig_pool();
1324 return 0;
1325
1326 clear_hash:
1327 tcp_put_md5sig_pool();
1328 clear_hash_noput:
1329 memset(md5_hash, 0, 16);
1330 return 1;
1331 }
1332
1333 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1334 const struct sock *sk,
1335 const struct sk_buff *skb)
1336 {
1337 struct tcp_md5sig_pool *hp;
1338 struct ahash_request *req;
1339 const struct tcphdr *th = tcp_hdr(skb);
1340 __be32 saddr, daddr;
1341
1342 if (sk) { /* valid for establish/request sockets */
1343 saddr = sk->sk_rcv_saddr;
1344 daddr = sk->sk_daddr;
1345 } else {
1346 const struct iphdr *iph = ip_hdr(skb);
1347 saddr = iph->saddr;
1348 daddr = iph->daddr;
1349 }
1350
1351 hp = tcp_get_md5sig_pool();
1352 if (!hp)
1353 goto clear_hash_noput;
1354 req = hp->md5_req;
1355
1356 if (crypto_ahash_init(req))
1357 goto clear_hash;
1358
1359 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1360 goto clear_hash;
1361 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1362 goto clear_hash;
1363 if (tcp_md5_hash_key(hp, key))
1364 goto clear_hash;
1365 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1366 if (crypto_ahash_final(req))
1367 goto clear_hash;
1368
1369 tcp_put_md5sig_pool();
1370 return 0;
1371
1372 clear_hash:
1373 tcp_put_md5sig_pool();
1374 clear_hash_noput:
1375 memset(md5_hash, 0, 16);
1376 return 1;
1377 }
1378 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1379
1380 #endif
1381
1382 /* Called with rcu_read_lock() */
1383 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1384 const struct sk_buff *skb,
1385 int dif, int sdif)
1386 {
1387 #ifdef CONFIG_TCP_MD5SIG
1388 /*
1389 * This gets called for each TCP segment that arrives
1390 * so we want to be efficient.
1391 * We have 3 drop cases:
1392 * o No MD5 hash and one expected.
1393 * o MD5 hash and we're not expecting one.
1394 * o MD5 hash and it's wrong.
1395 */
1396 const __u8 *hash_location = NULL;
1397 struct tcp_md5sig_key *hash_expected;
1398 const struct iphdr *iph = ip_hdr(skb);
1399 const struct tcphdr *th = tcp_hdr(skb);
1400 const union tcp_md5_addr *addr;
1401 unsigned char newhash[16];
1402 int genhash, l3index;
1403
1404 /* sdif set, means packet ingressed via a device
1405 * in an L3 domain and dif is set to the l3mdev
1406 */
1407 l3index = sdif ? dif : 0;
1408
1409 addr = (union tcp_md5_addr *)&iph->saddr;
1410 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1411 hash_location = tcp_parse_md5sig_option(th);
1412
1413 /* We've parsed the options - do we have a hash? */
1414 if (!hash_expected && !hash_location)
1415 return false;
1416
1417 if (hash_expected && !hash_location) {
1418 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1419 return true;
1420 }
1421
1422 if (!hash_expected && hash_location) {
1423 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1424 return true;
1425 }
1426
1427 /* Okay, so this is hash_expected and hash_location -
1428 * so we need to calculate the checksum.
1429 */
1430 genhash = tcp_v4_md5_hash_skb(newhash,
1431 hash_expected,
1432 NULL, skb);
1433
1434 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1435 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1436 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1437 &iph->saddr, ntohs(th->source),
1438 &iph->daddr, ntohs(th->dest),
1439 genhash ? " tcp_v4_calc_md5_hash failed"
1440 : "", l3index);
1441 return true;
1442 }
1443 return false;
1444 #endif
1445 return false;
1446 }
1447
1448 static void tcp_v4_init_req(struct request_sock *req,
1449 const struct sock *sk_listener,
1450 struct sk_buff *skb)
1451 {
1452 struct inet_request_sock *ireq = inet_rsk(req);
1453 struct net *net = sock_net(sk_listener);
1454
1455 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1456 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1457 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1458 }
1459
1460 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1461 struct flowi *fl,
1462 const struct request_sock *req)
1463 {
1464 return inet_csk_route_req(sk, &fl->u.ip4, req);
1465 }
1466
1467 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1468 .family = PF_INET,
1469 .obj_size = sizeof(struct tcp_request_sock),
1470 .rtx_syn_ack = tcp_rtx_synack,
1471 .send_ack = tcp_v4_reqsk_send_ack,
1472 .destructor = tcp_v4_reqsk_destructor,
1473 .send_reset = tcp_v4_send_reset,
1474 .syn_ack_timeout = tcp_syn_ack_timeout,
1475 };
1476
1477 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1478 .mss_clamp = TCP_MSS_DEFAULT,
1479 #ifdef CONFIG_TCP_MD5SIG
1480 .req_md5_lookup = tcp_v4_md5_lookup,
1481 .calc_md5_hash = tcp_v4_md5_hash_skb,
1482 #endif
1483 .init_req = tcp_v4_init_req,
1484 #ifdef CONFIG_SYN_COOKIES
1485 .cookie_init_seq = cookie_v4_init_sequence,
1486 #endif
1487 .route_req = tcp_v4_route_req,
1488 .init_seq = tcp_v4_init_seq,
1489 .init_ts_off = tcp_v4_init_ts_off,
1490 .send_synack = tcp_v4_send_synack,
1491 };
1492
1493 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1494 {
1495 /* Never answer to SYNs send to broadcast or multicast */
1496 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1497 goto drop;
1498
1499 return tcp_conn_request(&tcp_request_sock_ops,
1500 &tcp_request_sock_ipv4_ops, sk, skb);
1501
1502 drop:
1503 tcp_listendrop(sk);
1504 return 0;
1505 }
1506 EXPORT_SYMBOL(tcp_v4_conn_request);
1507
1508
1509 /*
1510 * The three way handshake has completed - we got a valid synack -
1511 * now create the new socket.
1512 */
1513 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1514 struct request_sock *req,
1515 struct dst_entry *dst,
1516 struct request_sock *req_unhash,
1517 bool *own_req)
1518 {
1519 struct inet_request_sock *ireq;
1520 bool found_dup_sk = false;
1521 struct inet_sock *newinet;
1522 struct tcp_sock *newtp;
1523 struct sock *newsk;
1524 #ifdef CONFIG_TCP_MD5SIG
1525 const union tcp_md5_addr *addr;
1526 struct tcp_md5sig_key *key;
1527 int l3index;
1528 #endif
1529 struct ip_options_rcu *inet_opt;
1530
1531 if (sk_acceptq_is_full(sk))
1532 goto exit_overflow;
1533
1534 newsk = tcp_create_openreq_child(sk, req, skb);
1535 if (!newsk)
1536 goto exit_nonewsk;
1537
1538 newsk->sk_gso_type = SKB_GSO_TCPV4;
1539 inet_sk_rx_dst_set(newsk, skb);
1540
1541 newtp = tcp_sk(newsk);
1542 newinet = inet_sk(newsk);
1543 ireq = inet_rsk(req);
1544 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1545 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1546 newsk->sk_bound_dev_if = ireq->ir_iif;
1547 newinet->inet_saddr = ireq->ir_loc_addr;
1548 inet_opt = rcu_dereference(ireq->ireq_opt);
1549 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1550 newinet->mc_index = inet_iif(skb);
1551 newinet->mc_ttl = ip_hdr(skb)->ttl;
1552 newinet->rcv_tos = ip_hdr(skb)->tos;
1553 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1554 if (inet_opt)
1555 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1556 newinet->inet_id = prandom_u32();
1557
1558 /* Set ToS of the new socket based upon the value of incoming SYN.
1559 * ECT bits are set later in tcp_init_transfer().
1560 */
1561 if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1562 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1563
1564 if (!dst) {
1565 dst = inet_csk_route_child_sock(sk, newsk, req);
1566 if (!dst)
1567 goto put_and_exit;
1568 } else {
1569 /* syncookie case : see end of cookie_v4_check() */
1570 }
1571 sk_setup_caps(newsk, dst);
1572
1573 tcp_ca_openreq_child(newsk, dst);
1574
1575 tcp_sync_mss(newsk, dst_mtu(dst));
1576 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1577
1578 tcp_initialize_rcv_mss(newsk);
1579
1580 #ifdef CONFIG_TCP_MD5SIG
1581 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1582 /* Copy over the MD5 key from the original socket */
1583 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1584 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1585 if (key) {
1586 /*
1587 * We're using one, so create a matching key
1588 * on the newsk structure. If we fail to get
1589 * memory, then we end up not copying the key
1590 * across. Shucks.
1591 */
1592 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1593 key->key, key->keylen, GFP_ATOMIC);
1594 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1595 }
1596 #endif
1597
1598 if (__inet_inherit_port(sk, newsk) < 0)
1599 goto put_and_exit;
1600 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1601 &found_dup_sk);
1602 if (likely(*own_req)) {
1603 tcp_move_syn(newtp, req);
1604 ireq->ireq_opt = NULL;
1605 } else {
1606 newinet->inet_opt = NULL;
1607
1608 if (!req_unhash && found_dup_sk) {
1609 /* This code path should only be executed in the
1610 * syncookie case.
1611 */
1612 bh_unlock_sock(newsk);
1613 sock_put(newsk);
1614 newsk = NULL;
1615 }
1616 }
1617 return newsk;
1618
1619 exit_overflow:
1620 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1621 exit_nonewsk:
1622 dst_release(dst);
1623 exit:
1624 tcp_listendrop(sk);
1625 return NULL;
1626 put_and_exit:
1627 newinet->inet_opt = NULL;
1628 inet_csk_prepare_forced_close(newsk);
1629 tcp_done(newsk);
1630 goto exit;
1631 }
1632 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1633
1634 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1635 {
1636 #ifdef CONFIG_SYN_COOKIES
1637 const struct tcphdr *th = tcp_hdr(skb);
1638
1639 if (!th->syn)
1640 sk = cookie_v4_check(sk, skb);
1641 #endif
1642 return sk;
1643 }
1644
1645 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1646 struct tcphdr *th, u32 *cookie)
1647 {
1648 u16 mss = 0;
1649 #ifdef CONFIG_SYN_COOKIES
1650 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1651 &tcp_request_sock_ipv4_ops, sk, th);
1652 if (mss) {
1653 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1654 tcp_synq_overflow(sk);
1655 }
1656 #endif
1657 return mss;
1658 }
1659
1660 /* The socket must have its spinlock held when we get
1661 * here, unless it is a TCP_LISTEN socket.
1662 *
1663 * We have a potential double-lock case here, so even when
1664 * doing backlog processing we use the BH locking scheme.
1665 * This is because we cannot sleep with the original spinlock
1666 * held.
1667 */
1668 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1669 {
1670 struct sock *rsk;
1671
1672 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1673 struct dst_entry *dst = sk->sk_rx_dst;
1674
1675 sock_rps_save_rxhash(sk, skb);
1676 sk_mark_napi_id(sk, skb);
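/* Revalidate the cached input route: drop it if the packet arrived on
 * a different interface or the dst no longer passes its check().
 */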
1677 if (dst) {
1678 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1679 !dst->ops->check(dst, 0)) {
1680 dst_release(dst);
1681 sk->sk_rx_dst = NULL;
1682 }
1683 }
1684 tcp_rcv_established(sk, skb);
1685 return 0;
1686 }
1687
1688 if (tcp_checksum_complete(skb))
1689 goto csum_err;
1690
1691 if (sk->sk_state == TCP_LISTEN) {
1692 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1693
1694 if (!nsk)
1695 goto discard;
1696 if (nsk != sk) {
1697 if (tcp_child_process(sk, nsk, skb)) {
1698 rsk = nsk;
1699 goto reset;
1700 }
1701 return 0;
1702 }
1703 } else
1704 sock_rps_save_rxhash(sk, skb);
1705
1706 if (tcp_rcv_state_process(sk, skb)) {
1707 rsk = sk;
1708 goto reset;
1709 }
1710 return 0;
1711
1712 reset:
1713 tcp_v4_send_reset(rsk, skb);
1714 discard:
1715 kfree_skb(skb);
1716 /* Be careful here. If this function gets more complicated and
1717 * gcc suffers from register pressure on the x86, sk (in %ebx)
1718 * might be destroyed here. This current version compiles correctly,
1719 * but you have been warned.
1720 */
1721 return 0;
1722
1723 csum_err:
1724 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1725 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1726 goto discard;
1727 }
1728 EXPORT_SYMBOL(tcp_v4_do_rcv);
1729
1730 int tcp_v4_early_demux(struct sk_buff *skb)
1731 {
1732 const struct iphdr *iph;
1733 const struct tcphdr *th;
1734 struct sock *sk;
1735
1736 if (skb->pkt_type != PACKET_HOST)
1737 return 0;
1738
1739 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1740 return 0;
1741
1742 iph = ip_hdr(skb);
1743 th = tcp_hdr(skb);
1744
1745 if (th->doff < sizeof(struct tcphdr) / 4)
1746 return 0;
1747
1748 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1749 iph->saddr, th->source,
1750 iph->daddr, ntohs(th->dest),
1751 skb->skb_iif, inet_sdif(skb));
1752 if (sk) {
1753 skb->sk = sk;
1754 skb->destructor = sock_edemux;
1755 if (sk_fullsock(sk)) {
1756 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1757
1758 if (dst)
1759 dst = dst_check(dst, 0);
1760 if (dst &&
1761 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1762 skb_dst_set_noref(skb, dst);
1763 }
1764 }
1765 return 0;
1766 }
1767
1768 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1769 {
1770 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1771 u32 tail_gso_size, tail_gso_segs;
1772 struct skb_shared_info *shinfo;
1773 const struct tcphdr *th;
1774 struct tcphdr *thtail;
1775 struct sk_buff *tail;
1776 unsigned int hdrlen;
1777 bool fragstolen;
1778 u32 gso_segs;
1779 u32 gso_size;
1780 int delta;
1781
1782 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1783 * we can fix skb->truesize to its real value to avoid future drops.
1784 * This is valid because skb is not yet charged to the socket.
1785 * It has been noticed pure SACK packets were sometimes dropped
1786 * (if cooked by drivers without copybreak feature).
1787 */
1788 skb_condense(skb);
1789
1790 skb_dst_drop(skb);
1791
1792 if (unlikely(tcp_checksum_complete(skb))) {
1793 bh_unlock_sock(sk);
1794 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1795 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1796 return true;
1797 }
1798
1799 /* Attempt coalescing to last skb in backlog, even if we are
1800 * above the limits.
1801 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1802 */
1803 th = (const struct tcphdr *)skb->data;
1804 hdrlen = th->doff * 4;
1805
1806 tail = sk->sk_backlog.tail;
1807 if (!tail)
1808 goto no_coalesce;
1809 thtail = (struct tcphdr *)tail->data;
1810
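/* Coalesce only when the new segment directly follows the tail in
 * sequence space, the TOS/ECN field matches, neither carries
 * SYN/RST/URG, both carry ACK, ECE/CWR agree and the TCP header
 * options are byte-identical.
 */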
1811 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1812 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1813 ((TCP_SKB_CB(tail)->tcp_flags |
1814 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1815 !((TCP_SKB_CB(tail)->tcp_flags &
1816 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1817 ((TCP_SKB_CB(tail)->tcp_flags ^
1818 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1819 #ifdef CONFIG_TLS_DEVICE
1820 tail->decrypted != skb->decrypted ||
1821 #endif
1822 thtail->doff != th->doff ||
1823 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1824 goto no_coalesce;
1825
1826 __skb_pull(skb, hdrlen);
1827
1828 shinfo = skb_shinfo(skb);
1829 gso_size = shinfo->gso_size ?: skb->len;
1830 gso_segs = shinfo->gso_segs ?: 1;
1831
1832 shinfo = skb_shinfo(tail);
1833 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1834 tail_gso_segs = shinfo->gso_segs ?: 1;
1835
1836 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1837 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1838
1839 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1840 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1841 thtail->window = th->window;
1842 }
1843
1844 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1845 * thtail->fin, so that the fast path in tcp_rcv_established()
1846 * is not entered if we append a packet with a FIN.
1847 * SYN, RST, URG are not present.
1848 * ACK is set on both packets.
1849 * PSH : we do not really care in TCP stack,
1850 * at least for 'GRO' packets.
1851 */
1852 thtail->fin |= th->fin;
1853 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1854
1855 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1856 TCP_SKB_CB(tail)->has_rxtstamp = true;
1857 tail->tstamp = skb->tstamp;
1858 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1859 }
1860
1861 /* Not as strict as GRO. We only need to carry mss max value */
1862 shinfo->gso_size = max(gso_size, tail_gso_size);
1863 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1864
1865 sk->sk_backlog.len += delta;
1866 __NET_INC_STATS(sock_net(sk),
1867 LINUX_MIB_TCPBACKLOGCOALESCE);
1868 kfree_skb_partial(skb, fragstolen);
1869 return false;
1870 }
1871 __skb_push(skb, hdrlen);
1872
1873 no_coalesce:
1874 /* Only the socket owner can try to collapse/prune rx queues
1875 * to reduce memory overhead, so add a little headroom here.
1876 * Only a few socket backlogs are likely to be non-empty concurrently.
1877 */
1878 limit += 64*1024;
1879
1880 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1881 bh_unlock_sock(sk);
1882 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1883 return true;
1884 }
1885 return false;
1886 }
1887 EXPORT_SYMBOL(tcp_add_backlog);
1888
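/* Run the attached socket filter (classic or eBPF) on the packet.
 * The cap argument keeps sk_filter_trim_cap() from trimming the skb
 * below the TCP header length.
 */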
1889 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1890 {
1891 struct tcphdr *th = (struct tcphdr *)skb->data;
1892
1893 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1894 }
1895 EXPORT_SYMBOL(tcp_filter);
1896
1897 static void tcp_v4_restore_cb(struct sk_buff *skb)
1898 {
1899 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1900 sizeof(struct inet_skb_parm));
1901 }
1902
1903 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1904 const struct tcphdr *th)
1905 {
1906 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1907 * barrier() makes sure the compiler won't play aliasing games.
1908 */
1909 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1910 sizeof(struct inet_skb_parm));
1911 barrier();
1912
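/* end_seq accounts for SYN and FIN, each of which consumes one unit of
 * sequence space in addition to the payload bytes.
 */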
1913 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1914 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1915 skb->len - th->doff * 4);
1916 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1917 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1918 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1919 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1920 TCP_SKB_CB(skb)->sacked = 0;
1921 TCP_SKB_CB(skb)->has_rxtstamp =
1922 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1923 }
1924
1925 /*
1926 * From tcp_input.c
1927 */
1928
1929 int tcp_v4_rcv(struct sk_buff *skb)
1930 {
1931 struct net *net = dev_net(skb->dev);
1932 struct sk_buff *skb_to_free;
1933 int sdif = inet_sdif(skb);
1934 int dif = inet_iif(skb);
1935 const struct iphdr *iph;
1936 const struct tcphdr *th;
1937 bool refcounted;
1938 struct sock *sk;
1939 int ret;
1940
1941 if (skb->pkt_type != PACKET_HOST)
1942 goto discard_it;
1943
1944 /* Count it even if it's bad */
1945 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1946
1947 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1948 goto discard_it;
1949
1950 th = (const struct tcphdr *)skb->data;
1951
1952 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1953 goto bad_packet;
1954 if (!pskb_may_pull(skb, th->doff * 4))
1955 goto discard_it;
1956
1957 /* An explanation is required here, I think.
1958 * Packet length and doff are validated by header prediction,
1959 * provided the case of th->doff == 0 is eliminated.
1960 * So, we defer the checks. */
1961
1962 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1963 goto csum_error;
1964
1965 th = (const struct tcphdr *)skb->data;
1966 iph = ip_hdr(skb);
1967 lookup:
1968 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1969 th->dest, sdif, &refcounted);
1970 if (!sk)
1971 goto no_tcp_socket;
1972
1973 process:
1974 if (sk->sk_state == TCP_TIME_WAIT)
1975 goto do_time_wait;
1976
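/* TCP_NEW_SYN_RECV means we found a request socket: a SYN has been
 * received and the final ACK of the three-way handshake is still
 * expected.  tcp_check_req() matches this segment against the pending
 * request and, when the handshake completes, creates the full child
 * socket.
 */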
1977 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1978 struct request_sock *req = inet_reqsk(sk);
1979 bool req_stolen = false;
1980 struct sock *nsk;
1981
1982 sk = req->rsk_listener;
1983 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1984 sk_drops_add(sk, skb);
1985 reqsk_put(req);
1986 goto discard_it;
1987 }
1988 if (tcp_checksum_complete(skb)) {
1989 reqsk_put(req);
1990 goto csum_error;
1991 }
1992 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1993 inet_csk_reqsk_queue_drop_and_put(sk, req);
1994 goto lookup;
1995 }
1996 /* We own a reference on the listener, increase it again
1997 * as we might lose it too soon.
1998 */
1999 sock_hold(sk);
2000 refcounted = true;
2001 nsk = NULL;
2002 if (!tcp_filter(sk, skb)) {
2003 th = (const struct tcphdr *)skb->data;
2004 iph = ip_hdr(skb);
2005 tcp_v4_fill_cb(skb, iph, th);
2006 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2007 }
2008 if (!nsk) {
2009 reqsk_put(req);
2010 if (req_stolen) {
2011 /* Another cpu got exclusive access to req
2012 * and created a full blown socket.
2013 * Try to feed this packet to this socket
2014 * instead of discarding it.
2015 */
2016 tcp_v4_restore_cb(skb);
2017 sock_put(sk);
2018 goto lookup;
2019 }
2020 goto discard_and_relse;
2021 }
2022 if (nsk == sk) {
2023 reqsk_put(req);
2024 tcp_v4_restore_cb(skb);
2025 } else if (tcp_child_process(sk, nsk, skb)) {
2026 tcp_v4_send_reset(nsk, skb);
2027 goto discard_and_relse;
2028 } else {
2029 sock_put(sk);
2030 return 0;
2031 }
2032 }
2033 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2034 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2035 goto discard_and_relse;
2036 }
2037
2038 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2039 goto discard_and_relse;
2040
2041 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2042 goto discard_and_relse;
2043
2044 nf_reset_ct(skb);
2045
2046 if (tcp_filter(sk, skb))
2047 goto discard_and_relse;
2048 th = (const struct tcphdr *)skb->data;
2049 iph = ip_hdr(skb);
2050 tcp_v4_fill_cb(skb, iph, th);
2051
2052 skb->dev = NULL;
2053
2054 if (sk->sk_state == TCP_LISTEN) {
2055 ret = tcp_v4_do_rcv(sk, skb);
2056 goto put_and_return;
2057 }
2058
2059 sk_incoming_cpu_update(sk);
2060
2061 bh_lock_sock_nested(sk);
2062 tcp_segs_in(tcp_sk(sk), skb);
2063 ret = 0;
2064 if (!sock_owned_by_user(sk)) {
2065 skb_to_free = sk->sk_rx_skb_cache;
2066 sk->sk_rx_skb_cache = NULL;
2067 ret = tcp_v4_do_rcv(sk, skb);
2068 } else {
2069 if (tcp_add_backlog(sk, skb))
2070 goto discard_and_relse;
2071 skb_to_free = NULL;
2072 }
2073 bh_unlock_sock(sk);
2074 if (skb_to_free)
2075 __kfree_skb(skb_to_free);
2076
2077 put_and_return:
2078 if (refcounted)
2079 sock_put(sk);
2080
2081 return ret;
2082
2083 no_tcp_socket:
2084 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2085 goto discard_it;
2086
2087 tcp_v4_fill_cb(skb, iph, th);
2088
2089 if (tcp_checksum_complete(skb)) {
2090 csum_error:
2091 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2092 bad_packet:
2093 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2094 } else {
2095 tcp_v4_send_reset(NULL, skb);
2096 }
2097
2098 discard_it:
2099 /* Discard frame. */
2100 kfree_skb(skb);
2101 return 0;
2102
2103 discard_and_relse:
2104 sk_drops_add(sk, skb);
2105 if (refcounted)
2106 sock_put(sk);
2107 goto discard_it;
2108
2109 do_time_wait:
2110 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2111 inet_twsk_put(inet_twsk(sk));
2112 goto discard_it;
2113 }
2114
2115 tcp_v4_fill_cb(skb, iph, th);
2116
2117 if (tcp_checksum_complete(skb)) {
2118 inet_twsk_put(inet_twsk(sk));
2119 goto csum_error;
2120 }
2121 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2122 case TCP_TW_SYN: {
2123 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2124 &tcp_hashinfo, skb,
2125 __tcp_hdrlen(th),
2126 iph->saddr, th->source,
2127 iph->daddr, th->dest,
2128 inet_iif(skb),
2129 sdif);
2130 if (sk2) {
2131 inet_twsk_deschedule_put(inet_twsk(sk));
2132 sk = sk2;
2133 tcp_v4_restore_cb(skb);
2134 refcounted = false;
2135 goto process;
2136 }
2137 }
2138 /* to ACK */
2139 fallthrough;
2140 case TCP_TW_ACK:
2141 tcp_v4_timewait_ack(sk, skb);
2142 break;
2143 case TCP_TW_RST:
2144 tcp_v4_send_reset(sk, skb);
2145 inet_twsk_deschedule_put(inet_twsk(sk));
2146 goto discard_it;
2147 case TCP_TW_SUCCESS:;
2148 }
2149 goto discard_it;
2150 }
2151
2152 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2153 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2154 .twsk_unique = tcp_twsk_unique,
2155 .twsk_destructor= tcp_twsk_destructor,
2156 };
2157
2158 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2159 {
2160 struct dst_entry *dst = skb_dst(skb);
2161
2162 if (dst && dst_hold_safe(dst)) {
2163 sk->sk_rx_dst = dst;
2164 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2165 }
2166 }
2167 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2168
2169 const struct inet_connection_sock_af_ops ipv4_specific = {
2170 .queue_xmit = ip_queue_xmit,
2171 .send_check = tcp_v4_send_check,
2172 .rebuild_header = inet_sk_rebuild_header,
2173 .sk_rx_dst_set = inet_sk_rx_dst_set,
2174 .conn_request = tcp_v4_conn_request,
2175 .syn_recv_sock = tcp_v4_syn_recv_sock,
2176 .net_header_len = sizeof(struct iphdr),
2177 .setsockopt = ip_setsockopt,
2178 .getsockopt = ip_getsockopt,
2179 .addr2sockaddr = inet_csk_addr2sockaddr,
2180 .sockaddr_len = sizeof(struct sockaddr_in),
2181 .mtu_reduced = tcp_v4_mtu_reduced,
2182 };
2183 EXPORT_SYMBOL(ipv4_specific);
2184
2185 #ifdef CONFIG_TCP_MD5SIG
2186 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2187 .md5_lookup = tcp_v4_md5_lookup,
2188 .calc_md5_hash = tcp_v4_md5_hash_skb,
2189 .md5_parse = tcp_v4_parse_md5_keys,
2190 };
2191 #endif
2192
2193 /* NOTE: A lot of fields are set to zero explicitly by the call to
2194 * sk_alloc(), so they need not be initialized here.
2195 */
2196 static int tcp_v4_init_sock(struct sock *sk)
2197 {
2198 struct inet_connection_sock *icsk = inet_csk(sk);
2199
2200 tcp_init_sock(sk);
2201
2202 icsk->icsk_af_ops = &ipv4_specific;
2203
2204 #ifdef CONFIG_TCP_MD5SIG
2205 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2206 #endif
2207
2208 return 0;
2209 }
2210
2211 void tcp_v4_destroy_sock(struct sock *sk)
2212 {
2213 struct tcp_sock *tp = tcp_sk(sk);
2214
2215 trace_tcp_destroy_sock(sk);
2216
2217 tcp_clear_xmit_timers(sk);
2218
2219 tcp_cleanup_congestion_control(sk);
2220
2221 tcp_cleanup_ulp(sk);
2222
2223 /* Clean up the write buffer. */
2224 tcp_write_queue_purge(sk);
2225
2226 /* Check if we want to disable active TFO */
2227 tcp_fastopen_active_disable_ofo_check(sk);
2228
2229 /* Cleans up our, hopefully empty, out_of_order_queue. */
2230 skb_rbtree_purge(&tp->out_of_order_queue);
2231
2232 #ifdef CONFIG_TCP_MD5SIG
2233 /* Clean up the MD5 key list, if any */
2234 if (tp->md5sig_info) {
2235 tcp_clear_md5_list(sk);
2236 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2237 tp->md5sig_info = NULL;
2238 }
2239 #endif
2240
2241 /* Clean up a referenced TCP bind bucket. */
2242 if (inet_csk(sk)->icsk_bind_hash)
2243 inet_put_port(sk);
2244
2245 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2246
2247 /* If socket is aborted during connect operation */
2248 tcp_free_fastopen_req(tp);
2249 tcp_fastopen_destroy_cipher(sk);
2250 tcp_saved_syn_free(tp);
2251
2252 sk_sockets_allocated_dec(sk);
2253 }
2254 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2255
2256 #ifdef CONFIG_PROC_FS
2257 /* Proc filesystem TCP sock list dumping. */
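/* /proc/net/tcp is produced in two passes: first the listening hash
 * (TCP_SEQ_STATE_LISTENING), then the established hash, which also
 * holds TIME-WAIT sockets (TCP_SEQ_STATE_ESTABLISHED).  st->bucket,
 * st->offset and st->num record the position so large reads can resume
 * between syscalls.
 */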
2258
2259 /*
2260 * Get the next listener socket following cur. If cur is NULL, get the
2261 * first socket starting from the bucket given in st->bucket; when
2262 * st->bucket is zero the very first socket in the hash table is returned.
2263 */
2264 static void *listening_get_next(struct seq_file *seq, void *cur)
2265 {
2266 struct tcp_seq_afinfo *afinfo;
2267 struct tcp_iter_state *st = seq->private;
2268 struct net *net = seq_file_net(seq);
2269 struct inet_listen_hashbucket *ilb;
2270 struct hlist_nulls_node *node;
2271 struct sock *sk = cur;
2272
2273 if (st->bpf_seq_afinfo)
2274 afinfo = st->bpf_seq_afinfo;
2275 else
2276 afinfo = PDE_DATA(file_inode(seq->file));
2277
2278 if (!sk) {
2279 get_head:
2280 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2281 spin_lock(&ilb->lock);
2282 sk = sk_nulls_head(&ilb->nulls_head);
2283 st->offset = 0;
2284 goto get_sk;
2285 }
2286 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2287 ++st->num;
2288 ++st->offset;
2289
2290 sk = sk_nulls_next(sk);
2291 get_sk:
2292 sk_nulls_for_each_from(sk, node) {
2293 if (!net_eq(sock_net(sk), net))
2294 continue;
2295 if (afinfo->family == AF_UNSPEC ||
2296 sk->sk_family == afinfo->family)
2297 return sk;
2298 }
2299 spin_unlock(&ilb->lock);
2300 st->offset = 0;
2301 if (++st->bucket < INET_LHTABLE_SIZE)
2302 goto get_head;
2303 return NULL;
2304 }
2305
2306 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2307 {
2308 struct tcp_iter_state *st = seq->private;
2309 void *rc;
2310
2311 st->bucket = 0;
2312 st->offset = 0;
2313 rc = listening_get_next(seq, NULL);
2314
2315 while (rc && *pos) {
2316 rc = listening_get_next(seq, rc);
2317 --*pos;
2318 }
2319 return rc;
2320 }
2321
2322 static inline bool empty_bucket(const struct tcp_iter_state *st)
2323 {
2324 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2325 }
2326
2327 /*
2328 * Get first established socket starting from bucket given in st->bucket.
2329 * If st->bucket is zero, the very first socket in the hash is returned.
2330 */
2331 static void *established_get_first(struct seq_file *seq)
2332 {
2333 struct tcp_seq_afinfo *afinfo;
2334 struct tcp_iter_state *st = seq->private;
2335 struct net *net = seq_file_net(seq);
2336 void *rc = NULL;
2337
2338 if (st->bpf_seq_afinfo)
2339 afinfo = st->bpf_seq_afinfo;
2340 else
2341 afinfo = PDE_DATA(file_inode(seq->file));
2342
2343 st->offset = 0;
2344 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2345 struct sock *sk;
2346 struct hlist_nulls_node *node;
2347 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2348
2349 /* Lockless fast path for the common case of empty buckets */
2350 if (empty_bucket(st))
2351 continue;
2352
2353 spin_lock_bh(lock);
2354 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2355 if ((afinfo->family != AF_UNSPEC &&
2356 sk->sk_family != afinfo->family) ||
2357 !net_eq(sock_net(sk), net)) {
2358 continue;
2359 }
2360 rc = sk;
2361 goto out;
2362 }
2363 spin_unlock_bh(lock);
2364 }
2365 out:
2366 return rc;
2367 }
2368
2369 static void *established_get_next(struct seq_file *seq, void *cur)
2370 {
2371 struct tcp_seq_afinfo *afinfo;
2372 struct sock *sk = cur;
2373 struct hlist_nulls_node *node;
2374 struct tcp_iter_state *st = seq->private;
2375 struct net *net = seq_file_net(seq);
2376
2377 if (st->bpf_seq_afinfo)
2378 afinfo = st->bpf_seq_afinfo;
2379 else
2380 afinfo = PDE_DATA(file_inode(seq->file));
2381
2382 ++st->num;
2383 ++st->offset;
2384
2385 sk = sk_nulls_next(sk);
2386
2387 sk_nulls_for_each_from(sk, node) {
2388 if ((afinfo->family == AF_UNSPEC ||
2389 sk->sk_family == afinfo->family) &&
2390 net_eq(sock_net(sk), net))
2391 return sk;
2392 }
2393
2394 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2395 ++st->bucket;
2396 return established_get_first(seq);
2397 }
2398
2399 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2400 {
2401 struct tcp_iter_state *st = seq->private;
2402 void *rc;
2403
2404 st->bucket = 0;
2405 rc = established_get_first(seq);
2406
2407 while (rc && pos) {
2408 rc = established_get_next(seq, rc);
2409 --pos;
2410 }
2411 return rc;
2412 }
2413
2414 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2415 {
2416 void *rc;
2417 struct tcp_iter_state *st = seq->private;
2418
2419 st->state = TCP_SEQ_STATE_LISTENING;
2420 rc = listening_get_idx(seq, &pos);
2421
2422 if (!rc) {
2423 st->state = TCP_SEQ_STATE_ESTABLISHED;
2424 rc = established_get_idx(seq, pos);
2425 }
2426
2427 return rc;
2428 }
2429
2430 static void *tcp_seek_last_pos(struct seq_file *seq)
2431 {
2432 struct tcp_iter_state *st = seq->private;
2433 int bucket = st->bucket;
2434 int offset = st->offset;
2435 int orig_num = st->num;
2436 void *rc = NULL;
2437
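/* Re-walk the bucket recorded in st->bucket up to st->offset so that a
 * reader resuming a partial read of the seq_file continues where the
 * previous read stopped, without rescanning earlier buckets.
 */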
2438 switch (st->state) {
2439 case TCP_SEQ_STATE_LISTENING:
2440 if (st->bucket >= INET_LHTABLE_SIZE)
2441 break;
2442 st->state = TCP_SEQ_STATE_LISTENING;
2443 rc = listening_get_next(seq, NULL);
2444 while (offset-- && rc && bucket == st->bucket)
2445 rc = listening_get_next(seq, rc);
2446 if (rc)
2447 break;
2448 st->bucket = 0;
2449 st->state = TCP_SEQ_STATE_ESTABLISHED;
2450 fallthrough;
2451 case TCP_SEQ_STATE_ESTABLISHED:
2452 if (st->bucket > tcp_hashinfo.ehash_mask)
2453 break;
2454 rc = established_get_first(seq);
2455 while (offset-- && rc && bucket == st->bucket)
2456 rc = established_get_next(seq, rc);
2457 }
2458
2459 st->num = orig_num;
2460
2461 return rc;
2462 }
2463
2464 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2465 {
2466 struct tcp_iter_state *st = seq->private;
2467 void *rc;
2468
2469 if (*pos && *pos == st->last_pos) {
2470 rc = tcp_seek_last_pos(seq);
2471 if (rc)
2472 goto out;
2473 }
2474
2475 st->state = TCP_SEQ_STATE_LISTENING;
2476 st->num = 0;
2477 st->bucket = 0;
2478 st->offset = 0;
2479 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2480
2481 out:
2482 st->last_pos = *pos;
2483 return rc;
2484 }
2485 EXPORT_SYMBOL(tcp_seq_start);
2486
2487 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2488 {
2489 struct tcp_iter_state *st = seq->private;
2490 void *rc = NULL;
2491
2492 if (v == SEQ_START_TOKEN) {
2493 rc = tcp_get_idx(seq, 0);
2494 goto out;
2495 }
2496
2497 switch (st->state) {
2498 case TCP_SEQ_STATE_LISTENING:
2499 rc = listening_get_next(seq, v);
2500 if (!rc) {
2501 st->state = TCP_SEQ_STATE_ESTABLISHED;
2502 st->bucket = 0;
2503 st->offset = 0;
2504 rc = established_get_first(seq);
2505 }
2506 break;
2507 case TCP_SEQ_STATE_ESTABLISHED:
2508 rc = established_get_next(seq, v);
2509 break;
2510 }
2511 out:
2512 ++*pos;
2513 st->last_pos = *pos;
2514 return rc;
2515 }
2516 EXPORT_SYMBOL(tcp_seq_next);
2517
2518 void tcp_seq_stop(struct seq_file *seq, void *v)
2519 {
2520 struct tcp_iter_state *st = seq->private;
2521
2522 switch (st->state) {
2523 case TCP_SEQ_STATE_LISTENING:
2524 if (v != SEQ_START_TOKEN)
2525 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2526 break;
2527 case TCP_SEQ_STATE_ESTABLISHED:
2528 if (v)
2529 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2530 break;
2531 }
2532 }
2533 EXPORT_SYMBOL(tcp_seq_stop);
2534
2535 static void get_openreq4(const struct request_sock *req,
2536 struct seq_file *f, int i)
2537 {
2538 const struct inet_request_sock *ireq = inet_rsk(req);
2539 long delta = req->rsk_timer.expires - jiffies;
2540
2541 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2542 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2543 i,
2544 ireq->ir_loc_addr,
2545 ireq->ir_num,
2546 ireq->ir_rmt_addr,
2547 ntohs(ireq->ir_rmt_port),
2548 TCP_SYN_RECV,
2549 0, 0, /* could print option size, but that is af dependent. */
2550 1, /* timers active (only the expire timer) */
2551 jiffies_delta_to_clock_t(delta),
2552 req->num_timeout,
2553 from_kuid_munged(seq_user_ns(f),
2554 sock_i_uid(req->rsk_listener)),
2555 0, /* non standard timer */
2556 0, /* open_requests have no inode */
2557 0,
2558 req);
2559 }
2560
2561 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2562 {
2563 int timer_active;
2564 unsigned long timer_expires;
2565 const struct tcp_sock *tp = tcp_sk(sk);
2566 const struct inet_connection_sock *icsk = inet_csk(sk);
2567 const struct inet_sock *inet = inet_sk(sk);
2568 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2569 __be32 dest = inet->inet_daddr;
2570 __be32 src = inet->inet_rcv_saddr;
2571 __u16 destp = ntohs(inet->inet_dport);
2572 __u16 srcp = ntohs(inet->inet_sport);
2573 int rx_queue;
2574 int state;
2575
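/* Timer codes reported in /proc/net/tcp: 1 = retransmission, RACK
 * reorder or tail-loss-probe timer, 2 = keepalive timer, 3 = TIME-WAIT
 * (see get_timewait4_sock()), 4 = zero-window probe timer, 0 = no timer
 * pending.
 */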
2576 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2577 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2578 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2579 timer_active = 1;
2580 timer_expires = icsk->icsk_timeout;
2581 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2582 timer_active = 4;
2583 timer_expires = icsk->icsk_timeout;
2584 } else if (timer_pending(&sk->sk_timer)) {
2585 timer_active = 2;
2586 timer_expires = sk->sk_timer.expires;
2587 } else {
2588 timer_active = 0;
2589 timer_expires = jiffies;
2590 }
2591
2592 state = inet_sk_state_load(sk);
2593 if (state == TCP_LISTEN)
2594 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2595 else
2596 /* Because we don't lock the socket,
2597 * we might find a transient negative value.
2598 */
2599 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2600 READ_ONCE(tp->copied_seq), 0);
2601
2602 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2603 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2604 i, src, srcp, dest, destp, state,
2605 READ_ONCE(tp->write_seq) - tp->snd_una,
2606 rx_queue,
2607 timer_active,
2608 jiffies_delta_to_clock_t(timer_expires - jiffies),
2609 icsk->icsk_retransmits,
2610 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2611 icsk->icsk_probes_out,
2612 sock_i_ino(sk),
2613 refcount_read(&sk->sk_refcnt), sk,
2614 jiffies_to_clock_t(icsk->icsk_rto),
2615 jiffies_to_clock_t(icsk->icsk_ack.ato),
2616 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2617 tp->snd_cwnd,
2618 state == TCP_LISTEN ?
2619 fastopenq->max_qlen :
2620 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2621 }
2622
2623 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2624 struct seq_file *f, int i)
2625 {
2626 long delta = tw->tw_timer.expires - jiffies;
2627 __be32 dest, src;
2628 __u16 destp, srcp;
2629
2630 dest = tw->tw_daddr;
2631 src = tw->tw_rcv_saddr;
2632 destp = ntohs(tw->tw_dport);
2633 srcp = ntohs(tw->tw_sport);
2634
2635 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2636 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2637 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2638 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2639 refcount_read(&tw->tw_refcnt), tw);
2640 }
2641
2642 #define TMPSZ 150
2643
2644 static int tcp4_seq_show(struct seq_file *seq, void *v)
2645 {
2646 struct tcp_iter_state *st;
2647 struct sock *sk = v;
2648
2649 seq_setwidth(seq, TMPSZ - 1);
2650 if (v == SEQ_START_TOKEN) {
2651 seq_puts(seq, " sl local_address rem_address st tx_queue "
2652 "rx_queue tr tm->when retrnsmt uid timeout "
2653 "inode");
2654 goto out;
2655 }
2656 st = seq->private;
2657
2658 if (sk->sk_state == TCP_TIME_WAIT)
2659 get_timewait4_sock(v, seq, st->num);
2660 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2661 get_openreq4(v, seq, st->num);
2662 else
2663 get_tcp4_sock(v, seq, st->num);
2664 out:
2665 seq_pad(seq, '\n');
2666 return 0;
2667 }
2668
2669 #ifdef CONFIG_BPF_SYSCALL
2670 struct bpf_iter__tcp {
2671 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2672 __bpf_md_ptr(struct sock_common *, sk_common);
2673 uid_t uid __aligned(8);
2674 };
2675
2676 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2677 struct sock_common *sk_common, uid_t uid)
2678 {
2679 struct bpf_iter__tcp ctx;
2680
2681 meta->seq_num--; /* skip SEQ_START_TOKEN */
2682 ctx.meta = meta;
2683 ctx.sk_common = sk_common;
2684 ctx.uid = uid;
2685 return bpf_iter_run_prog(prog, &ctx);
2686 }
2687
2688 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2689 {
2690 struct bpf_iter_meta meta;
2691 struct bpf_prog *prog;
2692 struct sock *sk = v;
2693 uid_t uid;
2694
2695 if (v == SEQ_START_TOKEN)
2696 return 0;
2697
2698 if (sk->sk_state == TCP_TIME_WAIT) {
2699 uid = 0;
2700 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2701 const struct request_sock *req = v;
2702
2703 uid = from_kuid_munged(seq_user_ns(seq),
2704 sock_i_uid(req->rsk_listener));
2705 } else {
2706 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2707 }
2708
2709 meta.seq = seq;
2710 prog = bpf_iter_get_info(&meta, false);
2711 return tcp_prog_seq_show(prog, &meta, v, uid);
2712 }
2713
2714 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2715 {
2716 struct bpf_iter_meta meta;
2717 struct bpf_prog *prog;
2718
2719 if (!v) {
2720 meta.seq = seq;
2721 prog = bpf_iter_get_info(&meta, true);
2722 if (prog)
2723 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2724 }
2725
2726 tcp_seq_stop(seq, v);
2727 }
2728
2729 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2730 .show = bpf_iter_tcp_seq_show,
2731 .start = tcp_seq_start,
2732 .next = tcp_seq_next,
2733 .stop = bpf_iter_tcp_seq_stop,
2734 };
2735 #endif
2736
2737 static const struct seq_operations tcp4_seq_ops = {
2738 .show = tcp4_seq_show,
2739 .start = tcp_seq_start,
2740 .next = tcp_seq_next,
2741 .stop = tcp_seq_stop,
2742 };
2743
2744 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2745 .family = AF_INET,
2746 };
2747
2748 static int __net_init tcp4_proc_init_net(struct net *net)
2749 {
2750 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2751 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2752 return -ENOMEM;
2753 return 0;
2754 }
2755
2756 static void __net_exit tcp4_proc_exit_net(struct net *net)
2757 {
2758 remove_proc_entry("tcp", net->proc_net);
2759 }
2760
2761 static struct pernet_operations tcp4_net_ops = {
2762 .init = tcp4_proc_init_net,
2763 .exit = tcp4_proc_exit_net,
2764 };
2765
2766 int __init tcp4_proc_init(void)
2767 {
2768 return register_pernet_subsys(&tcp4_net_ops);
2769 }
2770
2771 void tcp4_proc_exit(void)
2772 {
2773 unregister_pernet_subsys(&tcp4_net_ops);
2774 }
2775 #endif /* CONFIG_PROC_FS */
2776
2777 struct proto tcp_prot = {
2778 .name = "TCP",
2779 .owner = THIS_MODULE,
2780 .close = tcp_close,
2781 .pre_connect = tcp_v4_pre_connect,
2782 .connect = tcp_v4_connect,
2783 .disconnect = tcp_disconnect,
2784 .accept = inet_csk_accept,
2785 .ioctl = tcp_ioctl,
2786 .init = tcp_v4_init_sock,
2787 .destroy = tcp_v4_destroy_sock,
2788 .shutdown = tcp_shutdown,
2789 .setsockopt = tcp_setsockopt,
2790 .getsockopt = tcp_getsockopt,
2791 .keepalive = tcp_set_keepalive,
2792 .recvmsg = tcp_recvmsg,
2793 .sendmsg = tcp_sendmsg,
2794 .sendpage = tcp_sendpage,
2795 .backlog_rcv = tcp_v4_do_rcv,
2796 .release_cb = tcp_release_cb,
2797 .hash = inet_hash,
2798 .unhash = inet_unhash,
2799 .get_port = inet_csk_get_port,
2800 .enter_memory_pressure = tcp_enter_memory_pressure,
2801 .leave_memory_pressure = tcp_leave_memory_pressure,
2802 .stream_memory_free = tcp_stream_memory_free,
2803 .sockets_allocated = &tcp_sockets_allocated,
2804 .orphan_count = &tcp_orphan_count,
2805 .memory_allocated = &tcp_memory_allocated,
2806 .memory_pressure = &tcp_memory_pressure,
2807 .sysctl_mem = sysctl_tcp_mem,
2808 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2809 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2810 .max_header = MAX_TCP_HEADER,
2811 .obj_size = sizeof(struct tcp_sock),
2812 .slab_flags = SLAB_TYPESAFE_BY_RCU,
2813 .twsk_prot = &tcp_timewait_sock_ops,
2814 .rsk_prot = &tcp_request_sock_ops,
2815 .h.hashinfo = &tcp_hashinfo,
2816 .no_autobind = true,
2817 .diag_destroy = tcp_abort,
2818 };
2819 EXPORT_SYMBOL(tcp_prot);
2820
2821 static void __net_exit tcp_sk_exit(struct net *net)
2822 {
2823 int cpu;
2824
2825 if (net->ipv4.tcp_congestion_control)
2826 bpf_module_put(net->ipv4.tcp_congestion_control,
2827 net->ipv4.tcp_congestion_control->owner);
2828
2829 for_each_possible_cpu(cpu)
2830 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2831 free_percpu(net->ipv4.tcp_sk);
2832 }
2833
2834 static int __net_init tcp_sk_init(struct net *net)
2835 {
2836 int res, cpu, cnt;
2837
2838 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2839 if (!net->ipv4.tcp_sk)
2840 return -ENOMEM;
2841
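/* One control socket per possible CPU: these are used by
 * tcp_v4_send_reset() and tcp_v4_send_ack() to emit RST and ACK
 * packets that are not associated with a full socket.
 */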
2842 for_each_possible_cpu(cpu) {
2843 struct sock *sk;
2844
2845 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2846 IPPROTO_TCP, net);
2847 if (res)
2848 goto fail;
2849 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2850
2851 /* Please enforce IP_DF and IPID==0 for RST and
2852 * ACK sent in SYN-RECV and TIME-WAIT state.
2853 */
2854 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2855
2856 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2857 }
2858
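/* tcp_ecn == 2: accept ECN when requested by incoming connections,
 * but do not request it on outgoing connections.
 */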
2859 net->ipv4.sysctl_tcp_ecn = 2;
2860 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2861
2862 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2863 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2864 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2865 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2866 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2867
2868 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2869 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2870 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2871
2872 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2873 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2874 net->ipv4.sysctl_tcp_syncookies = 1;
2875 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2876 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2877 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2878 net->ipv4.sysctl_tcp_orphan_retries = 0;
2879 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2880 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
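/* tcp_tw_reuse == 2: allow reuse of TIME-WAIT sockets for new outgoing
 * connections to loopback addresses only.
 */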
2881 net->ipv4.sysctl_tcp_tw_reuse = 2;
2882 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2883
2884 cnt = tcp_hashinfo.ehash_mask + 1;
2885 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2886 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2887
2888 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2889 net->ipv4.sysctl_tcp_sack = 1;
2890 net->ipv4.sysctl_tcp_window_scaling = 1;
2891 net->ipv4.sysctl_tcp_timestamps = 1;
2892 net->ipv4.sysctl_tcp_early_retrans = 3;
2893 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2894 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
2895 net->ipv4.sysctl_tcp_retrans_collapse = 1;
2896 net->ipv4.sysctl_tcp_max_reordering = 300;
2897 net->ipv4.sysctl_tcp_dsack = 1;
2898 net->ipv4.sysctl_tcp_app_win = 31;
2899 net->ipv4.sysctl_tcp_adv_win_scale = 1;
2900 net->ipv4.sysctl_tcp_frto = 2;
2901 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2902 /* This limits the percentage of the congestion window which we
2903 * will allow a single TSO frame to consume. Building TSO frames
2904 * which are too large can cause TCP streams to be bursty.
2905 */
2906 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2907 /* Default TSQ limit of 16 TSO segments */
2908 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2909 /* rfc5961 challenge ack rate limiting */
2910 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2911 net->ipv4.sysctl_tcp_min_tso_segs = 2;
2912 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2913 net->ipv4.sysctl_tcp_autocorking = 1;
2914 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2915 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2916 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2917 if (net != &init_net) {
2918 memcpy(net->ipv4.sysctl_tcp_rmem,
2919 init_net.ipv4.sysctl_tcp_rmem,
2920 sizeof(init_net.ipv4.sysctl_tcp_rmem));
2921 memcpy(net->ipv4.sysctl_tcp_wmem,
2922 init_net.ipv4.sysctl_tcp_wmem,
2923 sizeof(init_net.ipv4.sysctl_tcp_wmem));
2924 }
2925 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2926 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2927 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2928 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2929 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2930 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
2931 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2932
2933 /* Reno is always built in */
2934 if (!net_eq(net, &init_net) &&
2935 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2936 init_net.ipv4.tcp_congestion_control->owner))
2937 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2938 else
2939 net->ipv4.tcp_congestion_control = &tcp_reno;
2940
2941 return 0;
2942 fail:
2943 tcp_sk_exit(net);
2944
2945 return res;
2946 }
2947
2948 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2949 {
2950 struct net *net;
2951
2952 inet_twsk_purge(&tcp_hashinfo, AF_INET);
2953
2954 list_for_each_entry(net, net_exit_list, exit_list)
2955 tcp_fastopen_ctx_destroy(net);
2956 }
2957
2958 static struct pernet_operations __net_initdata tcp_sk_ops = {
2959 .init = tcp_sk_init,
2960 .exit = tcp_sk_exit,
2961 .exit_batch = tcp_sk_exit_batch,
2962 };
2963
2964 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
2965 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
2966 struct sock_common *sk_common, uid_t uid)
2967
2968 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
2969 {
2970 struct tcp_iter_state *st = priv_data;
2971 struct tcp_seq_afinfo *afinfo;
2972 int ret;
2973
2974 afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
2975 if (!afinfo)
2976 return -ENOMEM;
2977
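/* AF_UNSPEC makes the BPF iterator walk sockets of every family found
 * in the shared TCP hash tables (both IPv4 and IPv6).
 */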
2978 afinfo->family = AF_UNSPEC;
2979 st->bpf_seq_afinfo = afinfo;
2980 ret = bpf_iter_init_seq_net(priv_data, aux);
2981 if (ret)
2982 kfree(afinfo);
2983 return ret;
2984 }
2985
2986 static void bpf_iter_fini_tcp(void *priv_data)
2987 {
2988 struct tcp_iter_state *st = priv_data;
2989
2990 kfree(st->bpf_seq_afinfo);
2991 bpf_iter_fini_seq_net(priv_data);
2992 }
2993
2994 static const struct bpf_iter_seq_info tcp_seq_info = {
2995 .seq_ops = &bpf_iter_tcp_seq_ops,
2996 .init_seq_private = bpf_iter_init_tcp,
2997 .fini_seq_private = bpf_iter_fini_tcp,
2998 .seq_priv_size = sizeof(struct tcp_iter_state),
2999 };
3000
3001 static struct bpf_iter_reg tcp_reg_info = {
3002 .target = "tcp",
3003 .ctx_arg_info_size = 1,
3004 .ctx_arg_info = {
3005 { offsetof(struct bpf_iter__tcp, sk_common),
3006 PTR_TO_BTF_ID_OR_NULL },
3007 },
3008 .seq_info = &tcp_seq_info,
3009 };
3010
3011 static void __init bpf_iter_register(void)
3012 {
3013 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3014 if (bpf_iter_reg_target(&tcp_reg_info))
3015 pr_warn("Warning: could not register bpf iterator tcp\n");
3016 }
3017
3018 #endif
3019
3020 void __init tcp_v4_init(void)
3021 {
3022 if (register_pernet_subsys(&tcp_sk_ops))
3023 panic("Failed to create the TCP control socket.\n");
3024
3025 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3026 bpf_iter_register();
3027 #endif
3028 }
3029