1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19 /*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
45 *					to a single port at the same time.
46 */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
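/* Per-cpu kernel control sockets (set up at init time elsewhere) used by
 * tcp_v4_send_reset() and tcp_v4_send_ack() below to emit RST/ACK replies
 * without a full socket context.
 */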
95
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98 return secure_tcp_seq(ip_hdr(skb)->daddr,
99 ip_hdr(skb)->saddr,
100 tcp_hdr(skb)->dest,
101 tcp_hdr(skb)->source);
102 }
103
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112 const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 struct tcp_sock *tp = tcp_sk(sk);
115
116 if (reuse == 2) {
117 /* Still does not detect *everything* that goes through
118 * lo, since we require a loopback src or dst address
119 * or direct binding to 'lo' interface.
120 */
121 bool loopback = false;
122 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123 loopback = true;
124 #if IS_ENABLED(CONFIG_IPV6)
125 if (tw->tw_family == AF_INET6) {
126 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130 loopback = true;
131 } else
132 #endif
133 {
134 if (ipv4_is_loopback(tw->tw_daddr) ||
135 ipv4_is_loopback(tw->tw_rcv_saddr))
136 loopback = true;
137 }
138 if (!loopback)
139 reuse = 0;
140 }
141
142 /* With PAWS, it is safe from the viewpoint
143 of data integrity. Even without PAWS it is safe provided sequence
144 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
145
146 Actually, the idea is close to VJ's: only the timestamp cache is
147 held not per host but per port pair, and the TW bucket is used as the
148 state holder.
149
150 If the TW bucket has already been destroyed we fall back to VJ's scheme
151 and use the initial timestamp retrieved from the peer table.
152 */
153 if (tcptw->tw_ts_recent_stamp &&
154 (!twp || (reuse && time_after32(ktime_get_seconds(),
155 tcptw->tw_ts_recent_stamp)))) {
156 /* In case of repair and re-using TIME-WAIT sockets we still
157 * want to be sure that it is safe as above but honor the
158 * sequence numbers and time stamps set as part of the repair
159 * process.
160 *
161 * Without this check re-using a TIME-WAIT socket with TCP
162 * repair would accumulate a -1 on the repair assigned
163 * sequence number. The first time it is reused the sequence
164 * is -1, the second time -2, etc. This fixes that issue
165 * without appearing to create any others.
166 */
167 if (likely(!tp->repair)) {
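/* Advance the new connection's sequence space past the old TIME-WAIT
 * incarnation's snd_nxt by more than one maximum window (65535), so
 * stray segments from the old incarnation cannot land inside the new
 * sequence space; a zero result is bumped to 1 below because a zero
 * write_seq is treated as "unset" in tcp_v4_connect().
 */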
168 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169
170 if (!seq)
171 seq = 1;
172 WRITE_ONCE(tp->write_seq, seq);
173 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
174 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175 }
176 sock_hold(sktw);
177 return 1;
178 }
179
180 return 0;
181 }
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
183
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185 int addr_len)
186 {
187 /* This check is replicated from tcp_v4_connect() and intended to
188 * prevent the BPF program called below from accessing bytes that are out
189 * of the bounds specified by the user in addr_len.
190 */
191 if (addr_len < sizeof(struct sockaddr_in))
192 return -EINVAL;
193
194 sock_owned_by_me(sk);
195
196 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 }
198
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201 {
202 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203 struct inet_sock *inet = inet_sk(sk);
204 struct tcp_sock *tp = tcp_sk(sk);
205 __be16 orig_sport, orig_dport;
206 __be32 daddr, nexthop;
207 struct flowi4 *fl4;
208 struct rtable *rt;
209 int err;
210 struct ip_options_rcu *inet_opt;
211 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
212
213 if (addr_len < sizeof(struct sockaddr_in))
214 return -EINVAL;
215
216 if (usin->sin_family != AF_INET)
217 return -EAFNOSUPPORT;
218
219 nexthop = daddr = usin->sin_addr.s_addr;
220 inet_opt = rcu_dereference_protected(inet->inet_opt,
221 lockdep_sock_is_held(sk));
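/* With an IP source route option (SRR), route towards the first hop
 * recorded in the option (faddr) rather than the final destination.
 */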
222 if (inet_opt && inet_opt->opt.srr) {
223 if (!daddr)
224 return -EINVAL;
225 nexthop = inet_opt->opt.faddr;
226 }
227
228 orig_sport = inet->inet_sport;
229 orig_dport = usin->sin_port;
230 fl4 = &inet->cork.fl.u.ip4;
231 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
232 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
233 IPPROTO_TCP,
234 orig_sport, orig_dport, sk);
235 if (IS_ERR(rt)) {
236 err = PTR_ERR(rt);
237 if (err == -ENETUNREACH)
238 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
239 return err;
240 }
241
242 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
243 ip_rt_put(rt);
244 return -ENETUNREACH;
245 }
246
247 if (!inet_opt || !inet_opt->opt.srr)
248 daddr = fl4->daddr;
249
250 if (!inet->inet_saddr)
251 inet->inet_saddr = fl4->saddr;
252 sk_rcv_saddr_set(sk, inet->inet_saddr);
253
254 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
255 /* Reset inherited state */
256 tp->rx_opt.ts_recent = 0;
257 tp->rx_opt.ts_recent_stamp = 0;
258 if (likely(!tp->repair))
259 WRITE_ONCE(tp->write_seq, 0);
260 }
261
262 inet->inet_dport = usin->sin_port;
263 sk_daddr_set(sk, daddr);
264
265 inet_csk(sk)->icsk_ext_hdr_len = 0;
266 if (inet_opt)
267 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
268
269 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
270
271 /* Socket identity is still unknown (sport may be zero).
272 * However we set the state to SYN-SENT and, without releasing the socket
273 * lock, select a source port, enter ourselves into the hash tables and
274 * complete initialization after this.
275 */
276 tcp_set_state(sk, TCP_SYN_SENT);
277 err = inet_hash_connect(tcp_death_row, sk);
278 if (err)
279 goto failure;
280
281 sk_set_txhash(sk);
282
283 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
284 inet->inet_sport, inet->inet_dport, sk);
285 if (IS_ERR(rt)) {
286 err = PTR_ERR(rt);
287 rt = NULL;
288 goto failure;
289 }
290 /* OK, now commit destination to socket. */
291 sk->sk_gso_type = SKB_GSO_TCPV4;
292 sk_setup_caps(sk, &rt->dst);
293 rt = NULL;
294
295 if (likely(!tp->repair)) {
296 if (!tp->write_seq)
297 WRITE_ONCE(tp->write_seq,
298 secure_tcp_seq(inet->inet_saddr,
299 inet->inet_daddr,
300 inet->inet_sport,
301 usin->sin_port));
302 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
303 inet->inet_saddr,
304 inet->inet_daddr);
305 }
306
307 inet->inet_id = prandom_u32();
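/* Seed the IPv4 ID counter for this flow with a random value rather than
 * something derived from the sequence numbers (presumably to keep IP IDs
 * unpredictable to off-path observers).
 */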
308
309 if (tcp_fastopen_defer_connect(sk, &err))
310 return err;
311 if (err)
312 goto failure;
313
314 err = tcp_connect(sk);
315
316 if (err)
317 goto failure;
318
319 return 0;
320
321 failure:
322 /*
323 * This unhashes the socket and releases the local port,
324 * if necessary.
325 */
326 tcp_set_state(sk, TCP_CLOSE);
327 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
328 inet_reset_saddr(sk);
329 ip_rt_put(rt);
330 sk->sk_route_caps = 0;
331 inet->inet_dport = 0;
332 return err;
333 }
334 EXPORT_SYMBOL(tcp_v4_connect);
335
336 /*
337 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
338 * It can be called through tcp_release_cb() if socket was owned by user
339 * at the time tcp_v4_err() was called to handle ICMP message.
340 */
341 void tcp_v4_mtu_reduced(struct sock *sk)
342 {
343 struct inet_sock *inet = inet_sk(sk);
344 struct dst_entry *dst;
345 u32 mtu;
346
347 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
348 return;
349 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
350 dst = inet_csk_update_pmtu(sk, mtu);
351 if (!dst)
352 return;
353
354 /* Something is about to be wrong... Remember soft error
355 * for the case where this connection is not able to recover.
356 */
357 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
358 sk->sk_err_soft = EMSGSIZE;
359
360 mtu = dst_mtu(dst);
361
362 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
363 ip_sk_accept_pmtu(sk) &&
364 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
365 tcp_sync_mss(sk, mtu);
366
367 /* Resend the TCP packet because it's
368 * clear that the old packet has been
369 * dropped. This is the new "fast" path mtu
370 * discovery.
371 */
372 tcp_simple_retransmit(sk);
373 } /* else let the usual retransmit timer handle it */
374 }
375 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
376
377 static void do_redirect(struct sk_buff *skb, struct sock *sk)
378 {
379 struct dst_entry *dst = __sk_dst_check(sk, 0);
380
381 if (dst)
382 dst->ops->redirect(dst, sk, skb);
383 }
384
385
386 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
387 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
388 {
389 struct request_sock *req = inet_reqsk(sk);
390 struct net *net = sock_net(sk);
391
392 /* ICMPs are not backlogged, hence we cannot get
393 * an established socket here.
394 */
395 if (seq != tcp_rsk(req)->snt_isn) {
396 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
397 } else if (abort) {
398 /*
399 * Still in SYN_RECV, just remove it silently.
400 * There is no good way to pass the error to the newly
401 * created socket, and POSIX does not want network
402 * errors returned from accept().
403 */
404 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
405 tcp_listendrop(req->rsk_listener);
406 }
407 reqsk_put(req);
408 }
409 EXPORT_SYMBOL(tcp_req_err);
410
411 /* TCP-LD (RFC 6069) logic */
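/* If an ICMP unreachable arrives for the earliest outstanding segment,
 * RFC 6069 (TCP-LD) suggests the retransmission timeouts were caused by a
 * connectivity disruption rather than congestion, so one step of the
 * exponential RTO backoff is undone and the timer re-armed (or the segment
 * retransmitted immediately if the reverted RTO has already expired).
 */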
412 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
413 {
414 struct inet_connection_sock *icsk = inet_csk(sk);
415 struct tcp_sock *tp = tcp_sk(sk);
416 struct sk_buff *skb;
417 s32 remaining;
418 u32 delta_us;
419
420 if (sock_owned_by_user(sk))
421 return;
422
423 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
424 !icsk->icsk_backoff)
425 return;
426
427 skb = tcp_rtx_queue_head(sk);
428 if (WARN_ON_ONCE(!skb))
429 return;
430
431 icsk->icsk_backoff--;
432 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
433 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
434
435 tcp_mstamp_refresh(tp);
436 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
437 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
438
439 if (remaining > 0) {
440 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
441 remaining, TCP_RTO_MAX);
442 } else {
443 /* RTO revert clocked out retransmission.
444 * Will retransmit now.
445 */
446 tcp_retransmit_timer(sk);
447 }
448 }
449 EXPORT_SYMBOL(tcp_ld_RTO_revert);
450
451 /*
452 * This routine is called by the ICMP module when it gets some
453 * sort of error condition. If err < 0 then the socket should
454 * be closed and the error returned to the user. If err > 0
455 * it's just the icmp type << 8 | icmp code. After adjustment
456 * header points to the first 8 bytes of the tcp header. We need
457 * to find the appropriate port.
458 *
459 * The locking strategy used here is very "optimistic". When
460 * someone else accesses the socket the ICMP is just dropped
461 * and for some paths there is no check at all.
462 * A more general error queue to queue errors for later handling
463 * is probably better.
464 *
465 */
466
467 int tcp_v4_err(struct sk_buff *skb, u32 info)
468 {
469 const struct iphdr *iph = (const struct iphdr *)skb->data;
470 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
471 struct tcp_sock *tp;
472 struct inet_sock *inet;
473 const int type = icmp_hdr(skb)->type;
474 const int code = icmp_hdr(skb)->code;
475 struct sock *sk;
476 struct request_sock *fastopen;
477 u32 seq, snd_una;
478 int err;
479 struct net *net = dev_net(skb->dev);
480
481 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
482 th->dest, iph->saddr, ntohs(th->source),
483 inet_iif(skb), 0);
484 if (!sk) {
485 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
486 return -ENOENT;
487 }
488 if (sk->sk_state == TCP_TIME_WAIT) {
489 inet_twsk_put(inet_twsk(sk));
490 return 0;
491 }
492 seq = ntohl(th->seq);
493 if (sk->sk_state == TCP_NEW_SYN_RECV) {
494 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
495 type == ICMP_TIME_EXCEEDED ||
496 (type == ICMP_DEST_UNREACH &&
497 (code == ICMP_NET_UNREACH ||
498 code == ICMP_HOST_UNREACH)));
499 return 0;
500 }
501
502 bh_lock_sock(sk);
503 /* If too many ICMPs get dropped on busy
504 * servers this needs to be solved differently.
505 * We do take care of PMTU discovery (RFC1191) special case :
506 * we can receive locally generated ICMP messages while socket is held.
507 */
508 if (sock_owned_by_user(sk)) {
509 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
510 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
511 }
512 if (sk->sk_state == TCP_CLOSE)
513 goto out;
514
515 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
516 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
517 goto out;
518 }
519
520 tp = tcp_sk(sk);
521 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
522 fastopen = rcu_dereference(tp->fastopen_rsk);
523 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
524 if (sk->sk_state != TCP_LISTEN &&
525 !between(seq, snd_una, tp->snd_nxt)) {
526 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
527 goto out;
528 }
529
530 switch (type) {
531 case ICMP_REDIRECT:
532 if (!sock_owned_by_user(sk))
533 do_redirect(skb, sk);
534 goto out;
535 case ICMP_SOURCE_QUENCH:
536 /* Just silently ignore these. */
537 goto out;
538 case ICMP_PARAMETERPROB:
539 err = EPROTO;
540 break;
541 case ICMP_DEST_UNREACH:
542 if (code > NR_ICMP_UNREACH)
543 goto out;
544
545 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
546 /* We are not interested in TCP_LISTEN and open_requests
547 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
548 * they should go through unfragmented).
549 */
550 if (sk->sk_state == TCP_LISTEN)
551 goto out;
552
553 WRITE_ONCE(tp->mtu_info, info);
554 if (!sock_owned_by_user(sk)) {
555 tcp_v4_mtu_reduced(sk);
556 } else {
557 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
558 sock_hold(sk);
559 }
560 goto out;
561 }
562
563 err = icmp_err_convert[code].errno;
564 /* check if this ICMP message allows revert of backoff.
565 * (see RFC 6069)
566 */
567 if (!fastopen &&
568 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
569 tcp_ld_RTO_revert(sk, seq);
570 break;
571 case ICMP_TIME_EXCEEDED:
572 err = EHOSTUNREACH;
573 break;
574 default:
575 goto out;
576 }
577
578 switch (sk->sk_state) {
579 case TCP_SYN_SENT:
580 case TCP_SYN_RECV:
581 /* Only in fast or simultaneous open. If a fast open socket is
582 * already accepted it is treated as a connected one below.
583 */
584 if (fastopen && !fastopen->sk)
585 break;
586
587 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
588
589 if (!sock_owned_by_user(sk)) {
590 sk->sk_err = err;
591
592 sk_error_report(sk);
593
594 tcp_done(sk);
595 } else {
596 sk->sk_err_soft = err;
597 }
598 goto out;
599 }
600
601 /* If we've already connected we will keep trying
602 * until we time out, or the user gives up.
603 *
604 * rfc1122 4.2.3.9 allows us to consider as hard errors
605 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
606 * but it is obsoleted by pmtu discovery).
607 *
608 * Note that in the modern internet, where routing is unreliable
609 * and in each dark corner broken firewalls sit, sending random
610 * errors ordered by their masters, even these two messages finally lose
611 * their original sense (even Linux sends invalid PORT_UNREACHs)
612 *
613 * Now we are in compliance with RFCs.
614 * --ANK (980905)
615 */
616
617 inet = inet_sk(sk);
618 if (!sock_owned_by_user(sk) && inet->recverr) {
619 sk->sk_err = err;
620 sk_error_report(sk);
621 } else { /* Only an error on timeout */
622 sk->sk_err_soft = err;
623 }
624
625 out:
626 bh_unlock_sock(sk);
627 sock_put(sk);
628 return 0;
629 }
630
631 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
632 {
633 struct tcphdr *th = tcp_hdr(skb);
634
635 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
636 skb->csum_start = skb_transport_header(skb) - skb->head;
637 skb->csum_offset = offsetof(struct tcphdr, check);
638 }
639
640 /* This routine computes an IPv4 TCP checksum. */
641 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
642 {
643 const struct inet_sock *inet = inet_sk(sk);
644
645 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
646 }
647 EXPORT_SYMBOL(tcp_v4_send_check);
648
649 /*
650 * This routine will send an RST to the other tcp.
651 *
652 * Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
653 * for a reset.
654 * Answer: if a packet caused a RST, it is not for a socket
655 * existing in our system; if it is matched to a socket,
656 * it is just a duplicate segment or a bug in the other side's TCP.
657 * So we build the reply based only on the parameters
658 * that arrived with the segment.
659 * Exception: precedence violation. We do not implement it in any case.
660 */
661
662 #ifdef CONFIG_TCP_MD5SIG
663 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
664 #else
665 #define OPTION_BYTES sizeof(__be32)
666 #endif
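/* The reply needs room either for an MD5 signature option (when MD5 is
 * compiled in) or for a single 32-bit option word, which is enough for the
 * MPTCP reset option filled in further below.
 */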
667
668 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
669 {
670 const struct tcphdr *th = tcp_hdr(skb);
671 struct {
672 struct tcphdr th;
673 __be32 opt[OPTION_BYTES / sizeof(__be32)];
674 } rep;
675 struct ip_reply_arg arg;
676 #ifdef CONFIG_TCP_MD5SIG
677 struct tcp_md5sig_key *key = NULL;
678 const __u8 *hash_location = NULL;
679 unsigned char newhash[16];
680 int genhash;
681 struct sock *sk1 = NULL;
682 #endif
683 u64 transmit_time = 0;
684 struct sock *ctl_sk;
685 struct net *net;
686
687 /* Never send a reset in response to a reset. */
688 if (th->rst)
689 return;
690
691 /* If sk is not NULL, it means we did a successful lookup and the incoming
692 * route had to be correct. prequeue might have dropped our dst.
693 */
694 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
695 return;
696
697 /* Swap the send and the receive. */
698 memset(&rep, 0, sizeof(rep));
699 rep.th.dest = th->source;
700 rep.th.source = th->dest;
701 rep.th.doff = sizeof(struct tcphdr) / 4;
702 rep.th.rst = 1;
703
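/* Per RFC 793: if the incoming segment carried an ACK, the RST takes its
 * sequence number from that ACK; otherwise the RST acknowledges everything
 * the segment occupied (SYN and FIN each count for one).
 */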
704 if (th->ack) {
705 rep.th.seq = th->ack_seq;
706 } else {
707 rep.th.ack = 1;
708 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
709 skb->len - (th->doff << 2));
710 }
711
712 memset(&arg, 0, sizeof(arg));
713 arg.iov[0].iov_base = (unsigned char *)&rep;
714 arg.iov[0].iov_len = sizeof(rep.th);
715
716 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
717 #ifdef CONFIG_TCP_MD5SIG
718 rcu_read_lock();
719 hash_location = tcp_parse_md5sig_option(th);
720 if (sk && sk_fullsock(sk)) {
721 const union tcp_md5_addr *addr;
722 int l3index;
723
724 /* sdif set, means packet ingressed via a device
725 * in an L3 domain and inet_iif is set to it.
726 */
727 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
728 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
729 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
730 } else if (hash_location) {
731 const union tcp_md5_addr *addr;
732 int sdif = tcp_v4_sdif(skb);
733 int dif = inet_iif(skb);
734 int l3index;
735
736 /*
737 * The active side is lost. Try to find the listening socket through the
738 * source port, and then find the md5 key through the listening socket.
739 * We do not loosen security here:
740 * the incoming packet is checked against the md5 hash of the found key;
741 * no RST is generated if the md5 hash doesn't match.
742 */
743 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
744 ip_hdr(skb)->saddr,
745 th->source, ip_hdr(skb)->daddr,
746 ntohs(th->source), dif, sdif);
747 /* don't send rst if it can't find key */
748 if (!sk1)
749 goto out;
750
751 /* sdif set, means packet ingressed via a device
752 * in an L3 domain and dif is set to it.
753 */
754 l3index = sdif ? dif : 0;
755 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
756 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
757 if (!key)
758 goto out;
759
760
761 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
762 if (genhash || memcmp(hash_location, newhash, 16) != 0)
763 goto out;
764
765 }
766
767 if (key) {
768 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
769 (TCPOPT_NOP << 16) |
770 (TCPOPT_MD5SIG << 8) |
771 TCPOLEN_MD5SIG);
772 /* Update length and the length the header thinks exists */
773 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
774 rep.th.doff = arg.iov[0].iov_len / 4;
775
776 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
777 key, ip_hdr(skb)->saddr,
778 ip_hdr(skb)->daddr, &rep.th);
779 }
780 #endif
781 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
782 if (rep.opt[0] == 0) {
783 __be32 mrst = mptcp_reset_option(skb);
784
785 if (mrst) {
786 rep.opt[0] = mrst;
787 arg.iov[0].iov_len += sizeof(mrst);
788 rep.th.doff = arg.iov[0].iov_len / 4;
789 }
790 }
791
792 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
793 ip_hdr(skb)->saddr, /* XXX */
794 arg.iov[0].iov_len, IPPROTO_TCP, 0);
795 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
796 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
797
798 /* When the socket is gone, all binding information is lost and
799 * routing might fail in this case. No choice here: if we choose to force
800 * the input interface, we will misroute in case of an asymmetric route.
801 */
802 if (sk) {
803 arg.bound_dev_if = sk->sk_bound_dev_if;
804 if (sk_fullsock(sk))
805 trace_tcp_send_reset(sk, skb);
806 }
807
808 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
809 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
810
811 arg.tos = ip_hdr(skb)->tos;
812 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
813 local_bh_disable();
814 ctl_sk = this_cpu_read(ipv4_tcp_sk);
815 sock_net_set(ctl_sk, net);
816 if (sk) {
817 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
818 inet_twsk(sk)->tw_mark : sk->sk_mark;
819 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
820 inet_twsk(sk)->tw_priority : sk->sk_priority;
821 transmit_time = tcp_transmit_time(sk);
822 xfrm_sk_clone_policy(ctl_sk, sk);
823 } else {
824 ctl_sk->sk_mark = 0;
825 ctl_sk->sk_priority = 0;
826 }
827 ip_send_unicast_reply(ctl_sk,
828 skb, &TCP_SKB_CB(skb)->header.h4.opt,
829 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
830 &arg, arg.iov[0].iov_len,
831 transmit_time);
832
833 xfrm_sk_free_policy(ctl_sk);
834 sock_net_set(ctl_sk, &init_net);
835 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
836 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
837 local_bh_enable();
838
839 #ifdef CONFIG_TCP_MD5SIG
840 out:
841 rcu_read_unlock();
842 #endif
843 }
844
845 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
846 outside socket context, is certainly ugly. What can I do?
847 */
848
849 static void tcp_v4_send_ack(const struct sock *sk,
850 struct sk_buff *skb, u32 seq, u32 ack,
851 u32 win, u32 tsval, u32 tsecr, int oif,
852 struct tcp_md5sig_key *key,
853 int reply_flags, u8 tos)
854 {
855 const struct tcphdr *th = tcp_hdr(skb);
856 struct {
857 struct tcphdr th;
858 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
859 #ifdef CONFIG_TCP_MD5SIG
860 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
861 #endif
862 ];
863 } rep;
864 struct net *net = sock_net(sk);
865 struct ip_reply_arg arg;
866 struct sock *ctl_sk;
867 u64 transmit_time;
868
869 memset(&rep.th, 0, sizeof(struct tcphdr));
870 memset(&arg, 0, sizeof(arg));
871
872 arg.iov[0].iov_base = (unsigned char *)&rep;
873 arg.iov[0].iov_len = sizeof(rep.th);
874 if (tsecr) {
875 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
876 (TCPOPT_TIMESTAMP << 8) |
877 TCPOLEN_TIMESTAMP);
878 rep.opt[1] = htonl(tsval);
879 rep.opt[2] = htonl(tsecr);
880 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
881 }
882
883 /* Swap the send and the receive. */
884 rep.th.dest = th->source;
885 rep.th.source = th->dest;
886 rep.th.doff = arg.iov[0].iov_len / 4;
887 rep.th.seq = htonl(seq);
888 rep.th.ack_seq = htonl(ack);
889 rep.th.ack = 1;
890 rep.th.window = htons(win);
891
892 #ifdef CONFIG_TCP_MD5SIG
893 if (key) {
894 int offset = (tsecr) ? 3 : 0;
895
896 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
897 (TCPOPT_NOP << 16) |
898 (TCPOPT_MD5SIG << 8) |
899 TCPOLEN_MD5SIG);
900 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
901 rep.th.doff = arg.iov[0].iov_len/4;
902
903 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
904 key, ip_hdr(skb)->saddr,
905 ip_hdr(skb)->daddr, &rep.th);
906 }
907 #endif
908 arg.flags = reply_flags;
909 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
910 ip_hdr(skb)->saddr, /* XXX */
911 arg.iov[0].iov_len, IPPROTO_TCP, 0);
912 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
913 if (oif)
914 arg.bound_dev_if = oif;
915 arg.tos = tos;
916 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
917 local_bh_disable();
918 ctl_sk = this_cpu_read(ipv4_tcp_sk);
919 sock_net_set(ctl_sk, net);
920 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
921 inet_twsk(sk)->tw_mark : sk->sk_mark;
922 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
923 inet_twsk(sk)->tw_priority : sk->sk_priority;
924 transmit_time = tcp_transmit_time(sk);
925 ip_send_unicast_reply(ctl_sk,
926 skb, &TCP_SKB_CB(skb)->header.h4.opt,
927 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
928 &arg, arg.iov[0].iov_len,
929 transmit_time);
930
931 sock_net_set(ctl_sk, &init_net);
932 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
933 local_bh_enable();
934 }
935
936 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
937 {
938 struct inet_timewait_sock *tw = inet_twsk(sk);
939 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
940
941 tcp_v4_send_ack(sk, skb,
942 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
943 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
944 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
945 tcptw->tw_ts_recent,
946 tw->tw_bound_dev_if,
947 tcp_twsk_md5_key(tcptw),
948 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
949 tw->tw_tos
950 );
951
952 inet_twsk_put(tw);
953 }
954
955 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
956 struct request_sock *req)
957 {
958 const union tcp_md5_addr *addr;
959 int l3index;
960
961 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
962 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
963 */
964 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
965 tcp_sk(sk)->snd_nxt;
966
967 /* RFC 7323 2.3
968 * The window field (SEG.WND) of every outgoing segment, with the
969 * exception of <SYN> segments, MUST be right-shifted by
970 * Rcv.Wind.Shift bits:
971 */
972 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
973 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
974 tcp_v4_send_ack(sk, skb, seq,
975 tcp_rsk(req)->rcv_nxt,
976 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
977 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
978 READ_ONCE(req->ts_recent),
979 0,
980 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
981 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
982 ip_hdr(skb)->tos);
983 }
984
985 /*
986 * Send a SYN-ACK after having received a SYN.
987 * This still operates on a request_sock only, not on a big
988 * socket.
989 */
990 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
991 struct flowi *fl,
992 struct request_sock *req,
993 struct tcp_fastopen_cookie *foc,
994 enum tcp_synack_type synack_type,
995 struct sk_buff *syn_skb)
996 {
997 const struct inet_request_sock *ireq = inet_rsk(req);
998 struct flowi4 fl4;
999 int err = -1;
1000 struct sk_buff *skb;
1001 u8 tos;
1002
1003 /* First, grab a route. */
1004 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1005 return -1;
1006
1007 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1008
1009 if (skb) {
1010 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1011
1012 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1013 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1014 (inet_sk(sk)->tos & INET_ECN_MASK) :
1015 inet_sk(sk)->tos;
1016
1017 if (!INET_ECN_is_capable(tos) &&
1018 tcp_bpf_ca_needs_ecn((struct sock *)req))
1019 tos |= INET_ECN_ECT_0;
1020
1021 rcu_read_lock();
1022 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1023 ireq->ir_rmt_addr,
1024 rcu_dereference(ireq->ireq_opt),
1025 tos);
1026 rcu_read_unlock();
1027 err = net_xmit_eval(err);
1028 }
1029
1030 return err;
1031 }
1032
1033 /*
1034 * IPv4 request_sock destructor.
1035 */
1036 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1037 {
1038 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1039 }
1040
1041 #ifdef CONFIG_TCP_MD5SIG
1042 /*
1043 * RFC2385 MD5 checksumming requires a mapping of
1044 * IP address->MD5 Key.
1045 * We need to maintain these in the sk structure.
1046 */
1047
1048 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1049 EXPORT_SYMBOL(tcp_md5_needed);
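/* tcp_md5_needed is a static branch checked by the tcp_md5_do_lookup()
 * wrapper (in net/tcp.h) so that sockets never configured with MD5 keys pay
 * almost nothing on the fast path; it is flipped on elsewhere once a key is
 * installed.
 */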
1050
1051 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1052 {
1053 if (!old)
1054 return true;
1055
1056 /* l3index always overrides non-l3index */
1057 if (old->l3index && new->l3index == 0)
1058 return false;
1059 if (old->l3index == 0 && new->l3index)
1060 return true;
1061
1062 return old->prefixlen < new->prefixlen;
1063 }
1064
1065 /* Find the Key structure for an address. */
1066 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1067 const union tcp_md5_addr *addr,
1068 int family)
1069 {
1070 const struct tcp_sock *tp = tcp_sk(sk);
1071 struct tcp_md5sig_key *key;
1072 const struct tcp_md5sig_info *md5sig;
1073 __be32 mask;
1074 struct tcp_md5sig_key *best_match = NULL;
1075 bool match;
1076
1077 /* caller either holds rcu_read_lock() or socket lock */
1078 md5sig = rcu_dereference_check(tp->md5sig_info,
1079 lockdep_sock_is_held(sk));
1080 if (!md5sig)
1081 return NULL;
1082
1083 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1084 lockdep_sock_is_held(sk)) {
1085 if (key->family != family)
1086 continue;
1087 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1088 continue;
1089 if (family == AF_INET) {
1090 mask = inet_make_mask(key->prefixlen);
1091 match = (key->addr.a4.s_addr & mask) ==
1092 (addr->a4.s_addr & mask);
1093 #if IS_ENABLED(CONFIG_IPV6)
1094 } else if (family == AF_INET6) {
1095 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1096 key->prefixlen);
1097 #endif
1098 } else {
1099 match = false;
1100 }
1101
1102 if (match && better_md5_match(best_match, key))
1103 best_match = key;
1104 }
1105 return best_match;
1106 }
1107 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1108
1109 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1110 const union tcp_md5_addr *addr,
1111 int family, u8 prefixlen,
1112 int l3index, u8 flags)
1113 {
1114 const struct tcp_sock *tp = tcp_sk(sk);
1115 struct tcp_md5sig_key *key;
1116 unsigned int size = sizeof(struct in_addr);
1117 const struct tcp_md5sig_info *md5sig;
1118
1119 /* caller either holds rcu_read_lock() or socket lock */
1120 md5sig = rcu_dereference_check(tp->md5sig_info,
1121 lockdep_sock_is_held(sk));
1122 if (!md5sig)
1123 return NULL;
1124 #if IS_ENABLED(CONFIG_IPV6)
1125 if (family == AF_INET6)
1126 size = sizeof(struct in6_addr);
1127 #endif
1128 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1129 lockdep_sock_is_held(sk)) {
1130 if (key->family != family)
1131 continue;
1132 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1133 continue;
1134 if (key->l3index != l3index)
1135 continue;
1136 if (!memcmp(&key->addr, addr, size) &&
1137 key->prefixlen == prefixlen)
1138 return key;
1139 }
1140 return NULL;
1141 }
1142
1143 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1144 const struct sock *addr_sk)
1145 {
1146 const union tcp_md5_addr *addr;
1147 int l3index;
1148
1149 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1150 addr_sk->sk_bound_dev_if);
1151 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1152 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1153 }
1154 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1155
1156 /* This can be called on a newly created socket, from other files */
1157 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1158 int family, u8 prefixlen, int l3index, u8 flags,
1159 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1160 {
1161 /* Add Key to the list */
1162 struct tcp_md5sig_key *key;
1163 struct tcp_sock *tp = tcp_sk(sk);
1164 struct tcp_md5sig_info *md5sig;
1165
1166 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1167 if (key) {
1168 /* Pre-existing entry - just update that one.
1169 * Note that the key might be used concurrently.
1170 * data_race() is telling kcsan that we do not care about
1171 * key mismatches, since changing MD5 key on live flows
1172 * can lead to packet drops.
1173 */
1174 data_race(memcpy(key->key, newkey, newkeylen));
1175
1176 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1177 * Also note that a reader could catch new key->keylen value
1178 * but old key->key[], this is the reason we use __GFP_ZERO
1179 * at sock_kmalloc() time below these lines.
1180 */
1181 WRITE_ONCE(key->keylen, newkeylen);
1182
1183 return 0;
1184 }
1185
1186 md5sig = rcu_dereference_protected(tp->md5sig_info,
1187 lockdep_sock_is_held(sk));
1188 if (!md5sig) {
1189 md5sig = kmalloc(sizeof(*md5sig), gfp);
1190 if (!md5sig)
1191 return -ENOMEM;
1192
1193 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1194 INIT_HLIST_HEAD(&md5sig->head);
1195 rcu_assign_pointer(tp->md5sig_info, md5sig);
1196 }
1197
1198 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1199 if (!key)
1200 return -ENOMEM;
1201 if (!tcp_alloc_md5sig_pool()) {
1202 sock_kfree_s(sk, key, sizeof(*key));
1203 return -ENOMEM;
1204 }
1205
1206 memcpy(key->key, newkey, newkeylen);
1207 key->keylen = newkeylen;
1208 key->family = family;
1209 key->prefixlen = prefixlen;
1210 key->l3index = l3index;
1211 key->flags = flags;
1212 memcpy(&key->addr, addr,
1213 (family == AF_INET6) ? sizeof(struct in6_addr) :
1214 sizeof(struct in_addr));
1215 hlist_add_head_rcu(&key->node, &md5sig->head);
1216 return 0;
1217 }
1218 EXPORT_SYMBOL(tcp_md5_do_add);
1219
1220 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1221 u8 prefixlen, int l3index, u8 flags)
1222 {
1223 struct tcp_md5sig_key *key;
1224
1225 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1226 if (!key)
1227 return -ENOENT;
1228 hlist_del_rcu(&key->node);
1229 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1230 kfree_rcu(key, rcu);
1231 return 0;
1232 }
1233 EXPORT_SYMBOL(tcp_md5_do_del);
1234
1235 static void tcp_clear_md5_list(struct sock *sk)
1236 {
1237 struct tcp_sock *tp = tcp_sk(sk);
1238 struct tcp_md5sig_key *key;
1239 struct hlist_node *n;
1240 struct tcp_md5sig_info *md5sig;
1241
1242 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1243
1244 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1245 hlist_del_rcu(&key->node);
1246 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1247 kfree_rcu(key, rcu);
1248 }
1249 }
1250
1251 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1252 sockptr_t optval, int optlen)
1253 {
1254 struct tcp_md5sig cmd;
1255 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1256 const union tcp_md5_addr *addr;
1257 u8 prefixlen = 32;
1258 int l3index = 0;
1259 u8 flags;
1260
1261 if (optlen < sizeof(cmd))
1262 return -EINVAL;
1263
1264 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1265 return -EFAULT;
1266
1267 if (sin->sin_family != AF_INET)
1268 return -EINVAL;
1269
1270 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1271
1272 if (optname == TCP_MD5SIG_EXT &&
1273 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1274 prefixlen = cmd.tcpm_prefixlen;
1275 if (prefixlen > 32)
1276 return -EINVAL;
1277 }
1278
1279 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1280 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1281 struct net_device *dev;
1282
1283 rcu_read_lock();
1284 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1285 if (dev && netif_is_l3_master(dev))
1286 l3index = dev->ifindex;
1287
1288 rcu_read_unlock();
1289
1290 /* ok to reference set/not set outside of rcu;
1291 * right now device MUST be an L3 master
1292 */
1293 if (!dev || !l3index)
1294 return -EINVAL;
1295 }
1296
1297 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1298
1299 if (!cmd.tcpm_keylen)
1300 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1301
1302 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1303 return -EINVAL;
1304
1305 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1306 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1307 }
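/* Userspace sketch (not part of this file): a key for a peer is typically
 * installed with something like
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = len };
 *
 *	memcpy(&md5.tcpm_addr, &peer_sin, sizeof(peer_sin));
 *	memcpy(md5.tcpm_key, key_bytes, len);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * with TCP_MD5SIG_EXT plus tcpm_flags/tcpm_prefixlen/tcpm_ifindex used for
 * the prefix and ifindex variants handled above. peer_sin, key_bytes and
 * len are illustrative names only.
 */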
1308
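/* Feed the RFC 2385 TCP pseudo-header (saddr, daddr, zero pad, protocol,
 * segment length) followed by the TCP header with a zeroed checksum field
 * into the MD5 transform.
 */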
1309 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1310 __be32 daddr, __be32 saddr,
1311 const struct tcphdr *th, int nbytes)
1312 {
1313 struct tcp4_pseudohdr *bp;
1314 struct scatterlist sg;
1315 struct tcphdr *_th;
1316
1317 bp = hp->scratch;
1318 bp->saddr = saddr;
1319 bp->daddr = daddr;
1320 bp->pad = 0;
1321 bp->protocol = IPPROTO_TCP;
1322 bp->len = cpu_to_be16(nbytes);
1323
1324 _th = (struct tcphdr *)(bp + 1);
1325 memcpy(_th, th, sizeof(*th));
1326 _th->check = 0;
1327
1328 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1329 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1330 sizeof(*bp) + sizeof(*th));
1331 return crypto_ahash_update(hp->md5_req);
1332 }
1333
1334 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1335 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1336 {
1337 struct tcp_md5sig_pool *hp;
1338 struct ahash_request *req;
1339
1340 hp = tcp_get_md5sig_pool();
1341 if (!hp)
1342 goto clear_hash_noput;
1343 req = hp->md5_req;
1344
1345 if (crypto_ahash_init(req))
1346 goto clear_hash;
1347 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1348 goto clear_hash;
1349 if (tcp_md5_hash_key(hp, key))
1350 goto clear_hash;
1351 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1352 if (crypto_ahash_final(req))
1353 goto clear_hash;
1354
1355 tcp_put_md5sig_pool();
1356 return 0;
1357
1358 clear_hash:
1359 tcp_put_md5sig_pool();
1360 clear_hash_noput:
1361 memset(md5_hash, 0, 16);
1362 return 1;
1363 }
1364
1365 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1366 const struct sock *sk,
1367 const struct sk_buff *skb)
1368 {
1369 struct tcp_md5sig_pool *hp;
1370 struct ahash_request *req;
1371 const struct tcphdr *th = tcp_hdr(skb);
1372 __be32 saddr, daddr;
1373
1374 if (sk) { /* valid for establish/request sockets */
1375 saddr = sk->sk_rcv_saddr;
1376 daddr = sk->sk_daddr;
1377 } else {
1378 const struct iphdr *iph = ip_hdr(skb);
1379 saddr = iph->saddr;
1380 daddr = iph->daddr;
1381 }
1382
1383 hp = tcp_get_md5sig_pool();
1384 if (!hp)
1385 goto clear_hash_noput;
1386 req = hp->md5_req;
1387
1388 if (crypto_ahash_init(req))
1389 goto clear_hash;
1390
1391 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1392 goto clear_hash;
1393 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1394 goto clear_hash;
1395 if (tcp_md5_hash_key(hp, key))
1396 goto clear_hash;
1397 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1398 if (crypto_ahash_final(req))
1399 goto clear_hash;
1400
1401 tcp_put_md5sig_pool();
1402 return 0;
1403
1404 clear_hash:
1405 tcp_put_md5sig_pool();
1406 clear_hash_noput:
1407 memset(md5_hash, 0, 16);
1408 return 1;
1409 }
1410 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1411
1412 #endif
1413
1414 /* Called with rcu_read_lock() */
1415 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1416 const struct sk_buff *skb,
1417 int dif, int sdif)
1418 {
1419 #ifdef CONFIG_TCP_MD5SIG
1420 /*
1421 * This gets called for each TCP segment that arrives
1422 * so we want to be efficient.
1423 * We have 3 drop cases:
1424 * o No MD5 hash and one expected.
1425 * o MD5 hash and we're not expecting one.
1426 * o MD5 hash and it's wrong.
1427 */
1428 const __u8 *hash_location = NULL;
1429 struct tcp_md5sig_key *hash_expected;
1430 const struct iphdr *iph = ip_hdr(skb);
1431 const struct tcphdr *th = tcp_hdr(skb);
1432 const union tcp_md5_addr *addr;
1433 unsigned char newhash[16];
1434 int genhash, l3index;
1435
1436 /* sdif set, means packet ingressed via a device
1437 * in an L3 domain and dif is set to the l3mdev
1438 */
1439 l3index = sdif ? dif : 0;
1440
1441 addr = (union tcp_md5_addr *)&iph->saddr;
1442 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1443 hash_location = tcp_parse_md5sig_option(th);
1444
1445 /* We've parsed the options - do we have a hash? */
1446 if (!hash_expected && !hash_location)
1447 return false;
1448
1449 if (hash_expected && !hash_location) {
1450 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1451 return true;
1452 }
1453
1454 if (!hash_expected && hash_location) {
1455 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1456 return true;
1457 }
1458
1459 /* Okay, so this is hash_expected and hash_location -
1460 * so we need to calculate the checksum.
1461 */
1462 genhash = tcp_v4_md5_hash_skb(newhash,
1463 hash_expected,
1464 NULL, skb);
1465
1466 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1467 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1468 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1469 &iph->saddr, ntohs(th->source),
1470 &iph->daddr, ntohs(th->dest),
1471 genhash ? " tcp_v4_calc_md5_hash failed"
1472 : "", l3index);
1473 return true;
1474 }
1475 return false;
1476 #endif
1477 return false;
1478 }
1479
1480 static void tcp_v4_init_req(struct request_sock *req,
1481 const struct sock *sk_listener,
1482 struct sk_buff *skb)
1483 {
1484 struct inet_request_sock *ireq = inet_rsk(req);
1485 struct net *net = sock_net(sk_listener);
1486
1487 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1488 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1489 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1490 }
1491
1492 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1493 struct sk_buff *skb,
1494 struct flowi *fl,
1495 struct request_sock *req)
1496 {
1497 tcp_v4_init_req(req, sk, skb);
1498
1499 if (security_inet_conn_request(sk, skb, req))
1500 return NULL;
1501
1502 return inet_csk_route_req(sk, &fl->u.ip4, req);
1503 }
1504
1505 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1506 .family = PF_INET,
1507 .obj_size = sizeof(struct tcp_request_sock),
1508 .rtx_syn_ack = tcp_rtx_synack,
1509 .send_ack = tcp_v4_reqsk_send_ack,
1510 .destructor = tcp_v4_reqsk_destructor,
1511 .send_reset = tcp_v4_send_reset,
1512 .syn_ack_timeout = tcp_syn_ack_timeout,
1513 };
1514
1515 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1516 .mss_clamp = TCP_MSS_DEFAULT,
1517 #ifdef CONFIG_TCP_MD5SIG
1518 .req_md5_lookup = tcp_v4_md5_lookup,
1519 .calc_md5_hash = tcp_v4_md5_hash_skb,
1520 #endif
1521 #ifdef CONFIG_SYN_COOKIES
1522 .cookie_init_seq = cookie_v4_init_sequence,
1523 #endif
1524 .route_req = tcp_v4_route_req,
1525 .init_seq = tcp_v4_init_seq,
1526 .init_ts_off = tcp_v4_init_ts_off,
1527 .send_synack = tcp_v4_send_synack,
1528 };
1529
1530 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1531 {
1532 /* Never answer SYNs sent to broadcast or multicast */
1533 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1534 goto drop;
1535
1536 return tcp_conn_request(&tcp_request_sock_ops,
1537 &tcp_request_sock_ipv4_ops, sk, skb);
1538
1539 drop:
1540 tcp_listendrop(sk);
1541 return 0;
1542 }
1543 EXPORT_SYMBOL(tcp_v4_conn_request);
1544
1545
1546 /*
1547 * The three way handshake has completed - we got a valid synack -
1548 * now create the new socket.
1549 */
1550 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1551 struct request_sock *req,
1552 struct dst_entry *dst,
1553 struct request_sock *req_unhash,
1554 bool *own_req)
1555 {
1556 struct inet_request_sock *ireq;
1557 bool found_dup_sk = false;
1558 struct inet_sock *newinet;
1559 struct tcp_sock *newtp;
1560 struct sock *newsk;
1561 #ifdef CONFIG_TCP_MD5SIG
1562 const union tcp_md5_addr *addr;
1563 struct tcp_md5sig_key *key;
1564 int l3index;
1565 #endif
1566 struct ip_options_rcu *inet_opt;
1567
1568 if (sk_acceptq_is_full(sk))
1569 goto exit_overflow;
1570
1571 newsk = tcp_create_openreq_child(sk, req, skb);
1572 if (!newsk)
1573 goto exit_nonewsk;
1574
1575 newsk->sk_gso_type = SKB_GSO_TCPV4;
1576 inet_sk_rx_dst_set(newsk, skb);
1577
1578 newtp = tcp_sk(newsk);
1579 newinet = inet_sk(newsk);
1580 ireq = inet_rsk(req);
1581 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1582 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1583 newsk->sk_bound_dev_if = ireq->ir_iif;
1584 newinet->inet_saddr = ireq->ir_loc_addr;
1585 inet_opt = rcu_dereference(ireq->ireq_opt);
1586 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1587 newinet->mc_index = inet_iif(skb);
1588 newinet->mc_ttl = ip_hdr(skb)->ttl;
1589 newinet->rcv_tos = ip_hdr(skb)->tos;
1590 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1591 if (inet_opt)
1592 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1593 newinet->inet_id = prandom_u32();
1594
1595 /* Set ToS of the new socket based upon the value of incoming SYN.
1596 * ECT bits are set later in tcp_init_transfer().
1597 */
1598 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1599 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1600
1601 if (!dst) {
1602 dst = inet_csk_route_child_sock(sk, newsk, req);
1603 if (!dst)
1604 goto put_and_exit;
1605 } else {
1606 /* syncookie case : see end of cookie_v4_check() */
1607 }
1608 sk_setup_caps(newsk, dst);
1609
1610 tcp_ca_openreq_child(newsk, dst);
1611
1612 tcp_sync_mss(newsk, dst_mtu(dst));
1613 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1614
1615 tcp_initialize_rcv_mss(newsk);
1616
1617 #ifdef CONFIG_TCP_MD5SIG
1618 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1619 /* Copy over the MD5 key from the original socket */
1620 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1621 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1622 if (key) {
1623 /*
1624 * We're using one, so create a matching key
1625 * on the newsk structure. If we fail to get
1626 * memory, then we end up not copying the key
1627 * across. Shucks.
1628 */
1629 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1630 key->key, key->keylen, GFP_ATOMIC);
1631 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1632 }
1633 #endif
1634
1635 if (__inet_inherit_port(sk, newsk) < 0)
1636 goto put_and_exit;
1637 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1638 &found_dup_sk);
1639 if (likely(*own_req)) {
1640 tcp_move_syn(newtp, req);
1641 ireq->ireq_opt = NULL;
1642 } else {
1643 newinet->inet_opt = NULL;
1644
1645 if (!req_unhash && found_dup_sk) {
1646 /* This code path should only be executed in the
1647 * syncookie case
1648 */
1649 bh_unlock_sock(newsk);
1650 sock_put(newsk);
1651 newsk = NULL;
1652 }
1653 }
1654 return newsk;
1655
1656 exit_overflow:
1657 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1658 exit_nonewsk:
1659 dst_release(dst);
1660 exit:
1661 tcp_listendrop(sk);
1662 return NULL;
1663 put_and_exit:
1664 newinet->inet_opt = NULL;
1665 inet_csk_prepare_forced_close(newsk);
1666 tcp_done(newsk);
1667 goto exit;
1668 }
1669 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1670
1671 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1672 {
1673 #ifdef CONFIG_SYN_COOKIES
1674 const struct tcphdr *th = tcp_hdr(skb);
1675
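/* Under SYN-cookie operation, a non-SYN segment reaching a listener may be
 * the ACK that completes a cookie handshake, so let cookie_v4_check() have
 * a look at it.
 */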
1676 if (!th->syn)
1677 sk = cookie_v4_check(sk, skb);
1678 #endif
1679 return sk;
1680 }
1681
1682 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1683 struct tcphdr *th, u32 *cookie)
1684 {
1685 u16 mss = 0;
1686 #ifdef CONFIG_SYN_COOKIES
1687 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1688 &tcp_request_sock_ipv4_ops, sk, th);
1689 if (mss) {
1690 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1691 tcp_synq_overflow(sk);
1692 }
1693 #endif
1694 return mss;
1695 }
1696
1697 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1698 u32));
1699 /* The socket must have its spinlock held when we get
1700 * here, unless it is a TCP_LISTEN socket.
1701 *
1702 * We have a potential double-lock case here, so even when
1703 * doing backlog processing we use the BH locking scheme.
1704 * This is because we cannot sleep with the original spinlock
1705 * held.
1706 */
1707 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1708 {
1709 struct sock *rsk;
1710
1711 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1712 struct dst_entry *dst;
1713
1714 dst = rcu_dereference_protected(sk->sk_rx_dst,
1715 lockdep_sock_is_held(sk));
1716
1717 sock_rps_save_rxhash(sk, skb);
1718 sk_mark_napi_id(sk, skb);
1719 if (dst) {
1720 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1721 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1722 dst, 0)) {
1723 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1724 dst_release(dst);
1725 }
1726 }
1727 tcp_rcv_established(sk, skb);
1728 return 0;
1729 }
1730
1731 if (tcp_checksum_complete(skb))
1732 goto csum_err;
1733
1734 if (sk->sk_state == TCP_LISTEN) {
1735 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1736
1737 if (!nsk)
1738 goto discard;
1739 if (nsk != sk) {
1740 if (tcp_child_process(sk, nsk, skb)) {
1741 rsk = nsk;
1742 goto reset;
1743 }
1744 return 0;
1745 }
1746 } else
1747 sock_rps_save_rxhash(sk, skb);
1748
1749 if (tcp_rcv_state_process(sk, skb)) {
1750 rsk = sk;
1751 goto reset;
1752 }
1753 return 0;
1754
1755 reset:
1756 tcp_v4_send_reset(rsk, skb);
1757 discard:
1758 kfree_skb(skb);
1759 /* Be careful here. If this function gets more complicated and
1760 * gcc suffers from register pressure on the x86, sk (in %ebx)
1761 * might be destroyed here. This current version compiles correctly,
1762 * but you have been warned.
1763 */
1764 return 0;
1765
1766 csum_err:
1767 trace_tcp_bad_csum(skb);
1768 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1769 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1770 goto discard;
1771 }
1772 EXPORT_SYMBOL(tcp_v4_do_rcv);
1773
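/* Early demux: while still in the IP receive path, look up an established
 * socket for this segment so that the socket and its cached dst can be
 * attached to the skb before full TCP receive processing runs.
 */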
1774 int tcp_v4_early_demux(struct sk_buff *skb)
1775 {
1776 const struct iphdr *iph;
1777 const struct tcphdr *th;
1778 struct sock *sk;
1779
1780 if (skb->pkt_type != PACKET_HOST)
1781 return 0;
1782
1783 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1784 return 0;
1785
1786 iph = ip_hdr(skb);
1787 th = tcp_hdr(skb);
1788
1789 if (th->doff < sizeof(struct tcphdr) / 4)
1790 return 0;
1791
1792 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1793 iph->saddr, th->source,
1794 iph->daddr, ntohs(th->dest),
1795 skb->skb_iif, inet_sdif(skb));
1796 if (sk) {
1797 skb->sk = sk;
1798 skb->destructor = sock_edemux;
1799 if (sk_fullsock(sk)) {
1800 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1801
1802 if (dst)
1803 dst = dst_check(dst, 0);
1804 if (dst &&
1805 sk->sk_rx_dst_ifindex == skb->skb_iif)
1806 skb_dst_set_noref(skb, dst);
1807 }
1808 }
1809 return 0;
1810 }
1811
1812 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1813 {
1814 u32 limit, tail_gso_size, tail_gso_segs;
1815 struct skb_shared_info *shinfo;
1816 const struct tcphdr *th;
1817 struct tcphdr *thtail;
1818 struct sk_buff *tail;
1819 unsigned int hdrlen;
1820 bool fragstolen;
1821 u32 gso_segs;
1822 u32 gso_size;
1823 int delta;
1824
1825 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1826 * we can fix skb->truesize to its real value to avoid future drops.
1827 * This is valid because skb is not yet charged to the socket.
1828 * It has been noticed that pure SACK packets were sometimes dropped
1829 * (if cooked by drivers without the copybreak feature).
1830 */
1831 skb_condense(skb);
1832
1833 skb_dst_drop(skb);
1834
1835 if (unlikely(tcp_checksum_complete(skb))) {
1836 bh_unlock_sock(sk);
1837 trace_tcp_bad_csum(skb);
1838 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1839 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1840 return true;
1841 }
1842
1843 /* Attempt coalescing to last skb in backlog, even if we are
1844 * above the limits.
1845 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1846 */
1847 th = (const struct tcphdr *)skb->data;
1848 hdrlen = th->doff * 4;
1849
1850 tail = sk->sk_backlog.tail;
1851 if (!tail)
1852 goto no_coalesce;
1853 thtail = (struct tcphdr *)tail->data;
1854
1855 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1856 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1857 ((TCP_SKB_CB(tail)->tcp_flags |
1858 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1859 !((TCP_SKB_CB(tail)->tcp_flags &
1860 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1861 ((TCP_SKB_CB(tail)->tcp_flags ^
1862 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1863 #ifdef CONFIG_TLS_DEVICE
1864 tail->decrypted != skb->decrypted ||
1865 #endif
1866 !mptcp_skb_can_collapse(tail, skb) ||
1867 thtail->doff != th->doff ||
1868 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1869 goto no_coalesce;
1870
1871 __skb_pull(skb, hdrlen);
1872
1873 shinfo = skb_shinfo(skb);
1874 gso_size = shinfo->gso_size ?: skb->len;
1875 gso_segs = shinfo->gso_segs ?: 1;
1876
1877 shinfo = skb_shinfo(tail);
1878 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1879 tail_gso_segs = shinfo->gso_segs ?: 1;
1880
1881 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1882 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1883
1884 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1885 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1886 thtail->window = th->window;
1887 }
1888
1889 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1890 * thtail->fin, so that the fast path in tcp_rcv_established()
1891 * is not entered if we append a packet with a FIN.
1892 * SYN, RST, URG are not present.
1893 * ACK is set on both packets.
1894 * PSH : we do not really care in TCP stack,
1895 * at least for 'GRO' packets.
1896 */
1897 thtail->fin |= th->fin;
1898 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1899
1900 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1901 TCP_SKB_CB(tail)->has_rxtstamp = true;
1902 tail->tstamp = skb->tstamp;
1903 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1904 }
1905
1906 /* Not as strict as GRO. We only need to carry mss max value */
1907 shinfo->gso_size = max(gso_size, tail_gso_size);
1908 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1909
1910 sk->sk_backlog.len += delta;
1911 __NET_INC_STATS(sock_net(sk),
1912 LINUX_MIB_TCPBACKLOGCOALESCE);
1913 kfree_skb_partial(skb, fragstolen);
1914 return false;
1915 }
1916 __skb_push(skb, hdrlen);
1917
1918 no_coalesce:
1919 limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1920
1921 /* Only the socket owner can try to collapse/prune rx queues
1922 * to reduce memory overhead, so add a little headroom here.
1923 * Only a few socket backlogs are likely to be non-empty concurrently.
1924 */
1925 limit += 64 * 1024;
1926
1927 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1928 bh_unlock_sock(sk);
1929 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1930 return true;
1931 }
1932 return false;
1933 }
1934 EXPORT_SYMBOL(tcp_add_backlog);
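/* Worked example of the backlog limit above, with hypothetical values:
 * sk_rcvbuf = 131072 and sk_sndbuf = 16384 give
 *
 *	limit = 131072 + (16384 >> 1) + 64 * 1024 = 204800 bytes
 *
 * which roughly bounds the combined truesize of the receive queue and the
 * backlog before sk_add_backlog() refuses the packet and
 * LINUX_MIB_TCPBACKLOGDROP is incremented.
 */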
1935
1936 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1937 {
1938 struct tcphdr *th = (struct tcphdr *)skb->data;
1939
1940 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1941 }
1942 EXPORT_SYMBOL(tcp_filter);
1943
1944 static void tcp_v4_restore_cb(struct sk_buff *skb)
1945 {
1946 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1947 sizeof(struct inet_skb_parm));
1948 }
1949
1950 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1951 const struct tcphdr *th)
1952 {
1953 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1954 * barrier() makes sure the compiler won't play fool^W aliasing games.
1955 */
1956 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1957 sizeof(struct inet_skb_parm));
1958 barrier();
1959
1960 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1961 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1962 skb->len - th->doff * 4);
1963 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1964 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1965 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1966 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1967 TCP_SKB_CB(skb)->sacked = 0;
1968 TCP_SKB_CB(skb)->has_rxtstamp =
1969 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1970 }
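/* Worked example of the end_seq arithmetic above (hypothetical segment):
 * a data segment with seq = 1000 carrying 1448 bytes of payload and no
 * SYN/FIN gets end_seq = 1000 + 0 + 0 + 1448 = 2448, while a bare SYN with
 * seq = 1000 gets end_seq = 1001, because SYN consumes one sequence number.
 */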
1971
1972 /*
1973 * From tcp_input.c
1974 */
1975
1976 int tcp_v4_rcv(struct sk_buff *skb)
1977 {
1978 struct net *net = dev_net(skb->dev);
1979 struct sk_buff *skb_to_free;
1980 int sdif = inet_sdif(skb);
1981 int dif = inet_iif(skb);
1982 const struct iphdr *iph;
1983 const struct tcphdr *th;
1984 bool refcounted;
1985 struct sock *sk;
1986 int drop_reason;
1987 int ret;
1988
1989 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1990 if (skb->pkt_type != PACKET_HOST)
1991 goto discard_it;
1992
1993 /* Count it even if it's bad */
1994 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1995
1996 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1997 goto discard_it;
1998
1999 th = (const struct tcphdr *)skb->data;
2000
2001 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2002 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2003 goto bad_packet;
2004 }
2005 if (!pskb_may_pull(skb, th->doff * 4))
2006 goto discard_it;
2007
2008 /* An explanation is required here, I think.
2009 * Packet length and doff are validated by header prediction,
2010 * provided case of th->doff==0 is eliminated.
2011 * So, we defer the checks. */
2012
2013 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2014 goto csum_error;
2015
2016 th = (const struct tcphdr *)skb->data;
2017 iph = ip_hdr(skb);
2018 lookup:
2019 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
2020 th->dest, sdif, &refcounted);
2021 if (!sk)
2022 goto no_tcp_socket;
2023
2024 process:
2025 if (sk->sk_state == TCP_TIME_WAIT)
2026 goto do_time_wait;
2027
2028 if (sk->sk_state == TCP_NEW_SYN_RECV) {
2029 struct request_sock *req = inet_reqsk(sk);
2030 bool req_stolen = false;
2031 struct sock *nsk;
2032
2033 sk = req->rsk_listener;
2034 if (unlikely(!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb) ||
2035 tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
2036 sk_drops_add(sk, skb);
2037 reqsk_put(req);
2038 goto discard_it;
2039 }
2040 if (tcp_checksum_complete(skb)) {
2041 reqsk_put(req);
2042 goto csum_error;
2043 }
2044 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2045 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2046 if (!nsk) {
2047 inet_csk_reqsk_queue_drop_and_put(sk, req);
2048 goto lookup;
2049 }
2050 sk = nsk;
2051 /* reuseport_migrate_sock() has already held one sk_refcnt
2052 * before returning.
2053 */
2054 } else {
2055 /* We own a reference on the listener, increase it again
2056 * as we might lose it too soon.
2057 */
2058 sock_hold(sk);
2059 }
2060 refcounted = true;
2061 nsk = NULL;
2062 if (!tcp_filter(sk, skb)) {
2063 th = (const struct tcphdr *)skb->data;
2064 iph = ip_hdr(skb);
2065 tcp_v4_fill_cb(skb, iph, th);
2066 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2067 }
2068 if (!nsk) {
2069 reqsk_put(req);
2070 if (req_stolen) {
2071 /* Another cpu got exclusive access to req
2072 * and created a full blown socket.
2073 * Try to feed this packet to this socket
2074 * instead of discarding it.
2075 */
2076 tcp_v4_restore_cb(skb);
2077 sock_put(sk);
2078 goto lookup;
2079 }
2080 goto discard_and_relse;
2081 }
2082 nf_reset_ct(skb);
2083 if (nsk == sk) {
2084 reqsk_put(req);
2085 tcp_v4_restore_cb(skb);
2086 } else if (tcp_child_process(sk, nsk, skb)) {
2087 tcp_v4_send_reset(nsk, skb);
2088 goto discard_and_relse;
2089 } else {
2090 sock_put(sk);
2091 return 0;
2092 }
2093 }
2094 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2095 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2096 goto discard_and_relse;
2097 }
2098
2099 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2100 goto discard_and_relse;
2101
2102 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2103 goto discard_and_relse;
2104
2105 nf_reset_ct(skb);
2106
2107 if (tcp_filter(sk, skb)) {
2108 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2109 goto discard_and_relse;
2110 }
2111 th = (const struct tcphdr *)skb->data;
2112 iph = ip_hdr(skb);
2113 tcp_v4_fill_cb(skb, iph, th);
2114
2115 skb->dev = NULL;
2116
2117 if (sk->sk_state == TCP_LISTEN) {
2118 ret = tcp_v4_do_rcv(sk, skb);
2119 goto put_and_return;
2120 }
2121
2122 sk_incoming_cpu_update(sk);
2123
2124 bh_lock_sock_nested(sk);
2125 tcp_segs_in(tcp_sk(sk), skb);
2126 ret = 0;
2127 if (!sock_owned_by_user(sk)) {
2128 skb_to_free = sk->sk_rx_skb_cache;
2129 sk->sk_rx_skb_cache = NULL;
2130 ret = tcp_v4_do_rcv(sk, skb);
2131 } else {
2132 if (tcp_add_backlog(sk, skb))
2133 goto discard_and_relse;
2134 skb_to_free = NULL;
2135 }
2136 bh_unlock_sock(sk);
2137 if (skb_to_free)
2138 __kfree_skb(skb_to_free);
2139
2140 put_and_return:
2141 if (refcounted)
2142 sock_put(sk);
2143
2144 return ret;
2145
2146 no_tcp_socket:
2147 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2148 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2149 goto discard_it;
2150
2151 tcp_v4_fill_cb(skb, iph, th);
2152
2153 if (tcp_checksum_complete(skb)) {
2154 csum_error:
2155 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2156 trace_tcp_bad_csum(skb);
2157 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2158 bad_packet:
2159 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2160 } else {
2161 tcp_v4_send_reset(NULL, skb);
2162 }
2163
2164 discard_it:
2165 /* Discard frame. */
2166 kfree_skb_reason(skb, drop_reason);
2167 return 0;
2168
2169 discard_and_relse:
2170 sk_drops_add(sk, skb);
2171 if (refcounted)
2172 sock_put(sk);
2173 goto discard_it;
2174
2175 do_time_wait:
2176 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2177 inet_twsk_put(inet_twsk(sk));
2178 goto discard_it;
2179 }
2180
2181 tcp_v4_fill_cb(skb, iph, th);
2182
2183 if (tcp_checksum_complete(skb)) {
2184 inet_twsk_put(inet_twsk(sk));
2185 goto csum_error;
2186 }
2187 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2188 case TCP_TW_SYN: {
2189 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2190 &tcp_hashinfo, skb,
2191 __tcp_hdrlen(th),
2192 iph->saddr, th->source,
2193 iph->daddr, th->dest,
2194 inet_iif(skb),
2195 sdif);
2196 if (sk2) {
2197 inet_twsk_deschedule_put(inet_twsk(sk));
2198 sk = sk2;
2199 tcp_v4_restore_cb(skb);
2200 refcounted = false;
2201 goto process;
2202 }
2203 }
2204 /* to ACK */
2205 fallthrough;
2206 case TCP_TW_ACK:
2207 tcp_v4_timewait_ack(sk, skb);
2208 break;
2209 case TCP_TW_RST:
2210 tcp_v4_send_reset(sk, skb);
2211 inet_twsk_deschedule_put(inet_twsk(sk));
2212 goto discard_it;
2213 case TCP_TW_SUCCESS:;
2214 }
2215 goto discard_it;
2216 }
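/* For context: tcp_v4_rcv() is the protocol handler the IPv4 stack calls
 * for every TCP segment addressed to this host, roughly:
 *
 *	ip_rcv() -> ip_rcv_finish() -> ip_local_deliver()
 *		 -> ip_local_deliver_finish() -> tcp_v4_rcv()
 *
 * (a simplified sketch; GRO, raw sockets and netfilter hooks are omitted).
 */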
2217
2218 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2219 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2220 .twsk_unique = tcp_twsk_unique,
2221 .twsk_destructor= tcp_twsk_destructor,
2222 };
2223
2224 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2225 {
2226 struct dst_entry *dst = skb_dst(skb);
2227
2228 if (dst && dst_hold_safe(dst)) {
2229 rcu_assign_pointer(sk->sk_rx_dst, dst);
2230 sk->sk_rx_dst_ifindex = skb->skb_iif;
2231 }
2232 }
2233 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2234
2235 const struct inet_connection_sock_af_ops ipv4_specific = {
2236 .queue_xmit = ip_queue_xmit,
2237 .send_check = tcp_v4_send_check,
2238 .rebuild_header = inet_sk_rebuild_header,
2239 .sk_rx_dst_set = inet_sk_rx_dst_set,
2240 .conn_request = tcp_v4_conn_request,
2241 .syn_recv_sock = tcp_v4_syn_recv_sock,
2242 .net_header_len = sizeof(struct iphdr),
2243 .setsockopt = ip_setsockopt,
2244 .getsockopt = ip_getsockopt,
2245 .addr2sockaddr = inet_csk_addr2sockaddr,
2246 .sockaddr_len = sizeof(struct sockaddr_in),
2247 .mtu_reduced = tcp_v4_mtu_reduced,
2248 };
2249 EXPORT_SYMBOL(ipv4_specific);
2250
2251 #ifdef CONFIG_TCP_MD5SIG
2252 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2253 .md5_lookup = tcp_v4_md5_lookup,
2254 .calc_md5_hash = tcp_v4_md5_hash_skb,
2255 .md5_parse = tcp_v4_parse_md5_keys,
2256 };
2257 #endif
2258
2259 /* NOTE: A lot of things are set to zero explicitly by the call to
2260 * sk_alloc(), so they need not be done here.
2261 */
2262 static int tcp_v4_init_sock(struct sock *sk)
2263 {
2264 struct inet_connection_sock *icsk = inet_csk(sk);
2265
2266 tcp_init_sock(sk);
2267
2268 icsk->icsk_af_ops = &ipv4_specific;
2269
2270 #ifdef CONFIG_TCP_MD5SIG
2271 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2272 #endif
2273
2274 return 0;
2275 }
2276
2277 void tcp_v4_destroy_sock(struct sock *sk)
2278 {
2279 struct tcp_sock *tp = tcp_sk(sk);
2280
2281 trace_tcp_destroy_sock(sk);
2282
2283 tcp_clear_xmit_timers(sk);
2284
2285 tcp_cleanup_congestion_control(sk);
2286
2287 tcp_cleanup_ulp(sk);
2288
2289 /* Cleanup up the write buffer. */
2290 tcp_write_queue_purge(sk);
2291
2292 /* Check if we want to disable active TFO */
2293 tcp_fastopen_active_disable_ofo_check(sk);
2294
2295 /* Cleans up our, hopefully empty, out_of_order_queue. */
2296 skb_rbtree_purge(&tp->out_of_order_queue);
2297
2298 #ifdef CONFIG_TCP_MD5SIG
2299 /* Clean up the MD5 key list, if any */
2300 if (tp->md5sig_info) {
2301 tcp_clear_md5_list(sk);
2302 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2303 tp->md5sig_info = NULL;
2304 }
2305 #endif
2306
2307 /* Clean up a referenced TCP bind bucket. */
2308 if (inet_csk(sk)->icsk_bind_hash)
2309 inet_put_port(sk);
2310
2311 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2312
2313 /* If socket is aborted during connect operation */
2314 tcp_free_fastopen_req(tp);
2315 tcp_fastopen_destroy_cipher(sk);
2316 tcp_saved_syn_free(tp);
2317
2318 sk_sockets_allocated_dec(sk);
2319 }
2320 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2321
2322 #ifdef CONFIG_PROC_FS
2323 /* Proc filesystem TCP sock list dumping. */
2324
2325 static unsigned short seq_file_family(const struct seq_file *seq);
2326
2327 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2328 {
2329 unsigned short family = seq_file_family(seq);
2330
2331 /* AF_UNSPEC is used as a match all */
2332 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2333 net_eq(sock_net(sk), seq_file_net(seq)));
2334 }
2335
2336 /* Find a non empty bucket (starting from st->bucket)
2337 * and return the first sk from it.
2338 */
2339 static void *listening_get_first(struct seq_file *seq)
2340 {
2341 struct tcp_iter_state *st = seq->private;
2342
2343 st->offset = 0;
2344 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2345 struct inet_listen_hashbucket *ilb2;
2346 struct inet_connection_sock *icsk;
2347 struct sock *sk;
2348
2349 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2350 if (hlist_empty(&ilb2->head))
2351 continue;
2352
2353 spin_lock(&ilb2->lock);
2354 inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
2355 sk = (struct sock *)icsk;
2356 if (seq_sk_match(seq, sk))
2357 return sk;
2358 }
2359 spin_unlock(&ilb2->lock);
2360 }
2361
2362 return NULL;
2363 }
2364
2365 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2366 * If "cur" is the last one in the st->bucket,
2367 * call listening_get_first() to return the first sk of the next
2368 * non empty bucket.
2369 */
2370 static void *listening_get_next(struct seq_file *seq, void *cur)
2371 {
2372 struct tcp_iter_state *st = seq->private;
2373 struct inet_listen_hashbucket *ilb2;
2374 struct inet_connection_sock *icsk;
2375 struct sock *sk = cur;
2376
2377 ++st->num;
2378 ++st->offset;
2379
2380 icsk = inet_csk(sk);
2381 inet_lhash2_for_each_icsk_continue(icsk) {
2382 sk = (struct sock *)icsk;
2383 if (seq_sk_match(seq, sk))
2384 return sk;
2385 }
2386
2387 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2388 spin_unlock(&ilb2->lock);
2389 ++st->bucket;
2390 return listening_get_first(seq);
2391 }
2392
2393 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2394 {
2395 struct tcp_iter_state *st = seq->private;
2396 void *rc;
2397
2398 st->bucket = 0;
2399 st->offset = 0;
2400 rc = listening_get_first(seq);
2401
2402 while (rc && *pos) {
2403 rc = listening_get_next(seq, rc);
2404 --*pos;
2405 }
2406 return rc;
2407 }
2408
2409 static inline bool empty_bucket(const struct tcp_iter_state *st)
2410 {
2411 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2412 }
2413
2414 /*
2415 * Get first established socket starting from bucket given in st->bucket.
2416 * If st->bucket is zero, the very first socket in the hash is returned.
2417 */
2418 static void *established_get_first(struct seq_file *seq)
2419 {
2420 struct tcp_iter_state *st = seq->private;
2421
2422 st->offset = 0;
2423 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2424 struct sock *sk;
2425 struct hlist_nulls_node *node;
2426 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2427
2428 /* Lockless fast path for the common case of empty buckets */
2429 if (empty_bucket(st))
2430 continue;
2431
2432 spin_lock_bh(lock);
2433 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2434 if (seq_sk_match(seq, sk))
2435 return sk;
2436 }
2437 spin_unlock_bh(lock);
2438 }
2439
2440 return NULL;
2441 }
2442
2443 static void *established_get_next(struct seq_file *seq, void *cur)
2444 {
2445 struct sock *sk = cur;
2446 struct hlist_nulls_node *node;
2447 struct tcp_iter_state *st = seq->private;
2448
2449 ++st->num;
2450 ++st->offset;
2451
2452 sk = sk_nulls_next(sk);
2453
2454 sk_nulls_for_each_from(sk, node) {
2455 if (seq_sk_match(seq, sk))
2456 return sk;
2457 }
2458
2459 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2460 ++st->bucket;
2461 return established_get_first(seq);
2462 }
2463
2464 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2465 {
2466 struct tcp_iter_state *st = seq->private;
2467 void *rc;
2468
2469 st->bucket = 0;
2470 rc = established_get_first(seq);
2471
2472 while (rc && pos) {
2473 rc = established_get_next(seq, rc);
2474 --pos;
2475 }
2476 return rc;
2477 }
2478
2479 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2480 {
2481 void *rc;
2482 struct tcp_iter_state *st = seq->private;
2483
2484 st->state = TCP_SEQ_STATE_LISTENING;
2485 rc = listening_get_idx(seq, &pos);
2486
2487 if (!rc) {
2488 st->state = TCP_SEQ_STATE_ESTABLISHED;
2489 rc = established_get_idx(seq, pos);
2490 }
2491
2492 return rc;
2493 }
2494
2495 static void *tcp_seek_last_pos(struct seq_file *seq)
2496 {
2497 struct tcp_iter_state *st = seq->private;
2498 int bucket = st->bucket;
2499 int offset = st->offset;
2500 int orig_num = st->num;
2501 void *rc = NULL;
2502
2503 switch (st->state) {
2504 case TCP_SEQ_STATE_LISTENING:
2505 if (st->bucket > tcp_hashinfo.lhash2_mask)
2506 break;
2507 st->state = TCP_SEQ_STATE_LISTENING;
2508 rc = listening_get_first(seq);
2509 while (offset-- && rc && bucket == st->bucket)
2510 rc = listening_get_next(seq, rc);
2511 if (rc)
2512 break;
2513 st->bucket = 0;
2514 st->state = TCP_SEQ_STATE_ESTABLISHED;
2515 fallthrough;
2516 case TCP_SEQ_STATE_ESTABLISHED:
2517 if (st->bucket > tcp_hashinfo.ehash_mask)
2518 break;
2519 rc = established_get_first(seq);
2520 while (offset-- && rc && bucket == st->bucket)
2521 rc = established_get_next(seq, rc);
2522 }
2523
2524 st->num = orig_num;
2525
2526 return rc;
2527 }
2528
2529 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2530 {
2531 struct tcp_iter_state *st = seq->private;
2532 void *rc;
2533
2534 if (*pos && *pos == st->last_pos) {
2535 rc = tcp_seek_last_pos(seq);
2536 if (rc)
2537 goto out;
2538 }
2539
2540 st->state = TCP_SEQ_STATE_LISTENING;
2541 st->num = 0;
2542 st->bucket = 0;
2543 st->offset = 0;
2544 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2545
2546 out:
2547 st->last_pos = *pos;
2548 return rc;
2549 }
2550 EXPORT_SYMBOL(tcp_seq_start);
2551
2552 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2553 {
2554 struct tcp_iter_state *st = seq->private;
2555 void *rc = NULL;
2556
2557 if (v == SEQ_START_TOKEN) {
2558 rc = tcp_get_idx(seq, 0);
2559 goto out;
2560 }
2561
2562 switch (st->state) {
2563 case TCP_SEQ_STATE_LISTENING:
2564 rc = listening_get_next(seq, v);
2565 if (!rc) {
2566 st->state = TCP_SEQ_STATE_ESTABLISHED;
2567 st->bucket = 0;
2568 st->offset = 0;
2569 rc = established_get_first(seq);
2570 }
2571 break;
2572 case TCP_SEQ_STATE_ESTABLISHED:
2573 rc = established_get_next(seq, v);
2574 break;
2575 }
2576 out:
2577 ++*pos;
2578 st->last_pos = *pos;
2579 return rc;
2580 }
2581 EXPORT_SYMBOL(tcp_seq_next);
2582
2583 void tcp_seq_stop(struct seq_file *seq, void *v)
2584 {
2585 struct tcp_iter_state *st = seq->private;
2586
2587 switch (st->state) {
2588 case TCP_SEQ_STATE_LISTENING:
2589 if (v != SEQ_START_TOKEN)
2590 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2591 break;
2592 case TCP_SEQ_STATE_ESTABLISHED:
2593 if (v)
2594 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2595 break;
2596 }
2597 }
2598 EXPORT_SYMBOL(tcp_seq_stop);
2599
2600 static void get_openreq4(const struct request_sock *req,
2601 struct seq_file *f, int i)
2602 {
2603 const struct inet_request_sock *ireq = inet_rsk(req);
2604 long delta = req->rsk_timer.expires - jiffies;
2605
2606 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2607 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2608 i,
2609 ireq->ir_loc_addr,
2610 ireq->ir_num,
2611 ireq->ir_rmt_addr,
2612 ntohs(ireq->ir_rmt_port),
2613 TCP_SYN_RECV,
2614 0, 0, /* could print option size, but that is af dependent. */
2615 1, /* timers active (only the expire timer) */
2616 jiffies_delta_to_clock_t(delta),
2617 req->num_timeout,
2618 from_kuid_munged(seq_user_ns(f),
2619 sock_i_uid(req->rsk_listener)),
2620 0, /* non standard timer */
2621 0, /* open_requests have no inode */
2622 0,
2623 req);
2624 }
2625
2626 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2627 {
2628 int timer_active;
2629 unsigned long timer_expires;
2630 const struct tcp_sock *tp = tcp_sk(sk);
2631 const struct inet_connection_sock *icsk = inet_csk(sk);
2632 const struct inet_sock *inet = inet_sk(sk);
2633 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2634 __be32 dest = inet->inet_daddr;
2635 __be32 src = inet->inet_rcv_saddr;
2636 __u16 destp = ntohs(inet->inet_dport);
2637 __u16 srcp = ntohs(inet->inet_sport);
2638 int rx_queue;
2639 int state;
2640
2641 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2642 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2643 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2644 timer_active = 1;
2645 timer_expires = icsk->icsk_timeout;
2646 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2647 timer_active = 4;
2648 timer_expires = icsk->icsk_timeout;
2649 } else if (timer_pending(&sk->sk_timer)) {
2650 timer_active = 2;
2651 timer_expires = sk->sk_timer.expires;
2652 } else {
2653 timer_active = 0;
2654 timer_expires = jiffies;
2655 }
2656
2657 state = inet_sk_state_load(sk);
2658 if (state == TCP_LISTEN)
2659 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2660 else
2661 /* Because we don't lock the socket,
2662 * we might find a transient negative value.
2663 */
2664 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2665 READ_ONCE(tp->copied_seq), 0);
2666
2667 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2668 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2669 i, src, srcp, dest, destp, state,
2670 READ_ONCE(tp->write_seq) - tp->snd_una,
2671 rx_queue,
2672 timer_active,
2673 jiffies_delta_to_clock_t(timer_expires - jiffies),
2674 icsk->icsk_retransmits,
2675 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2676 icsk->icsk_probes_out,
2677 sock_i_ino(sk),
2678 refcount_read(&sk->sk_refcnt), sk,
2679 jiffies_to_clock_t(icsk->icsk_rto),
2680 jiffies_to_clock_t(icsk->icsk_ack.ato),
2681 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2682 tcp_snd_cwnd(tp),
2683 state == TCP_LISTEN ?
2684 fastopenq->max_qlen :
2685 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2686 }
2687
2688 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2689 struct seq_file *f, int i)
2690 {
2691 long delta = tw->tw_timer.expires - jiffies;
2692 __be32 dest, src;
2693 __u16 destp, srcp;
2694
2695 dest = tw->tw_daddr;
2696 src = tw->tw_rcv_saddr;
2697 destp = ntohs(tw->tw_dport);
2698 srcp = ntohs(tw->tw_sport);
2699
2700 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2701 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2702 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2703 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2704 refcount_read(&tw->tw_refcnt), tw);
2705 }
2706
2707 #define TMPSZ 150
2708
2709 static int tcp4_seq_show(struct seq_file *seq, void *v)
2710 {
2711 struct tcp_iter_state *st;
2712 struct sock *sk = v;
2713
2714 seq_setwidth(seq, TMPSZ - 1);
2715 if (v == SEQ_START_TOKEN) {
2716 seq_puts(seq, " sl local_address rem_address st tx_queue "
2717 "rx_queue tr tm->when retrnsmt uid timeout "
2718 "inode");
2719 goto out;
2720 }
2721 st = seq->private;
2722
2723 if (sk->sk_state == TCP_TIME_WAIT)
2724 get_timewait4_sock(v, seq, st->num);
2725 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2726 get_openreq4(v, seq, st->num);
2727 else
2728 get_tcp4_sock(v, seq, st->num);
2729 out:
2730 seq_pad(seq, '\n');
2731 return 0;
2732 }
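/* Illustrative /proc/net/tcp output produced by the show function above;
 * the addresses, ports and inode below are hypothetical. A socket
 * listening on 127.0.0.1:3306 would appear roughly as:
 *
 *   sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode
 *    0: 0100007F:0CEA 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000        0 12345
 *
 * where 0A is TCP_LISTEN and addresses/ports are printed as raw hex of the
 * on-wire values.
 */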
2733
2734 #ifdef CONFIG_BPF_SYSCALL
2735 struct bpf_tcp_iter_state {
2736 struct tcp_iter_state state;
2737 unsigned int cur_sk;
2738 unsigned int end_sk;
2739 unsigned int max_sk;
2740 struct sock **batch;
2741 bool st_bucket_done;
2742 };
2743
2744 struct bpf_iter__tcp {
2745 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2746 __bpf_md_ptr(struct sock_common *, sk_common);
2747 uid_t uid __aligned(8);
2748 };
2749
2750 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2751 struct sock_common *sk_common, uid_t uid)
2752 {
2753 struct bpf_iter__tcp ctx;
2754
2755 meta->seq_num--; /* skip SEQ_START_TOKEN */
2756 ctx.meta = meta;
2757 ctx.sk_common = sk_common;
2758 ctx.uid = uid;
2759 return bpf_iter_run_prog(prog, &ctx);
2760 }
2761
2762 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2763 {
2764 while (iter->cur_sk < iter->end_sk)
2765 sock_gen_put(iter->batch[iter->cur_sk++]);
2766 }
2767
2768 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2769 unsigned int new_batch_sz)
2770 {
2771 struct sock **new_batch;
2772
2773 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2774 GFP_USER | __GFP_NOWARN);
2775 if (!new_batch)
2776 return -ENOMEM;
2777
2778 bpf_iter_tcp_put_batch(iter);
2779 kvfree(iter->batch);
2780 iter->batch = new_batch;
2781 iter->max_sk = new_batch_sz;
2782
2783 return 0;
2784 }
2785
2786 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2787 struct sock *start_sk)
2788 {
2789 struct bpf_tcp_iter_state *iter = seq->private;
2790 struct tcp_iter_state *st = &iter->state;
2791 struct inet_connection_sock *icsk;
2792 unsigned int expected = 1;
2793 struct sock *sk;
2794
2795 sock_hold(start_sk);
2796 iter->batch[iter->end_sk++] = start_sk;
2797
2798 icsk = inet_csk(start_sk);
2799 inet_lhash2_for_each_icsk_continue(icsk) {
2800 sk = (struct sock *)icsk;
2801 if (seq_sk_match(seq, sk)) {
2802 if (iter->end_sk < iter->max_sk) {
2803 sock_hold(sk);
2804 iter->batch[iter->end_sk++] = sk;
2805 }
2806 expected++;
2807 }
2808 }
2809 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2810
2811 return expected;
2812 }
2813
2814 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2815 struct sock *start_sk)
2816 {
2817 struct bpf_tcp_iter_state *iter = seq->private;
2818 struct tcp_iter_state *st = &iter->state;
2819 struct hlist_nulls_node *node;
2820 unsigned int expected = 1;
2821 struct sock *sk;
2822
2823 sock_hold(start_sk);
2824 iter->batch[iter->end_sk++] = start_sk;
2825
2826 sk = sk_nulls_next(start_sk);
2827 sk_nulls_for_each_from(sk, node) {
2828 if (seq_sk_match(seq, sk)) {
2829 if (iter->end_sk < iter->max_sk) {
2830 sock_hold(sk);
2831 iter->batch[iter->end_sk++] = sk;
2832 }
2833 expected++;
2834 }
2835 }
2836 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2837
2838 return expected;
2839 }
2840
2841 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2842 {
2843 struct bpf_tcp_iter_state *iter = seq->private;
2844 struct tcp_iter_state *st = &iter->state;
2845 unsigned int expected;
2846 bool resized = false;
2847 struct sock *sk;
2848
2849 /* The st->bucket is done. Advance directly to the next
2850 * bucket instead of having tcp_seek_last_pos() skip sockets
2851 * one by one in the current bucket, only to find out that
2852 * it has to advance to the next bucket anyway.
2853 */
2854 if (iter->st_bucket_done) {
2855 st->offset = 0;
2856 st->bucket++;
2857 if (st->state == TCP_SEQ_STATE_LISTENING &&
2858 st->bucket > tcp_hashinfo.lhash2_mask) {
2859 st->state = TCP_SEQ_STATE_ESTABLISHED;
2860 st->bucket = 0;
2861 }
2862 }
2863
2864 again:
2865 /* Get a new batch */
2866 iter->cur_sk = 0;
2867 iter->end_sk = 0;
2868 iter->st_bucket_done = false;
2869
2870 sk = tcp_seek_last_pos(seq);
2871 if (!sk)
2872 return NULL; /* Done */
2873
2874 if (st->state == TCP_SEQ_STATE_LISTENING)
2875 expected = bpf_iter_tcp_listening_batch(seq, sk);
2876 else
2877 expected = bpf_iter_tcp_established_batch(seq, sk);
2878
2879 if (iter->end_sk == expected) {
2880 iter->st_bucket_done = true;
2881 return sk;
2882 }
2883
2884 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2885 resized = true;
2886 goto again;
2887 }
2888
2889 return sk;
2890 }
2891
2892 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2893 {
2894 /* bpf iter does not support lseek, so it always
2895 * continues from where it was stop()-ped.
2896 */
2897 if (*pos)
2898 return bpf_iter_tcp_batch(seq);
2899
2900 return SEQ_START_TOKEN;
2901 }
2902
2903 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2904 {
2905 struct bpf_tcp_iter_state *iter = seq->private;
2906 struct tcp_iter_state *st = &iter->state;
2907 struct sock *sk;
2908
2909 /* Whenever seq_next() is called, the iter->cur_sk is
2910 * done with seq_show(), so advance to the next sk in
2911 * the batch.
2912 */
2913 if (iter->cur_sk < iter->end_sk) {
2914 /* Keeping st->num consistent in tcp_iter_state.
2915 * bpf_iter_tcp does not use st->num.
2916 * meta.seq_num is used instead.
2917 */
2918 st->num++;
2919 /* Move st->offset to the next sk in the bucket such that
2920 * the future start() will resume at st->offset in
2921 * st->bucket. See tcp_seek_last_pos().
2922 */
2923 st->offset++;
2924 sock_gen_put(iter->batch[iter->cur_sk++]);
2925 }
2926
2927 if (iter->cur_sk < iter->end_sk)
2928 sk = iter->batch[iter->cur_sk];
2929 else
2930 sk = bpf_iter_tcp_batch(seq);
2931
2932 ++*pos;
2933 /* Keeping st->last_pos consistent in tcp_iter_state.
2934 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2935 */
2936 st->last_pos = *pos;
2937 return sk;
2938 }
2939
2940 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2941 {
2942 struct bpf_iter_meta meta;
2943 struct bpf_prog *prog;
2944 struct sock *sk = v;
2945 bool slow;
2946 uid_t uid;
2947 int ret;
2948
2949 if (v == SEQ_START_TOKEN)
2950 return 0;
2951
2952 if (sk_fullsock(sk))
2953 slow = lock_sock_fast(sk);
2954
2955 if (unlikely(sk_unhashed(sk))) {
2956 ret = SEQ_SKIP;
2957 goto unlock;
2958 }
2959
2960 if (sk->sk_state == TCP_TIME_WAIT) {
2961 uid = 0;
2962 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2963 const struct request_sock *req = v;
2964
2965 uid = from_kuid_munged(seq_user_ns(seq),
2966 sock_i_uid(req->rsk_listener));
2967 } else {
2968 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2969 }
2970
2971 meta.seq = seq;
2972 prog = bpf_iter_get_info(&meta, false);
2973 ret = tcp_prog_seq_show(prog, &meta, v, uid);
2974
2975 unlock:
2976 if (sk_fullsock(sk))
2977 unlock_sock_fast(sk, slow);
2978 return ret;
2979
2980 }
2981
2982 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2983 {
2984 struct bpf_tcp_iter_state *iter = seq->private;
2985 struct bpf_iter_meta meta;
2986 struct bpf_prog *prog;
2987
2988 if (!v) {
2989 meta.seq = seq;
2990 prog = bpf_iter_get_info(&meta, true);
2991 if (prog)
2992 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2993 }
2994
2995 if (iter->cur_sk < iter->end_sk) {
2996 bpf_iter_tcp_put_batch(iter);
2997 iter->st_bucket_done = false;
2998 }
2999 }
3000
3001 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3002 .show = bpf_iter_tcp_seq_show,
3003 .start = bpf_iter_tcp_seq_start,
3004 .next = bpf_iter_tcp_seq_next,
3005 .stop = bpf_iter_tcp_seq_stop,
3006 };
3007 #endif
3008 static unsigned short seq_file_family(const struct seq_file *seq)
3009 {
3010 const struct tcp_seq_afinfo *afinfo;
3011
3012 #ifdef CONFIG_BPF_SYSCALL
3013 /* Iterated from bpf_iter. Let the bpf prog filter instead. */
3014 if (seq->op == &bpf_iter_tcp_seq_ops)
3015 return AF_UNSPEC;
3016 #endif
3017
3018 /* Iterated from proc fs */
3019 afinfo = PDE_DATA(file_inode(seq->file));
3020 return afinfo->family;
3021 }
3022
3023 static const struct seq_operations tcp4_seq_ops = {
3024 .show = tcp4_seq_show,
3025 .start = tcp_seq_start,
3026 .next = tcp_seq_next,
3027 .stop = tcp_seq_stop,
3028 };
3029
3030 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3031 .family = AF_INET,
3032 };
3033
3034 static int __net_init tcp4_proc_init_net(struct net *net)
3035 {
3036 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3037 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3038 return -ENOMEM;
3039 return 0;
3040 }
3041
3042 static void __net_exit tcp4_proc_exit_net(struct net *net)
3043 {
3044 remove_proc_entry("tcp", net->proc_net);
3045 }
3046
3047 static struct pernet_operations tcp4_net_ops = {
3048 .init = tcp4_proc_init_net,
3049 .exit = tcp4_proc_exit_net,
3050 };
3051
3052 int __init tcp4_proc_init(void)
3053 {
3054 return register_pernet_subsys(&tcp4_net_ops);
3055 }
3056
3057 void tcp4_proc_exit(void)
3058 {
3059 unregister_pernet_subsys(&tcp4_net_ops);
3060 }
3061 #endif /* CONFIG_PROC_FS */
3062
3063 /* @wake is one when sk_stream_write_space() calls us.
3064 * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3065 * This mimics the strategy used in sock_def_write_space().
3066 */
3067 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3068 {
3069 const struct tcp_sock *tp = tcp_sk(sk);
3070 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3071 READ_ONCE(tp->snd_nxt);
3072
3073 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3074 }
3075 EXPORT_SYMBOL(tcp_stream_memory_free);
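/* Worked example of the check above, with a hypothetical
 * TCP_NOTSENT_LOWAT of 131072 bytes:
 * - wake == 0 (regular poll): the stream counts as writable while
 *   notsent_bytes < 131072.
 * - wake == 1 (called from sk_stream_write_space()): EPOLLOUT is only
 *   signalled once notsent_bytes has dropped below 65536, because the
 *   value is doubled by the shift before the comparison.
 */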
3076
3077 struct proto tcp_prot = {
3078 .name = "TCP",
3079 .owner = THIS_MODULE,
3080 .close = tcp_close,
3081 .pre_connect = tcp_v4_pre_connect,
3082 .connect = tcp_v4_connect,
3083 .disconnect = tcp_disconnect,
3084 .accept = inet_csk_accept,
3085 .ioctl = tcp_ioctl,
3086 .init = tcp_v4_init_sock,
3087 .destroy = tcp_v4_destroy_sock,
3088 .shutdown = tcp_shutdown,
3089 .setsockopt = tcp_setsockopt,
3090 .getsockopt = tcp_getsockopt,
3091 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3092 .keepalive = tcp_set_keepalive,
3093 .recvmsg = tcp_recvmsg,
3094 .sendmsg = tcp_sendmsg,
3095 .sendpage = tcp_sendpage,
3096 .backlog_rcv = tcp_v4_do_rcv,
3097 .release_cb = tcp_release_cb,
3098 .hash = inet_hash,
3099 .unhash = inet_unhash,
3100 .get_port = inet_csk_get_port,
3101 #ifdef CONFIG_BPF_SYSCALL
3102 .psock_update_sk_prot = tcp_bpf_update_proto,
3103 #endif
3104 .enter_memory_pressure = tcp_enter_memory_pressure,
3105 .leave_memory_pressure = tcp_leave_memory_pressure,
3106 .stream_memory_free = tcp_stream_memory_free,
3107 .sockets_allocated = &tcp_sockets_allocated,
3108 .orphan_count = &tcp_orphan_count,
3109 .memory_allocated = &tcp_memory_allocated,
3110 .memory_pressure = &tcp_memory_pressure,
3111 .sysctl_mem = sysctl_tcp_mem,
3112 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3113 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3114 .max_header = MAX_TCP_HEADER,
3115 .obj_size = sizeof(struct tcp_sock),
3116 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3117 .twsk_prot = &tcp_timewait_sock_ops,
3118 .rsk_prot = &tcp_request_sock_ops,
3119 .h.hashinfo = &tcp_hashinfo,
3120 .no_autobind = true,
3121 .diag_destroy = tcp_abort,
3122 };
3123 EXPORT_SYMBOL(tcp_prot);
3124
3125 static void __net_exit tcp_sk_exit(struct net *net)
3126 {
3127 if (net->ipv4.tcp_congestion_control)
3128 bpf_module_put(net->ipv4.tcp_congestion_control,
3129 net->ipv4.tcp_congestion_control->owner);
3130 }
3131
3132 static int __net_init tcp_sk_init(struct net *net)
3133 {
3134 int cnt;
3135
3136 net->ipv4.sysctl_tcp_ecn = 2;
3137 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3138
3139 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3140 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3141 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3142 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3143 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3144
3145 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3146 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3147 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3148
3149 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3150 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3151 net->ipv4.sysctl_tcp_syncookies = 1;
3152 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3153 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3154 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3155 net->ipv4.sysctl_tcp_orphan_retries = 0;
3156 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3157 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3158 net->ipv4.sysctl_tcp_tw_reuse = 2;
3159 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3160
3161 cnt = tcp_hashinfo.ehash_mask + 1;
3162 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3163 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
3164
3165 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3166 net->ipv4.sysctl_tcp_sack = 1;
3167 net->ipv4.sysctl_tcp_window_scaling = 1;
3168 net->ipv4.sysctl_tcp_timestamps = 1;
3169 net->ipv4.sysctl_tcp_early_retrans = 3;
3170 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3171 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3172 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3173 net->ipv4.sysctl_tcp_max_reordering = 300;
3174 net->ipv4.sysctl_tcp_dsack = 1;
3175 net->ipv4.sysctl_tcp_app_win = 31;
3176 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3177 net->ipv4.sysctl_tcp_frto = 2;
3178 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3179 /* This limits the percentage of the congestion window which we
3180 * will allow a single TSO frame to consume. Building TSO frames
3181 * which are too large can cause TCP streams to be bursty.
3182 */
3183 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3184 /* Default TSQ limit of 16 TSO segments */
3185 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3186 /* rfc5961 challenge ack rate limiting */
3187 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3188 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3189 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3190 net->ipv4.sysctl_tcp_autocorking = 1;
3191 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3192 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3193 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3194 if (net != &init_net) {
3195 memcpy(net->ipv4.sysctl_tcp_rmem,
3196 init_net.ipv4.sysctl_tcp_rmem,
3197 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3198 memcpy(net->ipv4.sysctl_tcp_wmem,
3199 init_net.ipv4.sysctl_tcp_wmem,
3200 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3201 }
3202 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3203 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3204 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3205 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3206 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3207 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3208
3209 /* Reno is always built in */
3210 if (!net_eq(net, &init_net) &&
3211 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3212 init_net.ipv4.tcp_congestion_control->owner))
3213 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3214 else
3215 net->ipv4.tcp_congestion_control = &tcp_reno;
3216
3217 return 0;
3218 }
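/* All of the per-netns defaults above are exposed as sysctls under
 * /proc/sys/net/ipv4/ (e.g. net.ipv4.tcp_syncookies, net.ipv4.tcp_rmem),
 * so an administrator can override them at runtime, for instance with
 * "sysctl -w net.ipv4.tcp_fastopen=3" inside a given network namespace.
 */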
3219
3220 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3221 {
3222 struct net *net;
3223
3224 inet_twsk_purge(&tcp_hashinfo, AF_INET);
3225
3226 list_for_each_entry(net, net_exit_list, exit_list)
3227 tcp_fastopen_ctx_destroy(net);
3228 }
3229
3230 static struct pernet_operations __net_initdata tcp_sk_ops = {
3231 .init = tcp_sk_init,
3232 .exit = tcp_sk_exit,
3233 .exit_batch = tcp_sk_exit_batch,
3234 };
3235
3236 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3237 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3238 struct sock_common *sk_common, uid_t uid)
3239
3240 #define INIT_BATCH_SZ 16
3241
3242 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3243 {
3244 struct bpf_tcp_iter_state *iter = priv_data;
3245 int err;
3246
3247 err = bpf_iter_init_seq_net(priv_data, aux);
3248 if (err)
3249 return err;
3250
3251 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3252 if (err) {
3253 bpf_iter_fini_seq_net(priv_data);
3254 return err;
3255 }
3256
3257 return 0;
3258 }
3259
3260 static void bpf_iter_fini_tcp(void *priv_data)
3261 {
3262 struct bpf_tcp_iter_state *iter = priv_data;
3263
3264 bpf_iter_fini_seq_net(priv_data);
3265 kvfree(iter->batch);
3266 }
3267
3268 static const struct bpf_iter_seq_info tcp_seq_info = {
3269 .seq_ops = &bpf_iter_tcp_seq_ops,
3270 .init_seq_private = bpf_iter_init_tcp,
3271 .fini_seq_private = bpf_iter_fini_tcp,
3272 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3273 };
3274
3275 static const struct bpf_func_proto *
3276 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3277 const struct bpf_prog *prog)
3278 {
3279 switch (func_id) {
3280 case BPF_FUNC_setsockopt:
3281 return &bpf_sk_setsockopt_proto;
3282 case BPF_FUNC_getsockopt:
3283 return &bpf_sk_getsockopt_proto;
3284 default:
3285 return NULL;
3286 }
3287 }
3288
3289 static struct bpf_iter_reg tcp_reg_info = {
3290 .target = "tcp",
3291 .ctx_arg_info_size = 1,
3292 .ctx_arg_info = {
3293 { offsetof(struct bpf_iter__tcp, sk_common),
3294 PTR_TO_BTF_ID_OR_NULL },
3295 },
3296 .get_func_proto = bpf_iter_tcp_get_func_proto,
3297 .seq_info = &tcp_seq_info,
3298 };
3299
3300 static void __init bpf_iter_register(void)
3301 {
3302 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3303 if (bpf_iter_reg_target(&tcp_reg_info))
3304 pr_warn("Warning: could not register bpf iterator tcp\n");
3305 }
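/* A minimal BPF program that could attach to the "tcp" iterator registered
 * above; this is a sketch modelled on the bpf selftests and assumes a
 * libbpf build environment with vmlinux.h, bpf_helpers.h and
 * bpf_tracing.h available:
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *		struct seq_file *seq = ctx->meta->seq;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(seq, "family=%u state=%u\n",
 *			       skc->skc_family, skc->skc_state);
 *		return 0;
 *	}
 *
 * Pinning the attached iterator (e.g. "bpftool iter pin") and reading the
 * resulting file then walks every TCP socket via bpf_iter_tcp_seq_ops.
 */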
3306
3307 #endif
3308
3309 void __init tcp_v4_init(void)
3310 {
3311 int cpu, res;
3312
3313 for_each_possible_cpu(cpu) {
3314 struct sock *sk;
3315
3316 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3317 IPPROTO_TCP, &init_net);
3318 if (res)
3319 panic("Failed to create the TCP control socket.\n");
3320 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3321
3322 /* Please enforce IP_DF and IPID==0 for RST and
3323 * ACK sent in SYN-RECV and TIME-WAIT state.
3324 */
3325 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3326
3327 per_cpu(ipv4_tcp_sk, cpu) = sk;
3328 }
3329 if (register_pernet_subsys(&tcp_sk_ops))
3330 panic("Failed to create the TCP control socket.\n");
3331
3332 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3333 bpf_iter_register();
3334 #endif
3335 }
3336