1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19 /*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
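/* Derive the initial sequence number and timestamp offset for a passive
 * connection from the incoming SYN's addresses and ports (note the swapped
 * daddr/saddr order), using the keyed hashes secure_tcp_seq() and
 * secure_tcp_ts_off().
 */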
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96 return secure_tcp_seq(ip_hdr(skb)->daddr,
97 ip_hdr(skb)->saddr,
98 tcp_hdr(skb)->dest,
99 tcp_hdr(skb)->source);
100 }
101
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106
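/* Decide whether a new outgoing connection may reuse a 4-tuple that is
 * currently in TIME-WAIT.  With sysctl_tcp_tw_reuse == 2 only loopback
 * traffic qualifies; otherwise reuse relies on the TIME-WAIT timestamp
 * state (PAWS) as described in the comment below.  Returns 1 if the caller
 * may proceed, holding a reference on sktw.
 */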
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
110 const struct inet_timewait_sock *tw = inet_twsk(sktw);
111 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112 struct tcp_sock *tp = tcp_sk(sk);
113
114 if (reuse == 2) {
115 /* Still does not detect *everything* that goes through
116 * lo, since we require a loopback src or dst address
117 * or direct binding to 'lo' interface.
118 */
119 bool loopback = false;
120 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121 loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123 if (tw->tw_family == AF_INET6) {
124 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128 loopback = true;
129 } else
130 #endif
131 {
132 if (ipv4_is_loopback(tw->tw_daddr) ||
133 ipv4_is_loopback(tw->tw_rcv_saddr))
134 loopback = true;
135 }
136 if (!loopback)
137 reuse = 0;
138 }
139
140 /* With PAWS, it is safe from the viewpoint
141 of data integrity. Even without PAWS it is safe provided sequence
142 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143
144 Actually, the idea is close to VJ's: only the timestamp cache is
145 held not per host but per port pair, and the TW bucket is used as the
146 state holder.
147
148 If the TW bucket has already been destroyed we fall back to VJ's scheme
149 and use the initial timestamp retrieved from the peer table.
150 */
151 if (tcptw->tw_ts_recent_stamp &&
152 (!twp || (reuse && time_after32(ktime_get_seconds(),
153 tcptw->tw_ts_recent_stamp)))) {
154 /* In case of repair and re-using TIME-WAIT sockets we still
155 * want to be sure that it is safe as above but honor the
156 * sequence numbers and time stamps set as part of the repair
157 * process.
158 *
159 * Without this check re-using a TIME-WAIT socket with TCP
160 * repair would accumulate a -1 on the repair assigned
161 * sequence number. The first time it is reused the sequence
162 * is -1, the second time -2, etc. This fixes that issue
163 * without appearing to create any others.
164 */
165 if (likely(!tp->repair)) {
166 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167
168 if (!seq)
169 seq = 1;
170 WRITE_ONCE(tp->write_seq, seq);
171 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
172 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173 }
174 sock_hold(sktw);
175 return 1;
176 }
177
178 return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183 int addr_len)
184 {
185 /* This check is replicated from tcp_v4_connect() and intended to
186 * prevent BPF program called below from accessing bytes that are out
187 * of the bound specified by user in addr_len.
188 */
189 if (addr_len < sizeof(struct sockaddr_in))
190 return -EINVAL;
191
192 sock_owned_by_me(sk);
193
194 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 {
200 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201 struct inet_sock *inet = inet_sk(sk);
202 struct tcp_sock *tp = tcp_sk(sk);
203 __be16 orig_sport, orig_dport;
204 __be32 daddr, nexthop;
205 struct flowi4 *fl4;
206 struct rtable *rt;
207 int err;
208 struct ip_options_rcu *inet_opt;
209 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210
211 if (addr_len < sizeof(struct sockaddr_in))
212 return -EINVAL;
213
214 if (usin->sin_family != AF_INET)
215 return -EAFNOSUPPORT;
216
217 nexthop = daddr = usin->sin_addr.s_addr;
218 inet_opt = rcu_dereference_protected(inet->inet_opt,
219 lockdep_sock_is_held(sk));
220 if (inet_opt && inet_opt->opt.srr) {
221 if (!daddr)
222 return -EINVAL;
223 nexthop = inet_opt->opt.faddr;
224 }
225
226 orig_sport = inet->inet_sport;
227 orig_dport = usin->sin_port;
228 fl4 = &inet->cork.fl.u.ip4;
229 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231 IPPROTO_TCP,
232 orig_sport, orig_dport, sk);
233 if (IS_ERR(rt)) {
234 err = PTR_ERR(rt);
235 if (err == -ENETUNREACH)
236 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237 return err;
238 }
239
240 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241 ip_rt_put(rt);
242 return -ENETUNREACH;
243 }
244
245 if (!inet_opt || !inet_opt->opt.srr)
246 daddr = fl4->daddr;
247
248 if (!inet->inet_saddr)
249 inet->inet_saddr = fl4->saddr;
250 sk_rcv_saddr_set(sk, inet->inet_saddr);
251
252 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253 /* Reset inherited state */
254 tp->rx_opt.ts_recent = 0;
255 tp->rx_opt.ts_recent_stamp = 0;
256 if (likely(!tp->repair))
257 WRITE_ONCE(tp->write_seq, 0);
258 }
259
260 inet->inet_dport = usin->sin_port;
261 sk_daddr_set(sk, daddr);
262
263 inet_csk(sk)->icsk_ext_hdr_len = 0;
264 if (inet_opt)
265 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266
267 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268
269 /* Socket identity is still unknown (sport may be zero).
270 * However we set state to SYN-SENT and, without releasing the socket
271 * lock, select a source port, enter ourselves into the hash tables and
272 * complete initialization after this.
273 */
274 tcp_set_state(sk, TCP_SYN_SENT);
275 err = inet_hash_connect(tcp_death_row, sk);
276 if (err)
277 goto failure;
278
279 sk_set_txhash(sk);
280
281 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282 inet->inet_sport, inet->inet_dport, sk);
283 if (IS_ERR(rt)) {
284 err = PTR_ERR(rt);
285 rt = NULL;
286 goto failure;
287 }
288 /* OK, now commit destination to socket. */
289 sk->sk_gso_type = SKB_GSO_TCPV4;
290 sk_setup_caps(sk, &rt->dst);
291 rt = NULL;
292
293 if (likely(!tp->repair)) {
294 if (!tp->write_seq)
295 WRITE_ONCE(tp->write_seq,
296 secure_tcp_seq(inet->inet_saddr,
297 inet->inet_daddr,
298 inet->inet_sport,
299 usin->sin_port));
300 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301 inet->inet_saddr,
302 inet->inet_daddr);
303 }
304
305 inet->inet_id = prandom_u32();
306
307 if (tcp_fastopen_defer_connect(sk, &err))
308 return err;
309 if (err)
310 goto failure;
311
312 err = tcp_connect(sk);
313
314 if (err)
315 goto failure;
316
317 return 0;
318
319 failure:
320 /*
321 * This unhashes the socket and releases the local port,
322 * if necessary.
323 */
324 tcp_set_state(sk, TCP_CLOSE);
325 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
326 inet_reset_saddr(sk);
327 ip_rt_put(rt);
328 sk->sk_route_caps = 0;
329 inet->inet_dport = 0;
330 return err;
331 }
332 EXPORT_SYMBOL(tcp_v4_connect);
333
334 /*
335 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
336 * It can be called through tcp_release_cb() if the socket was owned by the user
337 * at the time tcp_v4_err() was called to handle the ICMP message.
338 */
339 void tcp_v4_mtu_reduced(struct sock *sk)
340 {
341 struct inet_sock *inet = inet_sk(sk);
342 struct dst_entry *dst;
343 u32 mtu;
344
345 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
346 return;
347 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
348 dst = inet_csk_update_pmtu(sk, mtu);
349 if (!dst)
350 return;
351
352 /* Something is about to go wrong... Remember the soft error
353 * in case this connection is not able to recover.
354 */
355 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
356 sk->sk_err_soft = EMSGSIZE;
357
358 mtu = dst_mtu(dst);
359
360 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
361 ip_sk_accept_pmtu(sk) &&
362 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
363 tcp_sync_mss(sk, mtu);
364
365 /* Resend the TCP packet because it's
366 * clear that the old packet has been
367 * dropped. This is the new "fast" path mtu
368 * discovery.
369 */
370 tcp_simple_retransmit(sk);
371 } /* else let the usual retransmit timer handle it */
372 }
373 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
374
375 static void do_redirect(struct sk_buff *skb, struct sock *sk)
376 {
377 struct dst_entry *dst = __sk_dst_check(sk, 0);
378
379 if (dst)
380 dst->ops->redirect(dst, sk, skb);
381 }
382
383
384 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
385 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
386 {
387 struct request_sock *req = inet_reqsk(sk);
388 struct net *net = sock_net(sk);
389
390 /* ICMPs are not backlogged, hence we cannot get
391 * an established socket here.
392 */
393 if (seq != tcp_rsk(req)->snt_isn) {
394 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
395 } else if (abort) {
396 /*
397 * Still in SYN_RECV, just remove it silently.
398 * There is no good way to pass the error to the newly
399 * created socket, and POSIX does not want network
400 * errors returned from accept().
401 */
402 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
403 tcp_listendrop(req->rsk_listener);
404 }
405 reqsk_put(req);
406 }
407 EXPORT_SYMBOL(tcp_req_err);
408
409 /* TCP-LD (RFC 6069) logic */
410 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
411 {
412 struct inet_connection_sock *icsk = inet_csk(sk);
413 struct tcp_sock *tp = tcp_sk(sk);
414 struct sk_buff *skb;
415 s32 remaining;
416 u32 delta_us;
417
418 if (sock_owned_by_user(sk))
419 return;
420
421 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
422 !icsk->icsk_backoff)
423 return;
424
425 skb = tcp_rtx_queue_head(sk);
426 if (WARN_ON_ONCE(!skb))
427 return;
428
429 icsk->icsk_backoff--;
430 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
431 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
432
433 tcp_mstamp_refresh(tp);
434 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
435 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
436
437 if (remaining > 0) {
438 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
439 remaining, TCP_RTO_MAX);
440 } else {
441 /* RTO revert clocked out retransmission.
442 * Will retransmit now.
443 */
444 tcp_retransmit_timer(sk);
445 }
446 }
447 EXPORT_SYMBOL(tcp_ld_RTO_revert);
448
449 /*
450 * This routine is called by the ICMP module when it gets some
451 * sort of error condition. If err < 0 then the socket should
452 * be closed and the error returned to the user. If err > 0
453 * it's just the icmp type << 8 | icmp code. After adjustment
454 * header points to the first 8 bytes of the tcp header. We need
455 * to find the appropriate port.
456 *
457 * The locking strategy used here is very "optimistic". When
458 * someone else accesses the socket the ICMP is just dropped
459 * and for some paths there is no check at all.
460 * A more general error queue to queue errors for later handling
461 * is probably better.
462 *
463 */
464
465 int tcp_v4_err(struct sk_buff *skb, u32 info)
466 {
467 const struct iphdr *iph = (const struct iphdr *)skb->data;
468 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
469 struct tcp_sock *tp;
470 struct inet_sock *inet;
471 const int type = icmp_hdr(skb)->type;
472 const int code = icmp_hdr(skb)->code;
473 struct sock *sk;
474 struct request_sock *fastopen;
475 u32 seq, snd_una;
476 int err;
477 struct net *net = dev_net(skb->dev);
478
479 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
480 th->dest, iph->saddr, ntohs(th->source),
481 inet_iif(skb), 0);
482 if (!sk) {
483 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
484 return -ENOENT;
485 }
486 if (sk->sk_state == TCP_TIME_WAIT) {
487 inet_twsk_put(inet_twsk(sk));
488 return 0;
489 }
490 seq = ntohl(th->seq);
491 if (sk->sk_state == TCP_NEW_SYN_RECV) {
492 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
493 type == ICMP_TIME_EXCEEDED ||
494 (type == ICMP_DEST_UNREACH &&
495 (code == ICMP_NET_UNREACH ||
496 code == ICMP_HOST_UNREACH)));
497 return 0;
498 }
499
500 bh_lock_sock(sk);
501 /* If too many ICMPs get dropped on busy
502 * servers this needs to be solved differently.
503 * We do take care of the PMTU discovery (RFC1191) special case:
504 * we can receive locally generated ICMP messages while the socket is held.
505 */
506 if (sock_owned_by_user(sk)) {
507 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
508 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
509 }
510 if (sk->sk_state == TCP_CLOSE)
511 goto out;
512
513 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
514 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
515 goto out;
516 }
517
518 tp = tcp_sk(sk);
519 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
520 fastopen = rcu_dereference(tp->fastopen_rsk);
521 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
522 if (sk->sk_state != TCP_LISTEN &&
523 !between(seq, snd_una, tp->snd_nxt)) {
524 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
525 goto out;
526 }
527
528 switch (type) {
529 case ICMP_REDIRECT:
530 if (!sock_owned_by_user(sk))
531 do_redirect(skb, sk);
532 goto out;
533 case ICMP_SOURCE_QUENCH:
534 /* Just silently ignore these. */
535 goto out;
536 case ICMP_PARAMETERPROB:
537 err = EPROTO;
538 break;
539 case ICMP_DEST_UNREACH:
540 if (code > NR_ICMP_UNREACH)
541 goto out;
542
543 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
544 /* We are not interested in TCP_LISTEN and open_requests
545 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
546 * they should go through unfragmented).
547 */
548 if (sk->sk_state == TCP_LISTEN)
549 goto out;
550
551 WRITE_ONCE(tp->mtu_info, info);
552 if (!sock_owned_by_user(sk)) {
553 tcp_v4_mtu_reduced(sk);
554 } else {
555 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
556 sock_hold(sk);
557 }
558 goto out;
559 }
560
561 err = icmp_err_convert[code].errno;
562 /* check if this ICMP message allows revert of backoff.
563 * (see RFC 6069)
564 */
565 if (!fastopen &&
566 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
567 tcp_ld_RTO_revert(sk, seq);
568 break;
569 case ICMP_TIME_EXCEEDED:
570 err = EHOSTUNREACH;
571 break;
572 default:
573 goto out;
574 }
575
576 switch (sk->sk_state) {
577 case TCP_SYN_SENT:
578 case TCP_SYN_RECV:
579 /* Only in fast or simultaneous open. If a fast open socket is
580 * already accepted it is treated as a connected one below.
581 */
582 if (fastopen && !fastopen->sk)
583 break;
584
585 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
586
587 if (!sock_owned_by_user(sk)) {
588 sk->sk_err = err;
589
590 sk_error_report(sk);
591
592 tcp_done(sk);
593 } else {
594 sk->sk_err_soft = err;
595 }
596 goto out;
597 }
598
599 /* If we've already connected we will keep trying
600 * until we time out, or the user gives up.
601 *
602 * rfc1122 4.2.3.9 allows us to consider as hard errors
603 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
604 * but it is obsoleted by pmtu discovery).
605 *
606 * Note that in the modern internet, where routing is unreliable
607 * and broken firewalls sit in every dark corner sending random
608 * errors as ordered by their masters, even these two messages have finally
609 * lost their original sense (even Linux sends invalid PORT_UNREACHs).
610 *
611 * Now we are in compliance with RFCs.
612 * --ANK (980905)
613 */
614
615 inet = inet_sk(sk);
616 if (!sock_owned_by_user(sk) && inet->recverr) {
617 sk->sk_err = err;
618 sk_error_report(sk);
619 } else { /* Only an error on timeout */
620 sk->sk_err_soft = err;
621 }
622
623 out:
624 bh_unlock_sock(sk);
625 sock_put(sk);
626 return 0;
627 }
628
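/* Seed th->check with the IPv4 pseudo-header sum and point
 * csum_start/csum_offset at the TCP checksum field, so the checksum can be
 * completed by checksum offload (or by software just before transmission).
 */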
629 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
630 {
631 struct tcphdr *th = tcp_hdr(skb);
632
633 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
634 skb->csum_start = skb_transport_header(skb) - skb->head;
635 skb->csum_offset = offsetof(struct tcphdr, check);
636 }
637
638 /* This routine computes an IPv4 TCP checksum. */
639 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
640 {
641 const struct inet_sock *inet = inet_sk(sk);
642
643 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
644 }
645 EXPORT_SYMBOL(tcp_v4_send_check);
646
647 /*
648 * This routine will send an RST to the other tcp.
649 *
650 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
651 * for the reset?
652 * Answer: if a packet caused an RST, it is not for a socket
653 * existing in our system; if it is matched to a socket,
654 * it is just a duplicate segment or a bug in the other side's TCP.
655 * So we build the reply based only on the parameters
656 * that arrived with the segment.
657 * Exception: precedence violation. We do not implement it in any case.
658 */
659
660 #ifdef CONFIG_TCP_MD5SIG
661 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
662 #else
663 #define OPTION_BYTES sizeof(__be32)
664 #endif
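/* Room reserved in the RST reply for options: the aligned MD5 signature
 * option when CONFIG_TCP_MD5SIG is enabled, otherwise a single 32-bit word
 * (enough for the MPTCP reset option added further below).
 */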
665
666 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
667 {
668 const struct tcphdr *th = tcp_hdr(skb);
669 struct {
670 struct tcphdr th;
671 __be32 opt[OPTION_BYTES / sizeof(__be32)];
672 } rep;
673 struct ip_reply_arg arg;
674 #ifdef CONFIG_TCP_MD5SIG
675 struct tcp_md5sig_key *key = NULL;
676 const __u8 *hash_location = NULL;
677 unsigned char newhash[16];
678 int genhash;
679 struct sock *sk1 = NULL;
680 #endif
681 u64 transmit_time = 0;
682 struct sock *ctl_sk;
683 struct net *net;
684
685 /* Never send a reset in response to a reset. */
686 if (th->rst)
687 return;
688
689 /* If sk is not NULL, it means we did a successful lookup and the incoming
690 * route had to be correct. prequeue might have dropped our dst.
691 */
692 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
693 return;
694
695 /* Swap the send and the receive. */
696 memset(&rep, 0, sizeof(rep));
697 rep.th.dest = th->source;
698 rep.th.source = th->dest;
699 rep.th.doff = sizeof(struct tcphdr) / 4;
700 rep.th.rst = 1;
701
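/* Sequence/ack selection for the RST follows RFC 793: if the offending
 * segment carried an ACK, the RST uses that ack_seq as its sequence
 * number; otherwise send seq 0 with ACK set, acknowledging everything
 * the segment occupied (data length plus SYN/FIN flags).
 */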
702 if (th->ack) {
703 rep.th.seq = th->ack_seq;
704 } else {
705 rep.th.ack = 1;
706 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
707 skb->len - (th->doff << 2));
708 }
709
710 memset(&arg, 0, sizeof(arg));
711 arg.iov[0].iov_base = (unsigned char *)&rep;
712 arg.iov[0].iov_len = sizeof(rep.th);
713
714 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
715 #ifdef CONFIG_TCP_MD5SIG
716 rcu_read_lock();
717 hash_location = tcp_parse_md5sig_option(th);
718 if (sk && sk_fullsock(sk)) {
719 const union tcp_md5_addr *addr;
720 int l3index;
721
722 /* sdif set, means packet ingressed via a device
723 * in an L3 domain and inet_iif is set to it.
724 */
725 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
726 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
727 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
728 } else if (hash_location) {
729 const union tcp_md5_addr *addr;
730 int sdif = tcp_v4_sdif(skb);
731 int dif = inet_iif(skb);
732 int l3index;
733
734 /*
735 * The active side is lost. Try to find the listening socket through the
736 * source port, and then find the md5 key through the listening socket.
737 * We do not loosen security here:
738 * the incoming packet is checked against the md5 hash of the key we find,
739 * and no RST is generated if the md5 hash doesn't match.
740 */
741 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
742 ip_hdr(skb)->saddr,
743 th->source, ip_hdr(skb)->daddr,
744 ntohs(th->source), dif, sdif);
745 /* don't send rst if it can't find key */
746 if (!sk1)
747 goto out;
748
749 /* sdif set, means packet ingressed via a device
750 * in an L3 domain and dif is set to it.
751 */
752 l3index = sdif ? dif : 0;
753 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
754 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
755 if (!key)
756 goto out;
757
758
759 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
760 if (genhash || memcmp(hash_location, newhash, 16) != 0)
761 goto out;
762
763 }
764
765 if (key) {
766 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
767 (TCPOPT_NOP << 16) |
768 (TCPOPT_MD5SIG << 8) |
769 TCPOLEN_MD5SIG);
770 /* Update length and the length the header thinks exists */
771 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
772 rep.th.doff = arg.iov[0].iov_len / 4;
773
774 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
775 key, ip_hdr(skb)->saddr,
776 ip_hdr(skb)->daddr, &rep.th);
777 }
778 #endif
779 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
780 if (rep.opt[0] == 0) {
781 __be32 mrst = mptcp_reset_option(skb);
782
783 if (mrst) {
784 rep.opt[0] = mrst;
785 arg.iov[0].iov_len += sizeof(mrst);
786 rep.th.doff = arg.iov[0].iov_len / 4;
787 }
788 }
789
790 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
791 ip_hdr(skb)->saddr, /* XXX */
792 arg.iov[0].iov_len, IPPROTO_TCP, 0);
793 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
794 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
795
796 /* When the socket is gone, all binding information is lost.
797 * Routing might fail in this case. No choice here: if we chose to force
798 * the input interface, we would misroute in case of an asymmetric route.
799 */
800 if (sk) {
801 arg.bound_dev_if = sk->sk_bound_dev_if;
802 if (sk_fullsock(sk))
803 trace_tcp_send_reset(sk, skb);
804 }
805
806 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
807 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
808
809 arg.tos = ip_hdr(skb)->tos;
810 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
811 local_bh_disable();
812 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
813 if (sk) {
814 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
815 inet_twsk(sk)->tw_mark : sk->sk_mark;
816 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
817 inet_twsk(sk)->tw_priority : sk->sk_priority;
818 transmit_time = tcp_transmit_time(sk);
819 xfrm_sk_clone_policy(ctl_sk, sk);
820 } else {
821 ctl_sk->sk_mark = 0;
822 ctl_sk->sk_priority = 0;
823 }
824 ip_send_unicast_reply(ctl_sk,
825 skb, &TCP_SKB_CB(skb)->header.h4.opt,
826 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
827 &arg, arg.iov[0].iov_len,
828 transmit_time);
829
830 xfrm_sk_free_policy(ctl_sk);
831 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
832 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
833 local_bh_enable();
834
835 #ifdef CONFIG_TCP_MD5SIG
836 out:
837 rcu_read_unlock();
838 #endif
839 }
840
841 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
842 outside of socket context, is certainly ugly. What can I do?
843 */
844
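/* Build and send a bare ACK (no payload) on the per-cpu control socket,
 * optionally carrying TCP timestamp and MD5 signature options.  Used for
 * replies sent on behalf of TIME-WAIT sockets and request sockets, where
 * no full socket context exists.
 */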
845 static void tcp_v4_send_ack(const struct sock *sk,
846 struct sk_buff *skb, u32 seq, u32 ack,
847 u32 win, u32 tsval, u32 tsecr, int oif,
848 struct tcp_md5sig_key *key,
849 int reply_flags, u8 tos)
850 {
851 const struct tcphdr *th = tcp_hdr(skb);
852 struct {
853 struct tcphdr th;
854 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
855 #ifdef CONFIG_TCP_MD5SIG
856 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
857 #endif
858 ];
859 } rep;
860 struct net *net = sock_net(sk);
861 struct ip_reply_arg arg;
862 struct sock *ctl_sk;
863 u64 transmit_time;
864
865 memset(&rep.th, 0, sizeof(struct tcphdr));
866 memset(&arg, 0, sizeof(arg));
867
868 arg.iov[0].iov_base = (unsigned char *)&rep;
869 arg.iov[0].iov_len = sizeof(rep.th);
870 if (tsecr) {
871 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
872 (TCPOPT_TIMESTAMP << 8) |
873 TCPOLEN_TIMESTAMP);
874 rep.opt[1] = htonl(tsval);
875 rep.opt[2] = htonl(tsecr);
876 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
877 }
878
879 /* Swap the send and the receive. */
880 rep.th.dest = th->source;
881 rep.th.source = th->dest;
882 rep.th.doff = arg.iov[0].iov_len / 4;
883 rep.th.seq = htonl(seq);
884 rep.th.ack_seq = htonl(ack);
885 rep.th.ack = 1;
886 rep.th.window = htons(win);
887
888 #ifdef CONFIG_TCP_MD5SIG
889 if (key) {
890 int offset = (tsecr) ? 3 : 0;
891
892 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
893 (TCPOPT_NOP << 16) |
894 (TCPOPT_MD5SIG << 8) |
895 TCPOLEN_MD5SIG);
896 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
897 rep.th.doff = arg.iov[0].iov_len/4;
898
899 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
900 key, ip_hdr(skb)->saddr,
901 ip_hdr(skb)->daddr, &rep.th);
902 }
903 #endif
904 arg.flags = reply_flags;
905 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
906 ip_hdr(skb)->saddr, /* XXX */
907 arg.iov[0].iov_len, IPPROTO_TCP, 0);
908 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
909 if (oif)
910 arg.bound_dev_if = oif;
911 arg.tos = tos;
912 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
913 local_bh_disable();
914 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
915 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
916 inet_twsk(sk)->tw_mark : sk->sk_mark;
917 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
918 inet_twsk(sk)->tw_priority : sk->sk_priority;
919 transmit_time = tcp_transmit_time(sk);
920 ip_send_unicast_reply(ctl_sk,
921 skb, &TCP_SKB_CB(skb)->header.h4.opt,
922 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
923 &arg, arg.iov[0].iov_len,
924 transmit_time);
925
926 sock_net_set(ctl_sk, &init_net);
927 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
928 local_bh_enable();
929 }
930
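/* Answer a segment received for a TIME-WAIT socket with a duplicate ACK
 * reflecting the saved snd_nxt/rcv_nxt, scaled receive window and
 * timestamps of the old connection.
 */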
931 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
932 {
933 struct inet_timewait_sock *tw = inet_twsk(sk);
934 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
935
936 tcp_v4_send_ack(sk, skb,
937 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
938 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
939 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
940 tcptw->tw_ts_recent,
941 tw->tw_bound_dev_if,
942 tcp_twsk_md5_key(tcptw),
943 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
944 tw->tw_tos
945 );
946
947 inet_twsk_put(tw);
948 }
949
950 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
951 struct request_sock *req)
952 {
953 const union tcp_md5_addr *addr;
954 int l3index;
955
956 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
957 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
958 */
959 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
960 tcp_sk(sk)->snd_nxt;
961
962 /* RFC 7323 2.3
963 * The window field (SEG.WND) of every outgoing segment, with the
964 * exception of <SYN> segments, MUST be right-shifted by
965 * Rcv.Wind.Shift bits:
966 */
967 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
968 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
969 tcp_v4_send_ack(sk, skb, seq,
970 tcp_rsk(req)->rcv_nxt,
971 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
972 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
973 READ_ONCE(req->ts_recent),
974 0,
975 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
976 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
977 ip_hdr(skb)->tos);
978 }
979
980 /*
981 * Send a SYN-ACK after having received a SYN.
982 * This still operates on a request_sock only, not on a big
983 * socket.
984 */
985 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
986 struct flowi *fl,
987 struct request_sock *req,
988 struct tcp_fastopen_cookie *foc,
989 enum tcp_synack_type synack_type,
990 struct sk_buff *syn_skb)
991 {
992 const struct inet_request_sock *ireq = inet_rsk(req);
993 struct flowi4 fl4;
994 int err = -1;
995 struct sk_buff *skb;
996 u8 tos;
997
998 /* First, grab a route. */
999 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1000 return -1;
1001
1002 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1003
1004 if (skb) {
1005 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1006
1007 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1008 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1009 (inet_sk(sk)->tos & INET_ECN_MASK) :
1010 inet_sk(sk)->tos;
1011
1012 if (!INET_ECN_is_capable(tos) &&
1013 tcp_bpf_ca_needs_ecn((struct sock *)req))
1014 tos |= INET_ECN_ECT_0;
1015
1016 rcu_read_lock();
1017 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1018 ireq->ir_rmt_addr,
1019 rcu_dereference(ireq->ireq_opt),
1020 tos);
1021 rcu_read_unlock();
1022 err = net_xmit_eval(err);
1023 }
1024
1025 return err;
1026 }
1027
1028 /*
1029 * IPv4 request_sock destructor.
1030 */
1031 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1032 {
1033 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1034 }
1035
1036 #ifdef CONFIG_TCP_MD5SIG
1037 /*
1038 * RFC2385 MD5 checksumming requires a mapping of
1039 * IP address->MD5 Key.
1040 * We need to maintain these in the sk structure.
1041 */
1042
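/* tcp_md5_needed is a static branch keyed off whether any MD5 keys are in
 * use, so the common case pays no MD5 lookup cost on the fast path.  (The
 * branch is presumably flipped when the first key is installed; that code
 * is not part of this excerpt.)
 */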
1043 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1044 EXPORT_SYMBOL(tcp_md5_needed);
1045
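/* Key preference when several configured keys match a peer address:
 * a key bound to an L3 device (non-zero l3index) always wins over one
 * that is not, and otherwise the key with the longer prefix wins.
 */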
1046 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1047 {
1048 if (!old)
1049 return true;
1050
1051 /* l3index always overrides non-l3index */
1052 if (old->l3index && new->l3index == 0)
1053 return false;
1054 if (old->l3index == 0 && new->l3index)
1055 return true;
1056
1057 return old->prefixlen < new->prefixlen;
1058 }
1059
1060 /* Find the Key structure for an address. */
1061 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1062 const union tcp_md5_addr *addr,
1063 int family)
1064 {
1065 const struct tcp_sock *tp = tcp_sk(sk);
1066 struct tcp_md5sig_key *key;
1067 const struct tcp_md5sig_info *md5sig;
1068 __be32 mask;
1069 struct tcp_md5sig_key *best_match = NULL;
1070 bool match;
1071
1072 /* caller either holds rcu_read_lock() or socket lock */
1073 md5sig = rcu_dereference_check(tp->md5sig_info,
1074 lockdep_sock_is_held(sk));
1075 if (!md5sig)
1076 return NULL;
1077
1078 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1079 lockdep_sock_is_held(sk)) {
1080 if (key->family != family)
1081 continue;
1082 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1083 continue;
1084 if (family == AF_INET) {
1085 mask = inet_make_mask(key->prefixlen);
1086 match = (key->addr.a4.s_addr & mask) ==
1087 (addr->a4.s_addr & mask);
1088 #if IS_ENABLED(CONFIG_IPV6)
1089 } else if (family == AF_INET6) {
1090 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1091 key->prefixlen);
1092 #endif
1093 } else {
1094 match = false;
1095 }
1096
1097 if (match && better_md5_match(best_match, key))
1098 best_match = key;
1099 }
1100 return best_match;
1101 }
1102 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1103
1104 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1105 const union tcp_md5_addr *addr,
1106 int family, u8 prefixlen,
1107 int l3index, u8 flags)
1108 {
1109 const struct tcp_sock *tp = tcp_sk(sk);
1110 struct tcp_md5sig_key *key;
1111 unsigned int size = sizeof(struct in_addr);
1112 const struct tcp_md5sig_info *md5sig;
1113
1114 /* caller either holds rcu_read_lock() or socket lock */
1115 md5sig = rcu_dereference_check(tp->md5sig_info,
1116 lockdep_sock_is_held(sk));
1117 if (!md5sig)
1118 return NULL;
1119 #if IS_ENABLED(CONFIG_IPV6)
1120 if (family == AF_INET6)
1121 size = sizeof(struct in6_addr);
1122 #endif
1123 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1124 lockdep_sock_is_held(sk)) {
1125 if (key->family != family)
1126 continue;
1127 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1128 continue;
1129 if (key->l3index != l3index)
1130 continue;
1131 if (!memcmp(&key->addr, addr, size) &&
1132 key->prefixlen == prefixlen)
1133 return key;
1134 }
1135 return NULL;
1136 }
1137
1138 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1139 const struct sock *addr_sk)
1140 {
1141 const union tcp_md5_addr *addr;
1142 int l3index;
1143
1144 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1145 addr_sk->sk_bound_dev_if);
1146 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1147 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1148 }
1149 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1150
1151 /* This can be called on a newly created socket, from other files */
1152 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1153 int family, u8 prefixlen, int l3index, u8 flags,
1154 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1155 {
1156 /* Add Key to the list */
1157 struct tcp_md5sig_key *key;
1158 struct tcp_sock *tp = tcp_sk(sk);
1159 struct tcp_md5sig_info *md5sig;
1160
1161 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1162 if (key) {
1163 /* Pre-existing entry - just update that one.
1164 * Note that the key might be used concurrently.
1165 * data_race() is telling KCSAN that we do not care about
1166 * key mismatches, since changing the MD5 key on live flows
1167 * can lead to packet drops.
1168 */
1169 data_race(memcpy(key->key, newkey, newkeylen));
1170
1171 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1172 * Also note that a reader could catch new key->keylen value
1173 * but old key->key[], this is the reason we use __GFP_ZERO
1174 * at sock_kmalloc() time below these lines.
1175 */
1176 WRITE_ONCE(key->keylen, newkeylen);
1177
1178 return 0;
1179 }
1180
1181 md5sig = rcu_dereference_protected(tp->md5sig_info,
1182 lockdep_sock_is_held(sk));
1183 if (!md5sig) {
1184 md5sig = kmalloc(sizeof(*md5sig), gfp);
1185 if (!md5sig)
1186 return -ENOMEM;
1187
1188 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1189 INIT_HLIST_HEAD(&md5sig->head);
1190 rcu_assign_pointer(tp->md5sig_info, md5sig);
1191 }
1192
1193 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1194 if (!key)
1195 return -ENOMEM;
1196 if (!tcp_alloc_md5sig_pool()) {
1197 sock_kfree_s(sk, key, sizeof(*key));
1198 return -ENOMEM;
1199 }
1200
1201 memcpy(key->key, newkey, newkeylen);
1202 key->keylen = newkeylen;
1203 key->family = family;
1204 key->prefixlen = prefixlen;
1205 key->l3index = l3index;
1206 key->flags = flags;
1207 memcpy(&key->addr, addr,
1208 (family == AF_INET6) ? sizeof(struct in6_addr) :
1209 sizeof(struct in_addr));
1210 hlist_add_head_rcu(&key->node, &md5sig->head);
1211 return 0;
1212 }
1213 EXPORT_SYMBOL(tcp_md5_do_add);
1214
1215 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1216 u8 prefixlen, int l3index, u8 flags)
1217 {
1218 struct tcp_md5sig_key *key;
1219
1220 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1221 if (!key)
1222 return -ENOENT;
1223 hlist_del_rcu(&key->node);
1224 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1225 kfree_rcu(key, rcu);
1226 return 0;
1227 }
1228 EXPORT_SYMBOL(tcp_md5_do_del);
1229
1230 static void tcp_clear_md5_list(struct sock *sk)
1231 {
1232 struct tcp_sock *tp = tcp_sk(sk);
1233 struct tcp_md5sig_key *key;
1234 struct hlist_node *n;
1235 struct tcp_md5sig_info *md5sig;
1236
1237 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1238
1239 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1240 hlist_del_rcu(&key->node);
1241 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1242 kfree_rcu(key, rcu);
1243 }
1244 }
1245
1246 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1247 sockptr_t optval, int optlen)
1248 {
1249 struct tcp_md5sig cmd;
1250 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1251 const union tcp_md5_addr *addr;
1252 u8 prefixlen = 32;
1253 int l3index = 0;
1254 u8 flags;
1255
1256 if (optlen < sizeof(cmd))
1257 return -EINVAL;
1258
1259 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1260 return -EFAULT;
1261
1262 if (sin->sin_family != AF_INET)
1263 return -EINVAL;
1264
1265 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1266
1267 if (optname == TCP_MD5SIG_EXT &&
1268 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1269 prefixlen = cmd.tcpm_prefixlen;
1270 if (prefixlen > 32)
1271 return -EINVAL;
1272 }
1273
1274 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1275 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1276 struct net_device *dev;
1277
1278 rcu_read_lock();
1279 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1280 if (dev && netif_is_l3_master(dev))
1281 l3index = dev->ifindex;
1282
1283 rcu_read_unlock();
1284
1285 /* ok to reference set/not set outside of rcu;
1286 * right now device MUST be an L3 master
1287 */
1288 if (!dev || !l3index)
1289 return -EINVAL;
1290 }
1291
1292 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1293
1294 if (!cmd.tcpm_keylen)
1295 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1296
1297 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1298 return -EINVAL;
1299
1300 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1301 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1302 }
1303
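/* Feed the RFC 2385 input prefix into the hash: an IPv4 pseudo-header
 * (saddr, daddr, zero pad, protocol, TCP length) followed by the TCP
 * header with its checksum field zeroed.
 */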
1304 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1305 __be32 daddr, __be32 saddr,
1306 const struct tcphdr *th, int nbytes)
1307 {
1308 struct tcp4_pseudohdr *bp;
1309 struct scatterlist sg;
1310 struct tcphdr *_th;
1311
1312 bp = hp->scratch;
1313 bp->saddr = saddr;
1314 bp->daddr = daddr;
1315 bp->pad = 0;
1316 bp->protocol = IPPROTO_TCP;
1317 bp->len = cpu_to_be16(nbytes);
1318
1319 _th = (struct tcphdr *)(bp + 1);
1320 memcpy(_th, th, sizeof(*th));
1321 _th->check = 0;
1322
1323 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1324 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1325 sizeof(*bp) + sizeof(*th));
1326 return crypto_ahash_update(hp->md5_req);
1327 }
1328
1329 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1330 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1331 {
1332 struct tcp_md5sig_pool *hp;
1333 struct ahash_request *req;
1334
1335 hp = tcp_get_md5sig_pool();
1336 if (!hp)
1337 goto clear_hash_noput;
1338 req = hp->md5_req;
1339
1340 if (crypto_ahash_init(req))
1341 goto clear_hash;
1342 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1343 goto clear_hash;
1344 if (tcp_md5_hash_key(hp, key))
1345 goto clear_hash;
1346 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1347 if (crypto_ahash_final(req))
1348 goto clear_hash;
1349
1350 tcp_put_md5sig_pool();
1351 return 0;
1352
1353 clear_hash:
1354 tcp_put_md5sig_pool();
1355 clear_hash_noput:
1356 memset(md5_hash, 0, 16);
1357 return 1;
1358 }
1359
1360 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1361 const struct sock *sk,
1362 const struct sk_buff *skb)
1363 {
1364 struct tcp_md5sig_pool *hp;
1365 struct ahash_request *req;
1366 const struct tcphdr *th = tcp_hdr(skb);
1367 __be32 saddr, daddr;
1368
1369 if (sk) { /* valid for establish/request sockets */
1370 saddr = sk->sk_rcv_saddr;
1371 daddr = sk->sk_daddr;
1372 } else {
1373 const struct iphdr *iph = ip_hdr(skb);
1374 saddr = iph->saddr;
1375 daddr = iph->daddr;
1376 }
1377
1378 hp = tcp_get_md5sig_pool();
1379 if (!hp)
1380 goto clear_hash_noput;
1381 req = hp->md5_req;
1382
1383 if (crypto_ahash_init(req))
1384 goto clear_hash;
1385
1386 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1387 goto clear_hash;
1388 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1389 goto clear_hash;
1390 if (tcp_md5_hash_key(hp, key))
1391 goto clear_hash;
1392 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1393 if (crypto_ahash_final(req))
1394 goto clear_hash;
1395
1396 tcp_put_md5sig_pool();
1397 return 0;
1398
1399 clear_hash:
1400 tcp_put_md5sig_pool();
1401 clear_hash_noput:
1402 memset(md5_hash, 0, 16);
1403 return 1;
1404 }
1405 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1406
1407 #endif
1408
1409 /* Called with rcu_read_lock() */
1410 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1411 const struct sk_buff *skb,
1412 int dif, int sdif)
1413 {
1414 #ifdef CONFIG_TCP_MD5SIG
1415 /*
1416 * This gets called for each TCP segment that arrives
1417 * so we want to be efficient.
1418 * We have 3 drop cases:
1419 * o No MD5 hash and one expected.
1420 * o MD5 hash and we're not expecting one.
1421 * o MD5 hash and it's wrong.
1422 */
1423 const __u8 *hash_location = NULL;
1424 struct tcp_md5sig_key *hash_expected;
1425 const struct iphdr *iph = ip_hdr(skb);
1426 const struct tcphdr *th = tcp_hdr(skb);
1427 const union tcp_md5_addr *addr;
1428 unsigned char newhash[16];
1429 int genhash, l3index;
1430
1431 /* sdif set, means packet ingressed via a device
1432 * in an L3 domain and dif is set to the l3mdev
1433 */
1434 l3index = sdif ? dif : 0;
1435
1436 addr = (union tcp_md5_addr *)&iph->saddr;
1437 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1438 hash_location = tcp_parse_md5sig_option(th);
1439
1440 /* We've parsed the options - do we have a hash? */
1441 if (!hash_expected && !hash_location)
1442 return false;
1443
1444 if (hash_expected && !hash_location) {
1445 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1446 return true;
1447 }
1448
1449 if (!hash_expected && hash_location) {
1450 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1451 return true;
1452 }
1453
1454 /* Okay, so this is hash_expected and hash_location -
1455 * so we need to calculate the checksum.
1456 */
1457 genhash = tcp_v4_md5_hash_skb(newhash,
1458 hash_expected,
1459 NULL, skb);
1460
1461 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1462 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1463 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1464 &iph->saddr, ntohs(th->source),
1465 &iph->daddr, ntohs(th->dest),
1466 genhash ? " tcp_v4_calc_md5_hash failed"
1467 : "", l3index);
1468 return true;
1469 }
1470 return false;
1471 #endif
1472 return false;
1473 }
1474
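/* Initialise the IPv4 part of a request sock from the incoming SYN: the
 * SYN's destination becomes our local address, its source the remote
 * address, and any IP options are saved for use in the SYN-ACK.
 */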
1475 static void tcp_v4_init_req(struct request_sock *req,
1476 const struct sock *sk_listener,
1477 struct sk_buff *skb)
1478 {
1479 struct inet_request_sock *ireq = inet_rsk(req);
1480 struct net *net = sock_net(sk_listener);
1481
1482 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1483 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1484 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1485 }
1486
1487 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1488 struct sk_buff *skb,
1489 struct flowi *fl,
1490 struct request_sock *req)
1491 {
1492 tcp_v4_init_req(req, sk, skb);
1493
1494 if (security_inet_conn_request(sk, skb, req))
1495 return NULL;
1496
1497 return inet_csk_route_req(sk, &fl->u.ip4, req);
1498 }
1499
1500 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1501 .family = PF_INET,
1502 .obj_size = sizeof(struct tcp_request_sock),
1503 .rtx_syn_ack = tcp_rtx_synack,
1504 .send_ack = tcp_v4_reqsk_send_ack,
1505 .destructor = tcp_v4_reqsk_destructor,
1506 .send_reset = tcp_v4_send_reset,
1507 .syn_ack_timeout = tcp_syn_ack_timeout,
1508 };
1509
1510 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1511 .mss_clamp = TCP_MSS_DEFAULT,
1512 #ifdef CONFIG_TCP_MD5SIG
1513 .req_md5_lookup = tcp_v4_md5_lookup,
1514 .calc_md5_hash = tcp_v4_md5_hash_skb,
1515 #endif
1516 #ifdef CONFIG_SYN_COOKIES
1517 .cookie_init_seq = cookie_v4_init_sequence,
1518 #endif
1519 .route_req = tcp_v4_route_req,
1520 .init_seq = tcp_v4_init_seq,
1521 .init_ts_off = tcp_v4_init_ts_off,
1522 .send_synack = tcp_v4_send_synack,
1523 };
1524
1525 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1526 {
1527 /* Never answer SYNs sent to broadcast or multicast addresses */
1528 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1529 goto drop;
1530
1531 return tcp_conn_request(&tcp_request_sock_ops,
1532 &tcp_request_sock_ipv4_ops, sk, skb);
1533
1534 drop:
1535 tcp_listendrop(sk);
1536 return 0;
1537 }
1538 EXPORT_SYMBOL(tcp_v4_conn_request);
1539
1540
1541 /*
1542 * The three way handshake has completed - we got a valid synack -
1543 * now create the new socket.
1544 */
1545 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1546 struct request_sock *req,
1547 struct dst_entry *dst,
1548 struct request_sock *req_unhash,
1549 bool *own_req)
1550 {
1551 struct inet_request_sock *ireq;
1552 bool found_dup_sk = false;
1553 struct inet_sock *newinet;
1554 struct tcp_sock *newtp;
1555 struct sock *newsk;
1556 #ifdef CONFIG_TCP_MD5SIG
1557 const union tcp_md5_addr *addr;
1558 struct tcp_md5sig_key *key;
1559 int l3index;
1560 #endif
1561 struct ip_options_rcu *inet_opt;
1562
1563 if (sk_acceptq_is_full(sk))
1564 goto exit_overflow;
1565
1566 newsk = tcp_create_openreq_child(sk, req, skb);
1567 if (!newsk)
1568 goto exit_nonewsk;
1569
1570 newsk->sk_gso_type = SKB_GSO_TCPV4;
1571 inet_sk_rx_dst_set(newsk, skb);
1572
1573 newtp = tcp_sk(newsk);
1574 newinet = inet_sk(newsk);
1575 ireq = inet_rsk(req);
1576 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1577 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1578 newsk->sk_bound_dev_if = ireq->ir_iif;
1579 newinet->inet_saddr = ireq->ir_loc_addr;
1580 inet_opt = rcu_dereference(ireq->ireq_opt);
1581 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1582 newinet->mc_index = inet_iif(skb);
1583 newinet->mc_ttl = ip_hdr(skb)->ttl;
1584 newinet->rcv_tos = ip_hdr(skb)->tos;
1585 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1586 if (inet_opt)
1587 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1588 newinet->inet_id = prandom_u32();
1589
1590 /* Set ToS of the new socket based upon the value of incoming SYN.
1591 * ECT bits are set later in tcp_init_transfer().
1592 */
1593 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1594 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1595
1596 if (!dst) {
1597 dst = inet_csk_route_child_sock(sk, newsk, req);
1598 if (!dst)
1599 goto put_and_exit;
1600 } else {
1601 /* syncookie case : see end of cookie_v4_check() */
1602 }
1603 sk_setup_caps(newsk, dst);
1604
1605 tcp_ca_openreq_child(newsk, dst);
1606
1607 tcp_sync_mss(newsk, dst_mtu(dst));
1608 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1609
1610 tcp_initialize_rcv_mss(newsk);
1611
1612 #ifdef CONFIG_TCP_MD5SIG
1613 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1614 /* Copy over the MD5 key from the original socket */
1615 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1616 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1617 if (key) {
1618 /*
1619 * We're using one, so create a matching key
1620 * on the newsk structure. If we fail to get
1621 * memory, then we end up not copying the key
1622 * across. Shucks.
1623 */
1624 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1625 key->key, key->keylen, GFP_ATOMIC);
1626 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1627 }
1628 #endif
1629
1630 if (__inet_inherit_port(sk, newsk) < 0)
1631 goto put_and_exit;
1632 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1633 &found_dup_sk);
1634 if (likely(*own_req)) {
1635 tcp_move_syn(newtp, req);
1636 ireq->ireq_opt = NULL;
1637 } else {
1638 newinet->inet_opt = NULL;
1639
1640 if (!req_unhash && found_dup_sk) {
1641 /* This code path should only be executed in the
1642 * syncookie case
1643 */
1644 bh_unlock_sock(newsk);
1645 sock_put(newsk);
1646 newsk = NULL;
1647 }
1648 }
1649 return newsk;
1650
1651 exit_overflow:
1652 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1653 exit_nonewsk:
1654 dst_release(dst);
1655 exit:
1656 tcp_listendrop(sk);
1657 return NULL;
1658 put_and_exit:
1659 newinet->inet_opt = NULL;
1660 inet_csk_prepare_forced_close(newsk);
1661 tcp_done(newsk);
1662 goto exit;
1663 }
1664 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1665
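/* With syncookies, only a non-SYN segment hitting a listener can be the
 * ACK that completes a cookie handshake, so only those are passed to
 * cookie_v4_check() for validation.
 */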
1666 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1667 {
1668 #ifdef CONFIG_SYN_COOKIES
1669 const struct tcphdr *th = tcp_hdr(skb);
1670
1671 if (!th->syn)
1672 sk = cookie_v4_check(sk, skb);
1673 #endif
1674 return sk;
1675 }
1676
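/* Compute a SYN cookie ISN and MSS for a listener without allocating a
 * request sock.  Intended for callers outside the normal receive path
 * (for example BPF helpers); the exact callers are not shown in this file.
 */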
1677 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1678 struct tcphdr *th, u32 *cookie)
1679 {
1680 u16 mss = 0;
1681 #ifdef CONFIG_SYN_COOKIES
1682 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1683 &tcp_request_sock_ipv4_ops, sk, th);
1684 if (mss) {
1685 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1686 tcp_synq_overflow(sk);
1687 }
1688 #endif
1689 return mss;
1690 }
1691
1692 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1693 u32));
1694 /* The socket must have its spinlock held when we get
1695 * here, unless it is a TCP_LISTEN socket.
1696 *
1697 * We have a potential double-lock case here, so even when
1698 * doing backlog processing we use the BH locking scheme.
1699 * This is because we cannot sleep with the original spinlock
1700 * held.
1701 */
1702 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1703 {
1704 struct sock *rsk;
1705
1706 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1707 struct dst_entry *dst;
1708
1709 dst = rcu_dereference_protected(sk->sk_rx_dst,
1710 lockdep_sock_is_held(sk));
1711
1712 sock_rps_save_rxhash(sk, skb);
1713 sk_mark_napi_id(sk, skb);
1714 if (dst) {
1715 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1716 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1717 dst, 0)) {
1718 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1719 dst_release(dst);
1720 }
1721 }
1722 tcp_rcv_established(sk, skb);
1723 return 0;
1724 }
1725
1726 if (tcp_checksum_complete(skb))
1727 goto csum_err;
1728
1729 if (sk->sk_state == TCP_LISTEN) {
1730 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1731
1732 if (!nsk)
1733 goto discard;
1734 if (nsk != sk) {
1735 if (tcp_child_process(sk, nsk, skb)) {
1736 rsk = nsk;
1737 goto reset;
1738 }
1739 return 0;
1740 }
1741 } else
1742 sock_rps_save_rxhash(sk, skb);
1743
1744 if (tcp_rcv_state_process(sk, skb)) {
1745 rsk = sk;
1746 goto reset;
1747 }
1748 return 0;
1749
1750 reset:
1751 tcp_v4_send_reset(rsk, skb);
1752 discard:
1753 kfree_skb(skb);
1754 /* Be careful here. If this function gets more complicated and
1755 * gcc suffers from register pressure on the x86, sk (in %ebx)
1756 * might be destroyed here. This current version compiles correctly,
1757 * but you have been warned.
1758 */
1759 return 0;
1760
1761 csum_err:
1762 trace_tcp_bad_csum(skb);
1763 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1764 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1765 goto discard;
1766 }
1767 EXPORT_SYMBOL(tcp_v4_do_rcv);
1768
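/* Early demux: look up an established socket straight from the IP/TCP
 * headers at ingress and attach it (and its cached input route, when still
 * valid for this interface) to the skb, so later processing can skip the
 * full socket and route lookups.
 */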
1769 int tcp_v4_early_demux(struct sk_buff *skb)
1770 {
1771 const struct iphdr *iph;
1772 const struct tcphdr *th;
1773 struct sock *sk;
1774
1775 if (skb->pkt_type != PACKET_HOST)
1776 return 0;
1777
1778 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1779 return 0;
1780
1781 iph = ip_hdr(skb);
1782 th = tcp_hdr(skb);
1783
1784 if (th->doff < sizeof(struct tcphdr) / 4)
1785 return 0;
1786
1787 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1788 iph->saddr, th->source,
1789 iph->daddr, ntohs(th->dest),
1790 skb->skb_iif, inet_sdif(skb));
1791 if (sk) {
1792 skb->sk = sk;
1793 skb->destructor = sock_edemux;
1794 if (sk_fullsock(sk)) {
1795 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1796
1797 if (dst)
1798 dst = dst_check(dst, 0);
1799 if (dst &&
1800 sk->sk_rx_dst_ifindex == skb->skb_iif)
1801 skb_dst_set_noref(skb, dst);
1802 }
1803 }
1804 return 0;
1805 }
1806
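/* Queue a segment on the backlog of a socket currently owned by user
 * context.  Where possible the segment is coalesced with the tail of the
 * backlog queue, so a burst of in-order segments costs a single skb.
 * Returns true if the skb was not queued and the caller should discard it.
 */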
1807 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1808 {
1809 u32 limit, tail_gso_size, tail_gso_segs;
1810 struct skb_shared_info *shinfo;
1811 const struct tcphdr *th;
1812 struct tcphdr *thtail;
1813 struct sk_buff *tail;
1814 unsigned int hdrlen;
1815 bool fragstolen;
1816 u32 gso_segs;
1817 u32 gso_size;
1818 int delta;
1819
1820 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1821 * we can fix skb->truesize to its real value to avoid future drops.
1822 * This is valid because skb is not yet charged to the socket.
1823 * It has been noticed that pure SACK packets were sometimes dropped
1824 * (if cooked by drivers without the copybreak feature).
1825 */
1826 skb_condense(skb);
1827
1828 skb_dst_drop(skb);
1829
1830 if (unlikely(tcp_checksum_complete(skb))) {
1831 bh_unlock_sock(sk);
1832 trace_tcp_bad_csum(skb);
1833 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1834 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1835 return true;
1836 }
1837
1838 /* Attempt coalescing to the last skb in the backlog, even if we
1839 * are above the limits.
1840 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1841 */
1842 th = (const struct tcphdr *)skb->data;
1843 hdrlen = th->doff * 4;
1844
1845 tail = sk->sk_backlog.tail;
1846 if (!tail)
1847 goto no_coalesce;
1848 thtail = (struct tcphdr *)tail->data;
1849
1850 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1851 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1852 ((TCP_SKB_CB(tail)->tcp_flags |
1853 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1854 !((TCP_SKB_CB(tail)->tcp_flags &
1855 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1856 ((TCP_SKB_CB(tail)->tcp_flags ^
1857 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1858 #ifdef CONFIG_TLS_DEVICE
1859 tail->decrypted != skb->decrypted ||
1860 #endif
1861 !mptcp_skb_can_collapse(tail, skb) ||
1862 thtail->doff != th->doff ||
1863 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1864 goto no_coalesce;
1865
1866 __skb_pull(skb, hdrlen);
1867
1868 shinfo = skb_shinfo(skb);
1869 gso_size = shinfo->gso_size ?: skb->len;
1870 gso_segs = shinfo->gso_segs ?: 1;
1871
1872 shinfo = skb_shinfo(tail);
1873 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1874 tail_gso_segs = shinfo->gso_segs ?: 1;
1875
1876 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1877 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1878
1879 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1880 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1881 thtail->window = th->window;
1882 }
1883
1884 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1885 * thtail->fin, so that the fast path in tcp_rcv_established()
1886 * is not entered if we append a packet with a FIN.
1887 * SYN, RST, URG are not present.
1888 * ACK is set on both packets.
1889 * PSH : we do not really care in TCP stack,
1890 * at least for 'GRO' packets.
1891 */
1892 thtail->fin |= th->fin;
1893 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1894
1895 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1896 TCP_SKB_CB(tail)->has_rxtstamp = true;
1897 tail->tstamp = skb->tstamp;
1898 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1899 }
1900
1901 /* Not as strict as GRO. We only need to carry mss max value */
1902 shinfo->gso_size = max(gso_size, tail_gso_size);
1903 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1904
1905 sk->sk_backlog.len += delta;
1906 __NET_INC_STATS(sock_net(sk),
1907 LINUX_MIB_TCPBACKLOGCOALESCE);
1908 kfree_skb_partial(skb, fragstolen);
1909 return false;
1910 }
1911 __skb_push(skb, hdrlen);
1912
1913 no_coalesce:
1914 limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1915
1916 /* Only the socket owner can try to collapse/prune rx queues
1917 * to reduce memory overhead, so add a little headroom here.
1918 * Only a few socket backlogs are likely to be non-empty concurrently.
1919 */
1920 limit += 64 * 1024;
1921
1922 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1923 bh_unlock_sock(sk);
1924 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1925 return true;
1926 }
1927 return false;
1928 }
1929 EXPORT_SYMBOL(tcp_add_backlog);
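/* Worked example of the backlog limit computed in tcp_add_backlog() above
 * (illustrative numbers only): with sk_rcvbuf == 131072 and
 * sk_sndbuf == 65536,
 *
 *	limit = 131072 + (65536 >> 1) + 64 * 1024
 *	      = 131072 + 32768 + 65536 = 229376 bytes
 *
 * so roughly 224 KB of skb truesize may be queued before sk_add_backlog()
 * refuses the packet and LINUX_MIB_TCPBACKLOGDROP is incremented.
 */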
1930
1931 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1932 {
1933 struct tcphdr *th = (struct tcphdr *)skb->data;
1934
1935 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1936 }
1937 EXPORT_SYMBOL(tcp_filter);
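/* Note (informal): the cap passed to sk_filter_trim_cap() above is the TCP
 * header length, so an attached socket filter may trim payload but can never
 * trim below the headers.  Callers still re-read the header pointers after
 * filtering because the filter may have modified the skb, e.g.:
 *
 *	if (tcp_filter(sk, skb))
 *		goto discard;
 *	th = (const struct tcphdr *)skb->data;	// refresh after possible edits
 *	iph = ip_hdr(skb);
 */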
1938
1939 static void tcp_v4_restore_cb(struct sk_buff *skb)
1940 {
1941 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1942 sizeof(struct inet_skb_parm));
1943 }
1944
1945 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1946 const struct tcphdr *th)
1947 {
1948 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1949 * barrier() makes sure the compiler won't play fool^W aliasing games.
1950 */
1951 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1952 sizeof(struct inet_skb_parm));
1953 barrier();
1954
1955 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1956 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1957 skb->len - th->doff * 4);
1958 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1959 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1960 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1961 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1962 TCP_SKB_CB(skb)->sacked = 0;
1963 TCP_SKB_CB(skb)->has_rxtstamp =
1964 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1965 }
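/* Worked example of the end_seq arithmetic above: SYN and FIN each occupy
 * one unit of sequence space, data occupies its length in bytes.
 *
 *	pure SYN, no payload      : end_seq = seq + 1
 *	1000 data bytes, no flags : end_seq = seq + 1000
 *	FIN with 100 data bytes   : end_seq = seq + 101
 */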
1966
1967 /*
1968 * From tcp_input.c
1969 */
1970
1971 int tcp_v4_rcv(struct sk_buff *skb)
1972 {
1973 struct net *net = dev_net(skb->dev);
1974 struct sk_buff *skb_to_free;
1975 int sdif = inet_sdif(skb);
1976 int dif = inet_iif(skb);
1977 const struct iphdr *iph;
1978 const struct tcphdr *th;
1979 bool refcounted;
1980 struct sock *sk;
1981 int drop_reason;
1982 int ret;
1983
1984 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1985 if (skb->pkt_type != PACKET_HOST)
1986 goto discard_it;
1987
1988 /* Count it even if it's bad */
1989 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1990
1991 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1992 goto discard_it;
1993
1994 th = (const struct tcphdr *)skb->data;
1995
1996 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1997 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1998 goto bad_packet;
1999 }
2000 if (!pskb_may_pull(skb, th->doff * 4))
2001 goto discard_it;
2002
2003 /* An explanation is required here, I think.
2004 * Packet length and doff are validated by header prediction,
2005 * provided the case of th->doff == 0 is eliminated.
2006 * So, we defer the checks. */
2007
2008 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2009 goto csum_error;
2010
2011 th = (const struct tcphdr *)skb->data;
2012 iph = ip_hdr(skb);
2013 lookup:
2014 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
2015 th->dest, sdif, &refcounted);
2016 if (!sk)
2017 goto no_tcp_socket;
2018
2019 process:
2020 if (sk->sk_state == TCP_TIME_WAIT)
2021 goto do_time_wait;
2022
2023 if (sk->sk_state == TCP_NEW_SYN_RECV) {
2024 struct request_sock *req = inet_reqsk(sk);
2025 bool req_stolen = false;
2026 struct sock *nsk;
2027
2028 sk = req->rsk_listener;
2029 if (unlikely(!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb) ||
2030 tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
2031 sk_drops_add(sk, skb);
2032 reqsk_put(req);
2033 goto discard_it;
2034 }
2035 if (tcp_checksum_complete(skb)) {
2036 reqsk_put(req);
2037 goto csum_error;
2038 }
2039 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2040 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2041 if (!nsk) {
2042 inet_csk_reqsk_queue_drop_and_put(sk, req);
2043 goto lookup;
2044 }
2045 sk = nsk;
2046 /* reuseport_migrate_sock() has already held one sk_refcnt
2047 * before returning.
2048 */
2049 } else {
2050 /* We own a reference on the listener, increase it again
2051 * as we might lose it too soon.
2052 */
2053 sock_hold(sk);
2054 }
2055 refcounted = true;
2056 nsk = NULL;
2057 if (!tcp_filter(sk, skb)) {
2058 th = (const struct tcphdr *)skb->data;
2059 iph = ip_hdr(skb);
2060 tcp_v4_fill_cb(skb, iph, th);
2061 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2062 }
2063 if (!nsk) {
2064 reqsk_put(req);
2065 if (req_stolen) {
2066 /* Another cpu got exclusive access to req
2067 * and created a full blown socket.
2068 * Try to feed this packet to this socket
2069 * instead of discarding it.
2070 */
2071 tcp_v4_restore_cb(skb);
2072 sock_put(sk);
2073 goto lookup;
2074 }
2075 goto discard_and_relse;
2076 }
2077 nf_reset_ct(skb);
2078 if (nsk == sk) {
2079 reqsk_put(req);
2080 tcp_v4_restore_cb(skb);
2081 } else if (tcp_child_process(sk, nsk, skb)) {
2082 tcp_v4_send_reset(nsk, skb);
2083 goto discard_and_relse;
2084 } else {
2085 sock_put(sk);
2086 return 0;
2087 }
2088 }
2089 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2090 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2091 goto discard_and_relse;
2092 }
2093
2094 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2095 goto discard_and_relse;
2096
2097 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2098 goto discard_and_relse;
2099
2100 nf_reset_ct(skb);
2101
2102 if (tcp_filter(sk, skb)) {
2103 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2104 goto discard_and_relse;
2105 }
2106 th = (const struct tcphdr *)skb->data;
2107 iph = ip_hdr(skb);
2108 tcp_v4_fill_cb(skb, iph, th);
2109
2110 skb->dev = NULL;
2111
2112 if (sk->sk_state == TCP_LISTEN) {
2113 ret = tcp_v4_do_rcv(sk, skb);
2114 goto put_and_return;
2115 }
2116
2117 sk_incoming_cpu_update(sk);
2118
2119 bh_lock_sock_nested(sk);
2120 tcp_segs_in(tcp_sk(sk), skb);
2121 ret = 0;
2122 if (!sock_owned_by_user(sk)) {
2123 skb_to_free = sk->sk_rx_skb_cache;
2124 sk->sk_rx_skb_cache = NULL;
2125 ret = tcp_v4_do_rcv(sk, skb);
2126 } else {
2127 if (tcp_add_backlog(sk, skb))
2128 goto discard_and_relse;
2129 skb_to_free = NULL;
2130 }
2131 bh_unlock_sock(sk);
2132 if (skb_to_free)
2133 __kfree_skb(skb_to_free);
2134
2135 put_and_return:
2136 if (refcounted)
2137 sock_put(sk);
2138
2139 return ret;
2140
2141 no_tcp_socket:
2142 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2143 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2144 goto discard_it;
2145
2146 tcp_v4_fill_cb(skb, iph, th);
2147
2148 if (tcp_checksum_complete(skb)) {
2149 csum_error:
2150 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2151 trace_tcp_bad_csum(skb);
2152 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2153 bad_packet:
2154 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2155 } else {
2156 tcp_v4_send_reset(NULL, skb);
2157 }
2158
2159 discard_it:
2160 /* Discard frame. */
2161 kfree_skb_reason(skb, drop_reason);
2162 return 0;
2163
2164 discard_and_relse:
2165 sk_drops_add(sk, skb);
2166 if (refcounted)
2167 sock_put(sk);
2168 goto discard_it;
2169
2170 do_time_wait:
2171 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2172 inet_twsk_put(inet_twsk(sk));
2173 goto discard_it;
2174 }
2175
2176 tcp_v4_fill_cb(skb, iph, th);
2177
2178 if (tcp_checksum_complete(skb)) {
2179 inet_twsk_put(inet_twsk(sk));
2180 goto csum_error;
2181 }
2182 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2183 case TCP_TW_SYN: {
2184 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2185 &tcp_hashinfo, skb,
2186 __tcp_hdrlen(th),
2187 iph->saddr, th->source,
2188 iph->daddr, th->dest,
2189 inet_iif(skb),
2190 sdif);
2191 if (sk2) {
2192 inet_twsk_deschedule_put(inet_twsk(sk));
2193 sk = sk2;
2194 tcp_v4_restore_cb(skb);
2195 refcounted = false;
2196 goto process;
2197 }
2198 }
2199 /* to ACK */
2200 fallthrough;
2201 case TCP_TW_ACK:
2202 tcp_v4_timewait_ack(sk, skb);
2203 break;
2204 case TCP_TW_RST:
2205 tcp_v4_send_reset(sk, skb);
2206 inet_twsk_deschedule_put(inet_twsk(sk));
2207 goto discard_it;
2208 case TCP_TW_SUCCESS:;
2209 }
2210 goto discard_it;
2211 }
2212
2213 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2214 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2215 .twsk_unique = tcp_twsk_unique,
2216 .twsk_destructor= tcp_twsk_destructor,
2217 };
2218
2219 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2220 {
2221 struct dst_entry *dst = skb_dst(skb);
2222
2223 if (dst && dst_hold_safe(dst)) {
2224 rcu_assign_pointer(sk->sk_rx_dst, dst);
2225 sk->sk_rx_dst_ifindex = skb->skb_iif;
2226 }
2227 }
2228 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2229
2230 const struct inet_connection_sock_af_ops ipv4_specific = {
2231 .queue_xmit = ip_queue_xmit,
2232 .send_check = tcp_v4_send_check,
2233 .rebuild_header = inet_sk_rebuild_header,
2234 .sk_rx_dst_set = inet_sk_rx_dst_set,
2235 .conn_request = tcp_v4_conn_request,
2236 .syn_recv_sock = tcp_v4_syn_recv_sock,
2237 .net_header_len = sizeof(struct iphdr),
2238 .setsockopt = ip_setsockopt,
2239 .getsockopt = ip_getsockopt,
2240 .addr2sockaddr = inet_csk_addr2sockaddr,
2241 .sockaddr_len = sizeof(struct sockaddr_in),
2242 .mtu_reduced = tcp_v4_mtu_reduced,
2243 };
2244 EXPORT_SYMBOL(ipv4_specific);
2245
2246 #ifdef CONFIG_TCP_MD5SIG
2247 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2248 .md5_lookup = tcp_v4_md5_lookup,
2249 .calc_md5_hash = tcp_v4_md5_hash_skb,
2250 .md5_parse = tcp_v4_parse_md5_keys,
2251 };
2252 #endif
2253
2254 /* NOTE: A lot of things are set to zero explicitly by the call to
2255 * sk_alloc(), so they need not be done here.
2256 */
2257 static int tcp_v4_init_sock(struct sock *sk)
2258 {
2259 struct inet_connection_sock *icsk = inet_csk(sk);
2260
2261 tcp_init_sock(sk);
2262
2263 icsk->icsk_af_ops = &ipv4_specific;
2264
2265 #ifdef CONFIG_TCP_MD5SIG
2266 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2267 #endif
2268
2269 return 0;
2270 }
2271
2272 void tcp_v4_destroy_sock(struct sock *sk)
2273 {
2274 struct tcp_sock *tp = tcp_sk(sk);
2275
2276 trace_tcp_destroy_sock(sk);
2277
2278 tcp_clear_xmit_timers(sk);
2279
2280 tcp_cleanup_congestion_control(sk);
2281
2282 tcp_cleanup_ulp(sk);
2283
2284 /* Clean up the write buffer. */
2285 tcp_write_queue_purge(sk);
2286
2287 /* Check if we want to disable active TFO */
2288 tcp_fastopen_active_disable_ofo_check(sk);
2289
2290 /* Cleans up our, hopefully empty, out_of_order_queue. */
2291 skb_rbtree_purge(&tp->out_of_order_queue);
2292
2293 #ifdef CONFIG_TCP_MD5SIG
2294 /* Clean up the MD5 key list, if any */
2295 if (tp->md5sig_info) {
2296 tcp_clear_md5_list(sk);
2297 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2298 tp->md5sig_info = NULL;
2299 }
2300 #endif
2301
2302 /* Clean up a referenced TCP bind bucket. */
2303 if (inet_csk(sk)->icsk_bind_hash)
2304 inet_put_port(sk);
2305
2306 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2307
2308 /* If socket is aborted during connect operation */
2309 tcp_free_fastopen_req(tp);
2310 tcp_fastopen_destroy_cipher(sk);
2311 tcp_saved_syn_free(tp);
2312
2313 sk_sockets_allocated_dec(sk);
2314 }
2315 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2316
2317 #ifdef CONFIG_PROC_FS
2318 /* Proc filesystem TCP sock list dumping. */
2319
2320 static unsigned short seq_file_family(const struct seq_file *seq);
2321
2322 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2323 {
2324 unsigned short family = seq_file_family(seq);
2325
2326 /* AF_UNSPEC is used as a match all */
2327 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2328 net_eq(sock_net(sk), seq_file_net(seq)));
2329 }
2330
2331 /* Find a non-empty bucket (starting from st->bucket)
2332 * and return the first sk from it.
2333 */
2334 static void *listening_get_first(struct seq_file *seq)
2335 {
2336 struct tcp_iter_state *st = seq->private;
2337
2338 st->offset = 0;
2339 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2340 struct inet_listen_hashbucket *ilb2;
2341 struct inet_connection_sock *icsk;
2342 struct sock *sk;
2343
2344 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2345 if (hlist_empty(&ilb2->head))
2346 continue;
2347
2348 spin_lock(&ilb2->lock);
2349 inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
2350 sk = (struct sock *)icsk;
2351 if (seq_sk_match(seq, sk))
2352 return sk;
2353 }
2354 spin_unlock(&ilb2->lock);
2355 }
2356
2357 return NULL;
2358 }
2359
2360 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2361 * If "cur" is the last one in the st->bucket,
2362 * call listening_get_first() to return the first sk of the next
2363 * non-empty bucket.
2364 */
2365 static void *listening_get_next(struct seq_file *seq, void *cur)
2366 {
2367 struct tcp_iter_state *st = seq->private;
2368 struct inet_listen_hashbucket *ilb2;
2369 struct inet_connection_sock *icsk;
2370 struct sock *sk = cur;
2371
2372 ++st->num;
2373 ++st->offset;
2374
2375 icsk = inet_csk(sk);
2376 inet_lhash2_for_each_icsk_continue(icsk) {
2377 sk = (struct sock *)icsk;
2378 if (seq_sk_match(seq, sk))
2379 return sk;
2380 }
2381
2382 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2383 spin_unlock(&ilb2->lock);
2384 ++st->bucket;
2385 return listening_get_first(seq);
2386 }
2387
2388 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2389 {
2390 struct tcp_iter_state *st = seq->private;
2391 void *rc;
2392
2393 st->bucket = 0;
2394 st->offset = 0;
2395 rc = listening_get_first(seq);
2396
2397 while (rc && *pos) {
2398 rc = listening_get_next(seq, rc);
2399 --*pos;
2400 }
2401 return rc;
2402 }
2403
2404 static inline bool empty_bucket(const struct tcp_iter_state *st)
2405 {
2406 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2407 }
2408
2409 /*
2410 * Get first established socket starting from bucket given in st->bucket.
2411 * If st->bucket is zero, the very first socket in the hash is returned.
2412 */
2413 static void *established_get_first(struct seq_file *seq)
2414 {
2415 struct tcp_iter_state *st = seq->private;
2416
2417 st->offset = 0;
2418 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2419 struct sock *sk;
2420 struct hlist_nulls_node *node;
2421 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2422
2423 /* Lockless fast path for the common case of empty buckets */
2424 if (empty_bucket(st))
2425 continue;
2426
2427 spin_lock_bh(lock);
2428 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2429 if (seq_sk_match(seq, sk))
2430 return sk;
2431 }
2432 spin_unlock_bh(lock);
2433 }
2434
2435 return NULL;
2436 }
2437
2438 static void *established_get_next(struct seq_file *seq, void *cur)
2439 {
2440 struct sock *sk = cur;
2441 struct hlist_nulls_node *node;
2442 struct tcp_iter_state *st = seq->private;
2443
2444 ++st->num;
2445 ++st->offset;
2446
2447 sk = sk_nulls_next(sk);
2448
2449 sk_nulls_for_each_from(sk, node) {
2450 if (seq_sk_match(seq, sk))
2451 return sk;
2452 }
2453
2454 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2455 ++st->bucket;
2456 return established_get_first(seq);
2457 }
2458
established_get_idx(struct seq_file * seq,loff_t pos)2459 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2460 {
2461 struct tcp_iter_state *st = seq->private;
2462 void *rc;
2463
2464 st->bucket = 0;
2465 rc = established_get_first(seq);
2466
2467 while (rc && pos) {
2468 rc = established_get_next(seq, rc);
2469 --pos;
2470 }
2471 return rc;
2472 }
2473
2474 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2475 {
2476 void *rc;
2477 struct tcp_iter_state *st = seq->private;
2478
2479 st->state = TCP_SEQ_STATE_LISTENING;
2480 rc = listening_get_idx(seq, &pos);
2481
2482 if (!rc) {
2483 st->state = TCP_SEQ_STATE_ESTABLISHED;
2484 rc = established_get_idx(seq, pos);
2485 }
2486
2487 return rc;
2488 }
2489
2490 static void *tcp_seek_last_pos(struct seq_file *seq)
2491 {
2492 struct tcp_iter_state *st = seq->private;
2493 int bucket = st->bucket;
2494 int offset = st->offset;
2495 int orig_num = st->num;
2496 void *rc = NULL;
2497
2498 switch (st->state) {
2499 case TCP_SEQ_STATE_LISTENING:
2500 if (st->bucket > tcp_hashinfo.lhash2_mask)
2501 break;
2502 st->state = TCP_SEQ_STATE_LISTENING;
2503 rc = listening_get_first(seq);
2504 while (offset-- && rc && bucket == st->bucket)
2505 rc = listening_get_next(seq, rc);
2506 if (rc)
2507 break;
2508 st->bucket = 0;
2509 st->state = TCP_SEQ_STATE_ESTABLISHED;
2510 fallthrough;
2511 case TCP_SEQ_STATE_ESTABLISHED:
2512 if (st->bucket > tcp_hashinfo.ehash_mask)
2513 break;
2514 rc = established_get_first(seq);
2515 while (offset-- && rc && bucket == st->bucket)
2516 rc = established_get_next(seq, rc);
2517 }
2518
2519 st->num = orig_num;
2520
2521 return rc;
2522 }
2523
2524 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2525 {
2526 struct tcp_iter_state *st = seq->private;
2527 void *rc;
2528
2529 if (*pos && *pos == st->last_pos) {
2530 rc = tcp_seek_last_pos(seq);
2531 if (rc)
2532 goto out;
2533 }
2534
2535 st->state = TCP_SEQ_STATE_LISTENING;
2536 st->num = 0;
2537 st->bucket = 0;
2538 st->offset = 0;
2539 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2540
2541 out:
2542 st->last_pos = *pos;
2543 return rc;
2544 }
2545 EXPORT_SYMBOL(tcp_seq_start);
2546
2547 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2548 {
2549 struct tcp_iter_state *st = seq->private;
2550 void *rc = NULL;
2551
2552 if (v == SEQ_START_TOKEN) {
2553 rc = tcp_get_idx(seq, 0);
2554 goto out;
2555 }
2556
2557 switch (st->state) {
2558 case TCP_SEQ_STATE_LISTENING:
2559 rc = listening_get_next(seq, v);
2560 if (!rc) {
2561 st->state = TCP_SEQ_STATE_ESTABLISHED;
2562 st->bucket = 0;
2563 st->offset = 0;
2564 rc = established_get_first(seq);
2565 }
2566 break;
2567 case TCP_SEQ_STATE_ESTABLISHED:
2568 rc = established_get_next(seq, v);
2569 break;
2570 }
2571 out:
2572 ++*pos;
2573 st->last_pos = *pos;
2574 return rc;
2575 }
2576 EXPORT_SYMBOL(tcp_seq_next);
2577
2578 void tcp_seq_stop(struct seq_file *seq, void *v)
2579 {
2580 struct tcp_iter_state *st = seq->private;
2581
2582 switch (st->state) {
2583 case TCP_SEQ_STATE_LISTENING:
2584 if (v != SEQ_START_TOKEN)
2585 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2586 break;
2587 case TCP_SEQ_STATE_ESTABLISHED:
2588 if (v)
2589 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2590 break;
2591 }
2592 }
2593 EXPORT_SYMBOL(tcp_seq_stop);
2594
2595 static void get_openreq4(const struct request_sock *req,
2596 struct seq_file *f, int i)
2597 {
2598 const struct inet_request_sock *ireq = inet_rsk(req);
2599 long delta = req->rsk_timer.expires - jiffies;
2600
2601 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2602 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2603 i,
2604 ireq->ir_loc_addr,
2605 ireq->ir_num,
2606 ireq->ir_rmt_addr,
2607 ntohs(ireq->ir_rmt_port),
2608 TCP_SYN_RECV,
2609 0, 0, /* could print option size, but that is af dependent. */
2610 1, /* timers active (only the expire timer) */
2611 jiffies_delta_to_clock_t(delta),
2612 req->num_timeout,
2613 from_kuid_munged(seq_user_ns(f),
2614 sock_i_uid(req->rsk_listener)),
2615 0, /* non standard timer */
2616 0, /* open_requests have no inode */
2617 0,
2618 req);
2619 }
2620
2621 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2622 {
2623 int timer_active;
2624 unsigned long timer_expires;
2625 const struct tcp_sock *tp = tcp_sk(sk);
2626 const struct inet_connection_sock *icsk = inet_csk(sk);
2627 const struct inet_sock *inet = inet_sk(sk);
2628 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2629 __be32 dest = inet->inet_daddr;
2630 __be32 src = inet->inet_rcv_saddr;
2631 __u16 destp = ntohs(inet->inet_dport);
2632 __u16 srcp = ntohs(inet->inet_sport);
2633 int rx_queue;
2634 int state;
2635
2636 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2637 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2638 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2639 timer_active = 1;
2640 timer_expires = icsk->icsk_timeout;
2641 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2642 timer_active = 4;
2643 timer_expires = icsk->icsk_timeout;
2644 } else if (timer_pending(&sk->sk_timer)) {
2645 timer_active = 2;
2646 timer_expires = sk->sk_timer.expires;
2647 } else {
2648 timer_active = 0;
2649 timer_expires = jiffies;
2650 }
2651
2652 state = inet_sk_state_load(sk);
2653 if (state == TCP_LISTEN)
2654 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2655 else
2656 /* Because we don't lock the socket,
2657 * we might find a transient negative value.
2658 */
2659 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2660 READ_ONCE(tp->copied_seq), 0);
2661
2662 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2663 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2664 i, src, srcp, dest, destp, state,
2665 READ_ONCE(tp->write_seq) - tp->snd_una,
2666 rx_queue,
2667 timer_active,
2668 jiffies_delta_to_clock_t(timer_expires - jiffies),
2669 icsk->icsk_retransmits,
2670 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2671 icsk->icsk_probes_out,
2672 sock_i_ino(sk),
2673 refcount_read(&sk->sk_refcnt), sk,
2674 jiffies_to_clock_t(icsk->icsk_rto),
2675 jiffies_to_clock_t(icsk->icsk_ack.ato),
2676 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2677 tcp_snd_cwnd(tp),
2678 state == TCP_LISTEN ?
2679 fastopenq->max_qlen :
2680 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2681 }
2682
2683 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2684 struct seq_file *f, int i)
2685 {
2686 long delta = tw->tw_timer.expires - jiffies;
2687 __be32 dest, src;
2688 __u16 destp, srcp;
2689
2690 dest = tw->tw_daddr;
2691 src = tw->tw_rcv_saddr;
2692 destp = ntohs(tw->tw_dport);
2693 srcp = ntohs(tw->tw_sport);
2694
2695 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2696 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2697 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2698 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2699 refcount_read(&tw->tw_refcnt), tw);
2700 }
2701
2702 #define TMPSZ 150
2703
2704 static int tcp4_seq_show(struct seq_file *seq, void *v)
2705 {
2706 struct tcp_iter_state *st;
2707 struct sock *sk = v;
2708
2709 seq_setwidth(seq, TMPSZ - 1);
2710 if (v == SEQ_START_TOKEN) {
2711 seq_puts(seq, " sl local_address rem_address st tx_queue "
2712 "rx_queue tr tm->when retrnsmt uid timeout "
2713 "inode");
2714 goto out;
2715 }
2716 st = seq->private;
2717
2718 if (sk->sk_state == TCP_TIME_WAIT)
2719 get_timewait4_sock(v, seq, st->num);
2720 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2721 get_openreq4(v, seq, st->num);
2722 else
2723 get_tcp4_sock(v, seq, st->num);
2724 out:
2725 seq_pad(seq, '\n');
2726 return 0;
2727 }
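/* Example of the resulting /proc/net/tcp layout (the data line below is
 * fabricated for illustration, not captured output).  A socket listening on
 * 127.0.0.1:22 would be rendered roughly as:
 *
 *   sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode
 *    0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 12345 ...
 *
 * Addresses are the raw __be32 printed in hex (hence 0100007F for 127.0.0.1
 * on a little-endian host), ports are host-order hex, and "st" is the TCP
 * state (0A == TCP_LISTEN).
 */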
2728
2729 #ifdef CONFIG_BPF_SYSCALL
2730 struct bpf_tcp_iter_state {
2731 struct tcp_iter_state state;
2732 unsigned int cur_sk;
2733 unsigned int end_sk;
2734 unsigned int max_sk;
2735 struct sock **batch;
2736 bool st_bucket_done;
2737 };
2738
2739 struct bpf_iter__tcp {
2740 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2741 __bpf_md_ptr(struct sock_common *, sk_common);
2742 uid_t uid __aligned(8);
2743 };
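/* Illustrative sketch of the consumer side (a hypothetical BPF program, not
 * part of this file): a program attached to the "tcp" iterator receives the
 * context defined above once per socket, plus a final call with a NULL
 * sk_common from the ->stop() callback.
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "family=%u uid=%u\n",
 *			       skc->skc_family, ctx->uid);
 *		return 0;
 *	}
 */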
2744
2745 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2746 struct sock_common *sk_common, uid_t uid)
2747 {
2748 struct bpf_iter__tcp ctx;
2749
2750 meta->seq_num--; /* skip SEQ_START_TOKEN */
2751 ctx.meta = meta;
2752 ctx.sk_common = sk_common;
2753 ctx.uid = uid;
2754 return bpf_iter_run_prog(prog, &ctx);
2755 }
2756
2757 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2758 {
2759 while (iter->cur_sk < iter->end_sk)
2760 sock_gen_put(iter->batch[iter->cur_sk++]);
2761 }
2762
2763 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2764 unsigned int new_batch_sz)
2765 {
2766 struct sock **new_batch;
2767
2768 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2769 GFP_USER | __GFP_NOWARN);
2770 if (!new_batch)
2771 return -ENOMEM;
2772
2773 bpf_iter_tcp_put_batch(iter);
2774 kvfree(iter->batch);
2775 iter->batch = new_batch;
2776 iter->max_sk = new_batch_sz;
2777
2778 return 0;
2779 }
2780
2781 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2782 struct sock *start_sk)
2783 {
2784 struct bpf_tcp_iter_state *iter = seq->private;
2785 struct tcp_iter_state *st = &iter->state;
2786 struct inet_connection_sock *icsk;
2787 unsigned int expected = 1;
2788 struct sock *sk;
2789
2790 sock_hold(start_sk);
2791 iter->batch[iter->end_sk++] = start_sk;
2792
2793 icsk = inet_csk(start_sk);
2794 inet_lhash2_for_each_icsk_continue(icsk) {
2795 sk = (struct sock *)icsk;
2796 if (seq_sk_match(seq, sk)) {
2797 if (iter->end_sk < iter->max_sk) {
2798 sock_hold(sk);
2799 iter->batch[iter->end_sk++] = sk;
2800 }
2801 expected++;
2802 }
2803 }
2804 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2805
2806 return expected;
2807 }
2808
2809 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2810 struct sock *start_sk)
2811 {
2812 struct bpf_tcp_iter_state *iter = seq->private;
2813 struct tcp_iter_state *st = &iter->state;
2814 struct hlist_nulls_node *node;
2815 unsigned int expected = 1;
2816 struct sock *sk;
2817
2818 sock_hold(start_sk);
2819 iter->batch[iter->end_sk++] = start_sk;
2820
2821 sk = sk_nulls_next(start_sk);
2822 sk_nulls_for_each_from(sk, node) {
2823 if (seq_sk_match(seq, sk)) {
2824 if (iter->end_sk < iter->max_sk) {
2825 sock_hold(sk);
2826 iter->batch[iter->end_sk++] = sk;
2827 }
2828 expected++;
2829 }
2830 }
2831 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2832
2833 return expected;
2834 }
2835
2836 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2837 {
2838 struct bpf_tcp_iter_state *iter = seq->private;
2839 struct tcp_iter_state *st = &iter->state;
2840 unsigned int expected;
2841 bool resized = false;
2842 struct sock *sk;
2843
2844 /* The st->bucket is done. Directly advance to the next
2845 * bucket instead of having tcp_seek_last_pos() skip the
2846 * sockets in the current bucket one by one, only to find
2847 * out that it has to advance to the next bucket.
2848 */
2849 if (iter->st_bucket_done) {
2850 st->offset = 0;
2851 st->bucket++;
2852 if (st->state == TCP_SEQ_STATE_LISTENING &&
2853 st->bucket > tcp_hashinfo.lhash2_mask) {
2854 st->state = TCP_SEQ_STATE_ESTABLISHED;
2855 st->bucket = 0;
2856 }
2857 }
2858
2859 again:
2860 /* Get a new batch */
2861 iter->cur_sk = 0;
2862 iter->end_sk = 0;
2863 iter->st_bucket_done = false;
2864
2865 sk = tcp_seek_last_pos(seq);
2866 if (!sk)
2867 return NULL; /* Done */
2868
2869 if (st->state == TCP_SEQ_STATE_LISTENING)
2870 expected = bpf_iter_tcp_listening_batch(seq, sk);
2871 else
2872 expected = bpf_iter_tcp_established_batch(seq, sk);
2873
2874 if (iter->end_sk == expected) {
2875 iter->st_bucket_done = true;
2876 return sk;
2877 }
2878
2879 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2880 resized = true;
2881 goto again;
2882 }
2883
2884 return sk;
2885 }
2886
2887 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2888 {
2889 /* bpf iter does not support lseek, so it always
2890 * continues from where it was stop()-ped.
2891 */
2892 if (*pos)
2893 return bpf_iter_tcp_batch(seq);
2894
2895 return SEQ_START_TOKEN;
2896 }
2897
2898 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2899 {
2900 struct bpf_tcp_iter_state *iter = seq->private;
2901 struct tcp_iter_state *st = &iter->state;
2902 struct sock *sk;
2903
2904 /* Whenever seq_next() is called, iter->batch[iter->cur_sk] has
2905 * been consumed by seq_show(), so advance to the next sk in
2906 * the batch.
2907 */
2908 if (iter->cur_sk < iter->end_sk) {
2909 /* Keeping st->num consistent in tcp_iter_state.
2910 * bpf_iter_tcp does not use st->num.
2911 * meta.seq_num is used instead.
2912 */
2913 st->num++;
2914 /* Move st->offset to the next sk in the bucket such that
2915 * the future start() will resume at st->offset in
2916 * st->bucket. See tcp_seek_last_pos().
2917 */
2918 st->offset++;
2919 sock_gen_put(iter->batch[iter->cur_sk++]);
2920 }
2921
2922 if (iter->cur_sk < iter->end_sk)
2923 sk = iter->batch[iter->cur_sk];
2924 else
2925 sk = bpf_iter_tcp_batch(seq);
2926
2927 ++*pos;
2928 /* Keeping st->last_pos consistent in tcp_iter_state.
2929 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2930 */
2931 st->last_pos = *pos;
2932 return sk;
2933 }
2934
2935 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2936 {
2937 struct bpf_iter_meta meta;
2938 struct bpf_prog *prog;
2939 struct sock *sk = v;
2940 bool slow;
2941 uid_t uid;
2942 int ret;
2943
2944 if (v == SEQ_START_TOKEN)
2945 return 0;
2946
2947 if (sk_fullsock(sk))
2948 slow = lock_sock_fast(sk);
2949
2950 if (unlikely(sk_unhashed(sk))) {
2951 ret = SEQ_SKIP;
2952 goto unlock;
2953 }
2954
2955 if (sk->sk_state == TCP_TIME_WAIT) {
2956 uid = 0;
2957 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2958 const struct request_sock *req = v;
2959
2960 uid = from_kuid_munged(seq_user_ns(seq),
2961 sock_i_uid(req->rsk_listener));
2962 } else {
2963 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2964 }
2965
2966 meta.seq = seq;
2967 prog = bpf_iter_get_info(&meta, false);
2968 ret = tcp_prog_seq_show(prog, &meta, v, uid);
2969
2970 unlock:
2971 if (sk_fullsock(sk))
2972 unlock_sock_fast(sk, slow);
2973 return ret;
2974
2975 }
2976
2977 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2978 {
2979 struct bpf_tcp_iter_state *iter = seq->private;
2980 struct bpf_iter_meta meta;
2981 struct bpf_prog *prog;
2982
2983 if (!v) {
2984 meta.seq = seq;
2985 prog = bpf_iter_get_info(&meta, true);
2986 if (prog)
2987 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2988 }
2989
2990 if (iter->cur_sk < iter->end_sk) {
2991 bpf_iter_tcp_put_batch(iter);
2992 iter->st_bucket_done = false;
2993 }
2994 }
2995
2996 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2997 .show = bpf_iter_tcp_seq_show,
2998 .start = bpf_iter_tcp_seq_start,
2999 .next = bpf_iter_tcp_seq_next,
3000 .stop = bpf_iter_tcp_seq_stop,
3001 };
3002 #endif
3003 static unsigned short seq_file_family(const struct seq_file *seq)
3004 {
3005 const struct tcp_seq_afinfo *afinfo;
3006
3007 #ifdef CONFIG_BPF_SYSCALL
3008 /* Iterated from bpf_iter. Let the bpf prog filter instead. */
3009 if (seq->op == &bpf_iter_tcp_seq_ops)
3010 return AF_UNSPEC;
3011 #endif
3012
3013 /* Iterated from proc fs */
3014 afinfo = PDE_DATA(file_inode(seq->file));
3015 return afinfo->family;
3016 }
3017
3018 static const struct seq_operations tcp4_seq_ops = {
3019 .show = tcp4_seq_show,
3020 .start = tcp_seq_start,
3021 .next = tcp_seq_next,
3022 .stop = tcp_seq_stop,
3023 };
3024
3025 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3026 .family = AF_INET,
3027 };
3028
3029 static int __net_init tcp4_proc_init_net(struct net *net)
3030 {
3031 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3032 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3033 return -ENOMEM;
3034 return 0;
3035 }
3036
3037 static void __net_exit tcp4_proc_exit_net(struct net *net)
3038 {
3039 remove_proc_entry("tcp", net->proc_net);
3040 }
3041
3042 static struct pernet_operations tcp4_net_ops = {
3043 .init = tcp4_proc_init_net,
3044 .exit = tcp4_proc_exit_net,
3045 };
3046
3047 int __init tcp4_proc_init(void)
3048 {
3049 return register_pernet_subsys(&tcp4_net_ops);
3050 }
3051
3052 void tcp4_proc_exit(void)
3053 {
3054 unregister_pernet_subsys(&tcp4_net_ops);
3055 }
3056 #endif /* CONFIG_PROC_FS */
3057
3058 /* @wake is one when sk_stream_write_space() calls us.
3059 * This sends EPOLLOUT only once notsent_bytes is below half the limit.
3060 * This mimics the strategy used in sock_def_write_space().
3061 */
3062 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3063 {
3064 const struct tcp_sock *tp = tcp_sk(sk);
3065 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3066 READ_ONCE(tp->snd_nxt);
3067
3068 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3069 }
3070 EXPORT_SYMBOL(tcp_stream_memory_free);
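/* Worked example (assuming TCP_NOTSENT_LOWAT was set to 131072 bytes): with
 * 70000 bytes queued but not yet sent,
 *
 *	wake == 0:  70000       < 131072 -> stream considered writable
 *	wake == 1: (70000 << 1) < 131072 -> false, no EPOLLOUT yet
 *
 * so sk_stream_write_space() only signals EPOLLOUT once the not-sent backlog
 * drains below half the limit (65536 bytes here), giving the hysteresis the
 * comment above describes.
 */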
3071
3072 struct proto tcp_prot = {
3073 .name = "TCP",
3074 .owner = THIS_MODULE,
3075 .close = tcp_close,
3076 .pre_connect = tcp_v4_pre_connect,
3077 .connect = tcp_v4_connect,
3078 .disconnect = tcp_disconnect,
3079 .accept = inet_csk_accept,
3080 .ioctl = tcp_ioctl,
3081 .init = tcp_v4_init_sock,
3082 .destroy = tcp_v4_destroy_sock,
3083 .shutdown = tcp_shutdown,
3084 .setsockopt = tcp_setsockopt,
3085 .getsockopt = tcp_getsockopt,
3086 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3087 .keepalive = tcp_set_keepalive,
3088 .recvmsg = tcp_recvmsg,
3089 .sendmsg = tcp_sendmsg,
3090 .sendpage = tcp_sendpage,
3091 .backlog_rcv = tcp_v4_do_rcv,
3092 .release_cb = tcp_release_cb,
3093 .hash = inet_hash,
3094 .unhash = inet_unhash,
3095 .get_port = inet_csk_get_port,
3096 #ifdef CONFIG_BPF_SYSCALL
3097 .psock_update_sk_prot = tcp_bpf_update_proto,
3098 #endif
3099 .enter_memory_pressure = tcp_enter_memory_pressure,
3100 .leave_memory_pressure = tcp_leave_memory_pressure,
3101 .stream_memory_free = tcp_stream_memory_free,
3102 .sockets_allocated = &tcp_sockets_allocated,
3103 .orphan_count = &tcp_orphan_count,
3104 .memory_allocated = &tcp_memory_allocated,
3105 .memory_pressure = &tcp_memory_pressure,
3106 .sysctl_mem = sysctl_tcp_mem,
3107 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3108 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3109 .max_header = MAX_TCP_HEADER,
3110 .obj_size = sizeof(struct tcp_sock),
3111 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3112 .twsk_prot = &tcp_timewait_sock_ops,
3113 .rsk_prot = &tcp_request_sock_ops,
3114 .h.hashinfo = &tcp_hashinfo,
3115 .no_autobind = true,
3116 .diag_destroy = tcp_abort,
3117 };
3118 EXPORT_SYMBOL(tcp_prot);
3119
3120 static void __net_exit tcp_sk_exit(struct net *net)
3121 {
3122 int cpu;
3123
3124 if (net->ipv4.tcp_congestion_control)
3125 bpf_module_put(net->ipv4.tcp_congestion_control,
3126 net->ipv4.tcp_congestion_control->owner);
3127
3128 for_each_possible_cpu(cpu)
3129 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
3130 free_percpu(net->ipv4.tcp_sk);
3131 }
3132
3133 static int __net_init tcp_sk_init(struct net *net)
3134 {
3135 int res, cpu, cnt;
3136
3137 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
3138 if (!net->ipv4.tcp_sk)
3139 return -ENOMEM;
3140
3141 for_each_possible_cpu(cpu) {
3142 struct sock *sk;
3143
3144 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3145 IPPROTO_TCP, net);
3146 if (res)
3147 goto fail;
3148 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3149
3150 /* Please enforce IP_DF and IPID==0 for RST and
3151 * ACK sent in SYN-RECV and TIME-WAIT state.
3152 */
3153 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3154
3155 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
3156 }
3157
3158 net->ipv4.sysctl_tcp_ecn = 2;
3159 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3160
3161 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3162 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3163 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3164 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3165 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3166
3167 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3168 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3169 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3170
3171 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3172 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3173 net->ipv4.sysctl_tcp_syncookies = 1;
3174 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3175 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3176 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3177 net->ipv4.sysctl_tcp_orphan_retries = 0;
3178 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3179 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3180 net->ipv4.sysctl_tcp_tw_reuse = 2;
3181 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3182
3183 cnt = tcp_hashinfo.ehash_mask + 1;
3184 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3185 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
3186
3187 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3188 net->ipv4.sysctl_tcp_sack = 1;
3189 net->ipv4.sysctl_tcp_window_scaling = 1;
3190 net->ipv4.sysctl_tcp_timestamps = 1;
3191 net->ipv4.sysctl_tcp_early_retrans = 3;
3192 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3193 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3194 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3195 net->ipv4.sysctl_tcp_max_reordering = 300;
3196 net->ipv4.sysctl_tcp_dsack = 1;
3197 net->ipv4.sysctl_tcp_app_win = 31;
3198 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3199 net->ipv4.sysctl_tcp_frto = 2;
3200 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3201 /* This limits the percentage of the congestion window which we
3202 * will allow a single TSO frame to consume. Building TSO frames
3203 * which are too large can cause TCP streams to be bursty.
3204 */
3205 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3206 /* Default TSQ limit of 16 TSO segments */
3207 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3208 /* rfc5961 challenge ack rate limiting */
3209 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3210 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3211 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3212 net->ipv4.sysctl_tcp_autocorking = 1;
3213 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3214 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3215 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3216 if (net != &init_net) {
3217 memcpy(net->ipv4.sysctl_tcp_rmem,
3218 init_net.ipv4.sysctl_tcp_rmem,
3219 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3220 memcpy(net->ipv4.sysctl_tcp_wmem,
3221 init_net.ipv4.sysctl_tcp_wmem,
3222 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3223 }
3224 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3225 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3226 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3227 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3228 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3229 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3230
3231 /* Reno is always built in */
3232 if (!net_eq(net, &init_net) &&
3233 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3234 init_net.ipv4.tcp_congestion_control->owner))
3235 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3236 else
3237 net->ipv4.tcp_congestion_control = &tcp_reno;
3238
3239 return 0;
3240 fail:
3241 tcp_sk_exit(net);
3242
3243 return res;
3244 }
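/* Worked example of the sizing above (hypothetical hash size): with
 * tcp_hashinfo.ehash_mask + 1 == 65536 established-hash buckets,
 *
 *	sysctl_max_tw_buckets  = 65536 / 2           = 32768
 *	sysctl_max_syn_backlog = max(128, 65536/128) = 512
 *
 * so machines with a larger established hash automatically get larger
 * TIME-WAIT and SYN-backlog limits.
 */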
3245
3246 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3247 {
3248 struct net *net;
3249
3250 inet_twsk_purge(&tcp_hashinfo, AF_INET);
3251
3252 list_for_each_entry(net, net_exit_list, exit_list)
3253 tcp_fastopen_ctx_destroy(net);
3254 }
3255
3256 static struct pernet_operations __net_initdata tcp_sk_ops = {
3257 .init = tcp_sk_init,
3258 .exit = tcp_sk_exit,
3259 .exit_batch = tcp_sk_exit_batch,
3260 };
3261
3262 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3263 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3264 struct sock_common *sk_common, uid_t uid)
3265
3266 #define INIT_BATCH_SZ 16
3267
3268 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3269 {
3270 struct bpf_tcp_iter_state *iter = priv_data;
3271 int err;
3272
3273 err = bpf_iter_init_seq_net(priv_data, aux);
3274 if (err)
3275 return err;
3276
3277 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3278 if (err) {
3279 bpf_iter_fini_seq_net(priv_data);
3280 return err;
3281 }
3282
3283 return 0;
3284 }
3285
3286 static void bpf_iter_fini_tcp(void *priv_data)
3287 {
3288 struct bpf_tcp_iter_state *iter = priv_data;
3289
3290 bpf_iter_fini_seq_net(priv_data);
3291 kvfree(iter->batch);
3292 }
3293
3294 static const struct bpf_iter_seq_info tcp_seq_info = {
3295 .seq_ops = &bpf_iter_tcp_seq_ops,
3296 .init_seq_private = bpf_iter_init_tcp,
3297 .fini_seq_private = bpf_iter_fini_tcp,
3298 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3299 };
3300
3301 static const struct bpf_func_proto *
3302 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3303 const struct bpf_prog *prog)
3304 {
3305 switch (func_id) {
3306 case BPF_FUNC_setsockopt:
3307 return &bpf_sk_setsockopt_proto;
3308 case BPF_FUNC_getsockopt:
3309 return &bpf_sk_getsockopt_proto;
3310 default:
3311 return NULL;
3312 }
3313 }
3314
3315 static struct bpf_iter_reg tcp_reg_info = {
3316 .target = "tcp",
3317 .ctx_arg_info_size = 1,
3318 .ctx_arg_info = {
3319 { offsetof(struct bpf_iter__tcp, sk_common),
3320 PTR_TO_BTF_ID_OR_NULL },
3321 },
3322 .get_func_proto = bpf_iter_tcp_get_func_proto,
3323 .seq_info = &tcp_seq_info,
3324 };
3325
3326 static void __init bpf_iter_register(void)
3327 {
3328 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3329 if (bpf_iter_reg_target(&tcp_reg_info))
3330 pr_warn("Warning: could not register bpf iterator tcp\n");
3331 }
3332
3333 #endif
3334
3335 void __init tcp_v4_init(void)
3336 {
3337 if (register_pernet_subsys(&tcp_sk_ops))
3338 panic("Failed to create the TCP control socket.\n");
3339
3340 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3341 bpf_iter_register();
3342 #endif
3343 }
3344