• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80 
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83 
84 #include <trace/events/tcp.h>
85 
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90 
/* Hash info instance handed to the __inet_lookup_established() and
 * __inet_lookup_listener() calls in this file; exported to modules.
 */
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
93 
tcp_v4_init_seq(const struct sk_buff * skb)94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96 	return secure_tcp_seq(ip_hdr(skb)->daddr,
97 			      ip_hdr(skb)->saddr,
98 			      tcp_hdr(skb)->dest,
99 			      tcp_hdr(skb)->source);
100 }
101 
tcp_v4_init_ts_off(const struct net * net,const struct sk_buff * skb)102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106 
tcp_twsk_unique(struct sock * sk,struct sock * sktw,void * twp)107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
110 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
111 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112 	struct tcp_sock *tp = tcp_sk(sk);
113 
114 	if (reuse == 2) {
115 		/* Still does not detect *everything* that goes through
116 		 * lo, since we require a loopback src or dst address
117 		 * or direct binding to 'lo' interface.
118 		 */
119 		bool loopback = false;
120 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121 			loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123 		if (tw->tw_family == AF_INET6) {
124 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128 				loopback = true;
129 		} else
130 #endif
131 		{
132 			if (ipv4_is_loopback(tw->tw_daddr) ||
133 			    ipv4_is_loopback(tw->tw_rcv_saddr))
134 				loopback = true;
135 		}
136 		if (!loopback)
137 			reuse = 0;
138 	}
139 
140 	/* With PAWS, it is safe from the viewpoint
141 	   of data integrity. Even without PAWS it is safe provided sequence
142 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143 
144 	   Actually, the idea is close to VJ's one, only timestamp cache is
145 	   held not per host, but per port pair and TW bucket is used as state
146 	   holder.
147 
148 	   If TW bucket has been already destroyed we fall back to VJ's scheme
149 	   and use initial timestamp retrieved from peer table.
150 	 */
151 	if (tcptw->tw_ts_recent_stamp &&
152 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
153 					    tcptw->tw_ts_recent_stamp)))) {
154 		/* In case of repair and re-using TIME-WAIT sockets we still
155 		 * want to be sure that it is safe as above but honor the
156 		 * sequence numbers and time stamps set as part of the repair
157 		 * process.
158 		 *
159 		 * Without this check re-using a TIME-WAIT socket with TCP
160 		 * repair would accumulate a -1 on the repair assigned
161 		 * sequence number. The first time it is reused the sequence
162 		 * is -1, the second time -2, etc. This fixes that issue
163 		 * without appearing to create any others.
164 		 */
165 		if (likely(!tp->repair)) {
166 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167 
168 			if (!seq)
169 				seq = 1;
170 			WRITE_ONCE(tp->write_seq, seq);
171 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
172 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173 		}
174 		sock_hold(sktw);
175 		return 1;
176 	}
177 
178 	return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181 
tcp_v4_pre_connect(struct sock * sk,struct sockaddr * uaddr,int addr_len)182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183 			      int addr_len)
184 {
185 	/* This check is replicated from tcp_v4_connect() and intended to
186 	 * prevent BPF program called below from accessing bytes that are out
187 	 * of the bound specified by user in addr_len.
188 	 */
189 	if (addr_len < sizeof(struct sockaddr_in))
190 		return -EINVAL;
191 
192 	sock_owned_by_me(sk);
193 
194 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196 
/* This will initiate an outgoing connection.
 *
 * @sk:       socket to connect
 * @uaddr:    destination, interpreted as a struct sockaddr_in
 * @addr_len: length of @uaddr as supplied by the caller
 *
 * Returns 0 on success.  Fails with -EINVAL when @addr_len is too short
 * or when a source route option is set and the destination address is
 * zero, -EAFNOSUPPORT when the family is not AF_INET, -ENETUNREACH when
 * the route is multicast/broadcast, or with the error reported by
 * ip_route_connect(), inet_hash_connect(), ip_route_newports(),
 * tcp_fastopen_defer_connect() or tcp_connect().
 *
 * Once the socket has been moved to SYN-SENT every error goes through the
 * "failure" label, which puts the socket back into TCP_CLOSE and undoes
 * the address/port/route setup done here.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	/* With a source route option the route lookup targets the option's
	 * faddr instead of the user supplied destination.
	 */
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	/* Remember the ports used for the first lookup; they are handed to
	 * ip_route_newports() once the final source port is known.
	 */
	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	/* Without source routing, take the destination from the flow. */
	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	/* Adopt the flow's source address if the socket has none yet. */
	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		/* Nothing left to release on the failure path. */
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	/* Cleared so that the failure path below does not ip_rt_put() it. */
	rt = NULL;

	if (likely(!tp->repair)) {
		/* Only pick a fresh sequence number if none is set yet. */
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = prandom_u32();

	/* A true return ends the connect here with whatever err was set to;
	 * otherwise a non-zero err is treated as a failure.
	 */
	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	/* Keep the source address only if SOCK_BINDADDR_LOCK is set. */
	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
		inet_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
332 EXPORT_SYMBOL(tcp_v4_connect);
333 
334 /*
335  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
336  * It can be called through tcp_release_cb() if socket was owned by user
337  * at the time tcp_v4_err() was called to handle ICMP message.
338  */
tcp_v4_mtu_reduced(struct sock * sk)339 void tcp_v4_mtu_reduced(struct sock *sk)
340 {
341 	struct inet_sock *inet = inet_sk(sk);
342 	struct dst_entry *dst;
343 	u32 mtu;
344 
345 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
346 		return;
347 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
348 	dst = inet_csk_update_pmtu(sk, mtu);
349 	if (!dst)
350 		return;
351 
352 	/* Something is about to be wrong... Remember soft error
353 	 * for the case, if this connection will not able to recover.
354 	 */
355 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
356 		sk->sk_err_soft = EMSGSIZE;
357 
358 	mtu = dst_mtu(dst);
359 
360 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
361 	    ip_sk_accept_pmtu(sk) &&
362 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
363 		tcp_sync_mss(sk, mtu);
364 
365 		/* Resend the TCP packet because it's
366 		 * clear that the old packet has been
367 		 * dropped. This is the new "fast" path mtu
368 		 * discovery.
369 		 */
370 		tcp_simple_retransmit(sk);
371 	} /* else let the usual retransmit timer handle it */
372 }
373 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
374 
do_redirect(struct sk_buff * skb,struct sock * sk)375 static void do_redirect(struct sk_buff *skb, struct sock *sk)
376 {
377 	struct dst_entry *dst = __sk_dst_check(sk, 0);
378 
379 	if (dst)
380 		dst->ops->redirect(dst, sk, skb);
381 }
382 
383 
384 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
tcp_req_err(struct sock * sk,u32 seq,bool abort)385 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
386 {
387 	struct request_sock *req = inet_reqsk(sk);
388 	struct net *net = sock_net(sk);
389 
390 	/* ICMPs are not backlogged, hence we cannot get
391 	 * an established socket here.
392 	 */
393 	if (seq != tcp_rsk(req)->snt_isn) {
394 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
395 	} else if (abort) {
396 		/*
397 		 * Still in SYN_RECV, just remove it silently.
398 		 * There is no good way to pass the error to the newly
399 		 * created socket, and POSIX does not want network
400 		 * errors returned from accept().
401 		 */
402 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
403 		tcp_listendrop(req->rsk_listener);
404 	}
405 	reqsk_put(req);
406 }
407 EXPORT_SYMBOL(tcp_req_err);
408 
409 /* TCP-LD (RFC 6069) logic */
tcp_ld_RTO_revert(struct sock * sk,u32 seq)410 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
411 {
412 	struct inet_connection_sock *icsk = inet_csk(sk);
413 	struct tcp_sock *tp = tcp_sk(sk);
414 	struct sk_buff *skb;
415 	s32 remaining;
416 	u32 delta_us;
417 
418 	if (sock_owned_by_user(sk))
419 		return;
420 
421 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
422 	    !icsk->icsk_backoff)
423 		return;
424 
425 	skb = tcp_rtx_queue_head(sk);
426 	if (WARN_ON_ONCE(!skb))
427 		return;
428 
429 	icsk->icsk_backoff--;
430 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
431 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
432 
433 	tcp_mstamp_refresh(tp);
434 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
435 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
436 
437 	if (remaining > 0) {
438 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
439 					  remaining, TCP_RTO_MAX);
440 	} else {
441 		/* RTO revert clocked out retransmission.
442 		 * Will retransmit now.
443 		 */
444 		tcp_retransmit_timer(sk);
445 	}
446 }
447 EXPORT_SYMBOL(tcp_ld_RTO_revert);
448 
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 * @skb:  ICMP error; skb->data is read as the embedded IP header followed
 *        by the start of the embedded TCP header
 * @info: value taken from the ICMP message; stored as the new MTU for
 *        ICMP_FRAG_NEEDED and passed on to ip_icmp_error()
 *
 * Returns -ENOENT when no matching socket is found, 0 otherwise.
 */
int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	/* TIME-WAIT sockets: just drop the reference and ignore the ICMP. */
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	/* Request sockets are handled (and released) by tcp_req_err(). */
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	/* Ignore ICMPs quoting a sequence number outside [snd_una, snd_nxt]. */
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	/* Translate the ICMP type/code into an errno in err, or finish
	 * early for the types that need no error reporting.
	 */
	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				/* Defer the MTU update; the extra reference is
				 * taken only by whoever sets the flag first.
				 */
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		/* Report a hard error and finish the socket now if we may
		 * touch it; otherwise only record a soft error.
		 */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	/* Common exit: drop the bh lock and the reference from the lookup. */
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}
628 
/* Prepare @skb's TCP checksum fields for the address pair @saddr/@daddr.
 *
 * th->check receives the complement of tcp_v4_check() over skb->len and
 * the two addresses with a zero base sum; csum_start and csum_offset are
 * pointed at the TCP header's check field.
 */
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *tcph = tcp_hdr(skb);

	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
	tcph->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
}
637 
638 /* This routine computes an IPv4 TCP checksum. */
tcp_v4_send_check(struct sock * sk,struct sk_buff * skb)639 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
640 {
641 	const struct inet_sock *inet = inet_sk(sk);
642 
643 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
644 }
645 EXPORT_SYMBOL(tcp_v4_send_check);
646 
647 /*
648  *	This routine will send an RST to the other tcp.
649  *
650  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
651  *		      for reset.
652  *	Answer: if a packet caused RST, it is not for a socket
653  *		existing in our system, if it is matched to a socket,
654  *		it is just duplicate segment or bug in other side's TCP.
655  *		So that we build reply only basing on parameters
656  *		arrived with segment.
657  *	Exception: precedence violation. We do not implement it in any case.
658  */
659 
tcp_v4_send_reset(const struct sock * sk,struct sk_buff * skb)660 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
661 {
662 	const struct tcphdr *th = tcp_hdr(skb);
663 	struct {
664 		struct tcphdr th;
665 #ifdef CONFIG_TCP_MD5SIG
666 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
667 #endif
668 	} rep;
669 	struct ip_reply_arg arg;
670 #ifdef CONFIG_TCP_MD5SIG
671 	struct tcp_md5sig_key *key = NULL;
672 	const __u8 *hash_location = NULL;
673 	unsigned char newhash[16];
674 	int genhash;
675 	struct sock *sk1 = NULL;
676 #endif
677 	u64 transmit_time = 0;
678 	struct sock *ctl_sk;
679 	struct net *net;
680 
681 	/* Never send a reset in response to a reset. */
682 	if (th->rst)
683 		return;
684 
685 	/* If sk not NULL, it means we did a successful lookup and incoming
686 	 * route had to be correct. prequeue might have dropped our dst.
687 	 */
688 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
689 		return;
690 
691 	/* Swap the send and the receive. */
692 	memset(&rep, 0, sizeof(rep));
693 	rep.th.dest   = th->source;
694 	rep.th.source = th->dest;
695 	rep.th.doff   = sizeof(struct tcphdr) / 4;
696 	rep.th.rst    = 1;
697 
698 	if (th->ack) {
699 		rep.th.seq = th->ack_seq;
700 	} else {
701 		rep.th.ack = 1;
702 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
703 				       skb->len - (th->doff << 2));
704 	}
705 
706 	memset(&arg, 0, sizeof(arg));
707 	arg.iov[0].iov_base = (unsigned char *)&rep;
708 	arg.iov[0].iov_len  = sizeof(rep.th);
709 
710 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
711 #ifdef CONFIG_TCP_MD5SIG
712 	rcu_read_lock();
713 	hash_location = tcp_parse_md5sig_option(th);
714 	if (sk && sk_fullsock(sk)) {
715 		const union tcp_md5_addr *addr;
716 		int l3index;
717 
718 		/* sdif set, means packet ingressed via a device
719 		 * in an L3 domain and inet_iif is set to it.
720 		 */
721 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
722 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
723 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
724 	} else if (hash_location) {
725 		const union tcp_md5_addr *addr;
726 		int sdif = tcp_v4_sdif(skb);
727 		int dif = inet_iif(skb);
728 		int l3index;
729 
730 		/*
731 		 * active side is lost. Try to find listening socket through
732 		 * source port, and then find md5 key through listening socket.
733 		 * we are not loose security here:
734 		 * Incoming packet is checked with md5 hash with finding key,
735 		 * no RST generated if md5 hash doesn't match.
736 		 */
737 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
738 					     ip_hdr(skb)->saddr,
739 					     th->source, ip_hdr(skb)->daddr,
740 					     ntohs(th->source), dif, sdif);
741 		/* don't send rst if it can't find key */
742 		if (!sk1)
743 			goto out;
744 
745 		/* sdif set, means packet ingressed via a device
746 		 * in an L3 domain and dif is set to it.
747 		 */
748 		l3index = sdif ? dif : 0;
749 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
750 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
751 		if (!key)
752 			goto out;
753 
754 
755 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
756 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
757 			goto out;
758 
759 	}
760 
761 	if (key) {
762 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
763 				   (TCPOPT_NOP << 16) |
764 				   (TCPOPT_MD5SIG << 8) |
765 				   TCPOLEN_MD5SIG);
766 		/* Update length and the length the header thinks exists */
767 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
768 		rep.th.doff = arg.iov[0].iov_len / 4;
769 
770 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
771 				     key, ip_hdr(skb)->saddr,
772 				     ip_hdr(skb)->daddr, &rep.th);
773 	}
774 #endif
775 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
776 				      ip_hdr(skb)->saddr, /* XXX */
777 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
778 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
779 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
780 
781 	/* When socket is gone, all binding information is lost.
782 	 * routing might fail in this case. No choice here, if we choose to force
783 	 * input interface, we will misroute in case of asymmetric route.
784 	 */
785 	if (sk) {
786 		arg.bound_dev_if = sk->sk_bound_dev_if;
787 		if (sk_fullsock(sk))
788 			trace_tcp_send_reset(sk, skb);
789 	}
790 
791 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
792 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
793 
794 	arg.tos = ip_hdr(skb)->tos;
795 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
796 	local_bh_disable();
797 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
798 	if (sk) {
799 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
800 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
801 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
802 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
803 		transmit_time = tcp_transmit_time(sk);
804 	}
805 	ip_send_unicast_reply(ctl_sk,
806 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
807 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
808 			      &arg, arg.iov[0].iov_len,
809 			      transmit_time);
810 
811 	ctl_sk->sk_mark = 0;
812 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
813 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
814 	local_bh_enable();
815 
816 #ifdef CONFIG_TCP_MD5SIG
817 out:
818 	rcu_read_unlock();
819 #endif
820 }
821 
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 * outside of socket context, is certainly ugly. What can I do?
 */
825 
/* Build and send a bare ACK in reply to @skb through the per-CPU control
 * socket.
 *
 * @sk:          socket the ACK is sent on behalf of; may be a TIME-WAIT
 *               socket (sk_state is checked before reading mark/priority)
 * @skb:         segment being answered; ports and addresses are swapped
 * @seq, @ack:   sequence and acknowledgment numbers (host order)
 * @win:         window value placed in the header
 * @tsval:       timestamp value; only used when @tsecr is non-zero
 * @tsecr:       timestamp echo; zero means no timestamp option is added
 * @oif:         if non-zero, used as arg.bound_dev_if
 * @key:         MD5 key; with CONFIG_TCP_MD5SIG and a non-NULL key an MD5
 *               signature option is appended
 * @reply_flags: copied to arg.flags
 * @tos:         copied to arg.tos
 */
static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* Header plus room for the timestamp and (optionally) MD5 options. */
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	/* Only the header is cleared; option words are written explicitly
	 * below and only iov_len bytes are handed to the send routine.
	 */
	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		/* The MD5 option goes after the three timestamp words, if
		 * those were written above.
		 */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	/* Mark and priority come from the timewait fields for TIME-WAIT
	 * sockets, from the socket itself otherwise.
	 */
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : sk->sk_priority;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	/* Clear the mark again once the reply has been handed off. */
	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}
911 
tcp_v4_timewait_ack(struct sock * sk,struct sk_buff * skb)912 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
913 {
914 	struct inet_timewait_sock *tw = inet_twsk(sk);
915 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
916 
917 	tcp_v4_send_ack(sk, skb,
918 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
919 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
920 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
921 			tcptw->tw_ts_recent,
922 			tw->tw_bound_dev_if,
923 			tcp_twsk_md5_key(tcptw),
924 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
925 			tw->tw_tos
926 			);
927 
928 	inet_twsk_put(tw);
929 }
930 
tcp_v4_reqsk_send_ack(const struct sock * sk,struct sk_buff * skb,struct request_sock * req)931 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
932 				  struct request_sock *req)
933 {
934 	const union tcp_md5_addr *addr;
935 	int l3index;
936 
937 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
938 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
939 	 */
940 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
941 					     tcp_sk(sk)->snd_nxt;
942 
943 	/* RFC 7323 2.3
944 	 * The window field (SEG.WND) of every outgoing segment, with the
945 	 * exception of <SYN> segments, MUST be right-shifted by
946 	 * Rcv.Wind.Shift bits:
947 	 */
948 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
949 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
950 	tcp_v4_send_ack(sk, skb, seq,
951 			tcp_rsk(req)->rcv_nxt,
952 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
953 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
954 			READ_ONCE(req->ts_recent),
955 			0,
956 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
957 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
958 			ip_hdr(skb)->tos);
959 }
960 
961 /*
962  *	Send a SYN-ACK after having received a SYN.
963  *	This still operates on a request_sock only, not on a big
964  *	socket.
965  */
tcp_v4_send_synack(const struct sock * sk,struct dst_entry * dst,struct flowi * fl,struct request_sock * req,struct tcp_fastopen_cookie * foc,enum tcp_synack_type synack_type,struct sk_buff * syn_skb)966 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
967 			      struct flowi *fl,
968 			      struct request_sock *req,
969 			      struct tcp_fastopen_cookie *foc,
970 			      enum tcp_synack_type synack_type,
971 			      struct sk_buff *syn_skb)
972 {
973 	const struct inet_request_sock *ireq = inet_rsk(req);
974 	struct flowi4 fl4;
975 	int err = -1;
976 	struct sk_buff *skb;
977 	u8 tos;
978 
979 	/* First, grab a route. */
980 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
981 		return -1;
982 
983 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
984 
985 	if (skb) {
986 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
987 
988 		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
989 				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
990 				(inet_sk(sk)->tos & INET_ECN_MASK) :
991 				inet_sk(sk)->tos;
992 
993 		if (!INET_ECN_is_capable(tos) &&
994 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
995 			tos |= INET_ECN_ECT_0;
996 
997 		rcu_read_lock();
998 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
999 					    ireq->ir_rmt_addr,
1000 					    rcu_dereference(ireq->ireq_opt),
1001 					    tos);
1002 		rcu_read_unlock();
1003 		err = net_xmit_eval(err);
1004 	}
1005 
1006 	return err;
1007 }
1008 
1009 /*
1010  *	IPv4 request_sock destructor.
1011  */
tcp_v4_reqsk_destructor(struct request_sock * req)1012 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1013 {
1014 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1015 }
1016 
1017 #ifdef CONFIG_TCP_MD5SIG
1018 /*
1019  * RFC2385 MD5 checksumming requires a mapping of
1020  * IP address->MD5 Key.
1021  * We need to maintain these in the sk structure.
1022  */
1023 
/* Static key tcp_md5_needed, defined in the "false" state and exported.
 * NOTE(review): presumably switched on once an MD5 key is configured so
 * that MD5 lookups can be skipped otherwise; the enabling site is not
 * visible in this part of the file - confirm.
 */
1024 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1025 EXPORT_SYMBOL(tcp_md5_needed);
1026 
better_md5_match(struct tcp_md5sig_key * old,struct tcp_md5sig_key * new)1027 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1028 {
1029 	if (!old)
1030 		return true;
1031 
1032 	/* l3index always overrides non-l3index */
1033 	if (old->l3index && new->l3index == 0)
1034 		return false;
1035 	if (old->l3index == 0 && new->l3index)
1036 		return true;
1037 
1038 	return old->prefixlen < new->prefixlen;
1039 }
1040 
1041 /* Find the Key structure for an address.  */
__tcp_md5_do_lookup(const struct sock * sk,int l3index,const union tcp_md5_addr * addr,int family)1042 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1043 					   const union tcp_md5_addr *addr,
1044 					   int family)
1045 {
1046 	const struct tcp_sock *tp = tcp_sk(sk);
1047 	struct tcp_md5sig_key *key;
1048 	const struct tcp_md5sig_info *md5sig;
1049 	__be32 mask;
1050 	struct tcp_md5sig_key *best_match = NULL;
1051 	bool match;
1052 
1053 	/* caller either holds rcu_read_lock() or socket lock */
1054 	md5sig = rcu_dereference_check(tp->md5sig_info,
1055 				       lockdep_sock_is_held(sk));
1056 	if (!md5sig)
1057 		return NULL;
1058 
1059 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1060 				 lockdep_sock_is_held(sk)) {
1061 		if (key->family != family)
1062 			continue;
1063 		if (key->l3index && key->l3index != l3index)
1064 			continue;
1065 		if (family == AF_INET) {
1066 			mask = inet_make_mask(key->prefixlen);
1067 			match = (key->addr.a4.s_addr & mask) ==
1068 				(addr->a4.s_addr & mask);
1069 #if IS_ENABLED(CONFIG_IPV6)
1070 		} else if (family == AF_INET6) {
1071 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1072 						  key->prefixlen);
1073 #endif
1074 		} else {
1075 			match = false;
1076 		}
1077 
1078 		if (match && better_md5_match(best_match, key))
1079 			best_match = key;
1080 	}
1081 	return best_match;
1082 }
1083 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1084 
/* Find the key of @sk whose family, L3 index, prefix length and address
 * are all exactly equal to the arguments.  Unlike __tcp_md5_do_lookup()
 * no prefix matching is done.  Returns NULL if there is no such key.
 */
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index)
{
	const struct tcp_md5sig_info *md5sig;
	unsigned int addr_len = sizeof(struct in_addr);
	struct tcp_md5sig_key *cand;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tcp_sk(sk)->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6)
		addr_len = sizeof(struct in6_addr);

	hlist_for_each_entry_rcu(cand, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (cand->family != family ||
		    cand->l3index != l3index ||
		    cand->prefixlen != prefixlen)
			continue;
		if (!memcmp(&cand->addr, addr, addr_len))
			return cand;
	}

	return NULL;
}
1116 
tcp_v4_md5_lookup(const struct sock * sk,const struct sock * addr_sk)1117 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1118 					 const struct sock *addr_sk)
1119 {
1120 	const union tcp_md5_addr *addr;
1121 	int l3index;
1122 
1123 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1124 						 addr_sk->sk_bound_dev_if);
1125 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1126 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1127 }
1128 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1129 
1130 /* This can be called on a newly created socket, from other files */
tcp_md5_do_add(struct sock * sk,const union tcp_md5_addr * addr,int family,u8 prefixlen,int l3index,const u8 * newkey,u8 newkeylen,gfp_t gfp)1131 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1132 		   int family, u8 prefixlen, int l3index,
1133 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1134 {
1135 	/* Add Key to the list */
1136 	struct tcp_md5sig_key *key;
1137 	struct tcp_sock *tp = tcp_sk(sk);
1138 	struct tcp_md5sig_info *md5sig;
1139 
1140 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1141 	if (key) {
1142 		/* Pre-existing entry - just update that one.
1143 		 * Note that the key might be used concurrently.
1144 		 * data_race() is telling kcsan that we do not care of
1145 		 * key mismatches, since changing MD5 key on live flows
1146 		 * can lead to packet drops.
1147 		 */
1148 		data_race(memcpy(key->key, newkey, newkeylen));
1149 
1150 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1151 		 * Also note that a reader could catch new key->keylen value
1152 		 * but old key->key[], this is the reason we use __GFP_ZERO
1153 		 * at sock_kmalloc() time below these lines.
1154 		 */
1155 		WRITE_ONCE(key->keylen, newkeylen);
1156 
1157 		return 0;
1158 	}
1159 
1160 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1161 					   lockdep_sock_is_held(sk));
1162 	if (!md5sig) {
1163 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1164 		if (!md5sig)
1165 			return -ENOMEM;
1166 
1167 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1168 		INIT_HLIST_HEAD(&md5sig->head);
1169 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1170 	}
1171 
1172 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1173 	if (!key)
1174 		return -ENOMEM;
1175 	if (!tcp_alloc_md5sig_pool()) {
1176 		sock_kfree_s(sk, key, sizeof(*key));
1177 		return -ENOMEM;
1178 	}
1179 
1180 	memcpy(key->key, newkey, newkeylen);
1181 	key->keylen = newkeylen;
1182 	key->family = family;
1183 	key->prefixlen = prefixlen;
1184 	key->l3index = l3index;
1185 	memcpy(&key->addr, addr,
1186 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1187 				      sizeof(struct in_addr));
1188 	hlist_add_head_rcu(&key->node, &md5sig->head);
1189 	return 0;
1190 }
1191 EXPORT_SYMBOL(tcp_md5_do_add);
1192 
/* Remove the MD5 key of @sk that exactly matches the given address,
 * family, prefix length and L3 index.  Returns 0 on success or -ENOENT
 * if no such key exists.
 */
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index)
{
	struct tcp_md5sig_key *victim;

	victim = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
	if (!victim)
		return -ENOENT;

	/* Unlink now, give back the socket's option memory accounting,
	 * and free the key through kfree_rcu().
	 */
	hlist_del_rcu(&victim->node);
	atomic_sub(sizeof(*victim), &sk->sk_omem_alloc);
	kfree_rcu(victim, rcu);

	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);
1207 
tcp_clear_md5_list(struct sock * sk)1208 static void tcp_clear_md5_list(struct sock *sk)
1209 {
1210 	struct tcp_sock *tp = tcp_sk(sk);
1211 	struct tcp_md5sig_key *key;
1212 	struct hlist_node *n;
1213 	struct tcp_md5sig_info *md5sig;
1214 
1215 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1216 
1217 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1218 		hlist_del_rcu(&key->node);
1219 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1220 		kfree_rcu(key, rcu);
1221 	}
1222 }
1223 
/* Handle the TCP_MD5SIG / TCP_MD5SIG_EXT socket options for IPv4.
 *
 * Copies a struct tcp_md5sig from @optval and, depending on its key
 * length, deletes (length 0) or adds/updates the key for the given
 * address.  Prefix length and interface index are only honoured for
 * TCP_MD5SIG_EXT with the corresponding flag set.
 *
 * Returns 0 or a negative errno (-EINVAL, -EFAULT, or whatever the
 * add/del helper returns).
 */
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	bool extended = optname == TCP_MD5SIG_EXT;
	const union tcp_md5_addr *addr;
	struct sockaddr_in *sin;
	struct tcp_md5sig cmd;
	u8 prefixlen = 32;
	int l3index = 0;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (extended && (cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX)) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (extended && (cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX)) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;
		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	/* A zero key length is a request to delete the key. */
	if (cmd.tcpm_keylen == 0)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}
1278 
/* Feed the IPv4 pseudo-header followed by the fixed TCP header (with a
 * zeroed checksum field) into the running hash of @hp.  Both are staged
 * in the pool's scratch buffer.  @nbytes is the length stored in the
 * pseudo-header.  Returns the result of crypto_ahash_update().
 */
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp = hp->scratch;
	struct tcphdr *th_copy = (struct tcphdr *)(bp + 1);
	unsigned int len = sizeof(*bp) + sizeof(*th);
	struct scatterlist sg;

	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	/* Hash a private copy so the checksum can be cleared. */
	memcpy(th_copy, th, sizeof(*th));
	th_copy->check = 0;

	sg_init_one(&sg, bp, len);
	ahash_request_set_crypt(hp->md5_req, &sg, NULL, len);

	return crypto_ahash_update(hp->md5_req);
}
1303 
/* Compute the MD5 signature over pseudo-header, TCP header @th and @key
 * (no payload) into the 16-byte buffer @md5_hash.
 *
 * Returns 0 on success.  On any failure the output buffer is zeroed and
 * 1 is returned.
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp = tcp_get_md5sig_pool();
	struct ahash_request *req;

	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	/* Each step returns non-zero on error; stop at the first one. */
	if (crypto_ahash_init(req) ||
	    tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2) ||
	    tcp_md5_hash_key(hp, key))
		goto clear_hash;

	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
1334 
tcp_v4_md5_hash_skb(char * md5_hash,const struct tcp_md5sig_key * key,const struct sock * sk,const struct sk_buff * skb)1335 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1336 			const struct sock *sk,
1337 			const struct sk_buff *skb)
1338 {
1339 	struct tcp_md5sig_pool *hp;
1340 	struct ahash_request *req;
1341 	const struct tcphdr *th = tcp_hdr(skb);
1342 	__be32 saddr, daddr;
1343 
1344 	if (sk) { /* valid for establish/request sockets */
1345 		saddr = sk->sk_rcv_saddr;
1346 		daddr = sk->sk_daddr;
1347 	} else {
1348 		const struct iphdr *iph = ip_hdr(skb);
1349 		saddr = iph->saddr;
1350 		daddr = iph->daddr;
1351 	}
1352 
1353 	hp = tcp_get_md5sig_pool();
1354 	if (!hp)
1355 		goto clear_hash_noput;
1356 	req = hp->md5_req;
1357 
1358 	if (crypto_ahash_init(req))
1359 		goto clear_hash;
1360 
1361 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1362 		goto clear_hash;
1363 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1364 		goto clear_hash;
1365 	if (tcp_md5_hash_key(hp, key))
1366 		goto clear_hash;
1367 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1368 	if (crypto_ahash_final(req))
1369 		goto clear_hash;
1370 
1371 	tcp_put_md5sig_pool();
1372 	return 0;
1373 
1374 clear_hash:
1375 	tcp_put_md5sig_pool();
1376 clear_hash_noput:
1377 	memset(md5_hash, 0, 16);
1378 	return 1;
1379 }
1380 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1381 
1382 #endif
1383 
1384 /* Called with rcu_read_lock() */
tcp_v4_inbound_md5_hash(const struct sock * sk,const struct sk_buff * skb,int dif,int sdif)1385 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1386 				    const struct sk_buff *skb,
1387 				    int dif, int sdif)
1388 {
1389 #ifdef CONFIG_TCP_MD5SIG
1390 	/*
1391 	 * This gets called for each TCP segment that arrives
1392 	 * so we want to be efficient.
1393 	 * We have 3 drop cases:
1394 	 * o No MD5 hash and one expected.
1395 	 * o MD5 hash and we're not expecting one.
1396 	 * o MD5 hash and its wrong.
1397 	 */
1398 	const __u8 *hash_location = NULL;
1399 	struct tcp_md5sig_key *hash_expected;
1400 	const struct iphdr *iph = ip_hdr(skb);
1401 	const struct tcphdr *th = tcp_hdr(skb);
1402 	const union tcp_md5_addr *addr;
1403 	unsigned char newhash[16];
1404 	int genhash, l3index;
1405 
1406 	/* sdif set, means packet ingressed via a device
1407 	 * in an L3 domain and dif is set to the l3mdev
1408 	 */
1409 	l3index = sdif ? dif : 0;
1410 
1411 	addr = (union tcp_md5_addr *)&iph->saddr;
1412 	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1413 	hash_location = tcp_parse_md5sig_option(th);
1414 
1415 	/* We've parsed the options - do we have a hash? */
1416 	if (!hash_expected && !hash_location)
1417 		return false;
1418 
1419 	if (hash_expected && !hash_location) {
1420 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1421 		return true;
1422 	}
1423 
1424 	if (!hash_expected && hash_location) {
1425 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1426 		return true;
1427 	}
1428 
1429 	/* Okay, so this is hash_expected and hash_location -
1430 	 * so we need to calculate the checksum.
1431 	 */
1432 	genhash = tcp_v4_md5_hash_skb(newhash,
1433 				      hash_expected,
1434 				      NULL, skb);
1435 
1436 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1437 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1438 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1439 				     &iph->saddr, ntohs(th->source),
1440 				     &iph->daddr, ntohs(th->dest),
1441 				     genhash ? " tcp_v4_calc_md5_hash failed"
1442 				     : "", l3index);
1443 		return true;
1444 	}
1445 	return false;
1446 #endif
1447 	return false;
1448 }
1449 
tcp_v4_init_req(struct request_sock * req,const struct sock * sk_listener,struct sk_buff * skb)1450 static void tcp_v4_init_req(struct request_sock *req,
1451 			    const struct sock *sk_listener,
1452 			    struct sk_buff *skb)
1453 {
1454 	struct inet_request_sock *ireq = inet_rsk(req);
1455 	struct net *net = sock_net(sk_listener);
1456 
1457 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1458 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1459 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1460 }
1461 
tcp_v4_route_req(const struct sock * sk,struct flowi * fl,const struct request_sock * req)1462 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1463 					  struct flowi *fl,
1464 					  const struct request_sock *req)
1465 {
1466 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1467 }
1468 
1469 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1470 	.family		=	PF_INET,
1471 	.obj_size	=	sizeof(struct tcp_request_sock),
1472 	.rtx_syn_ack	=	tcp_rtx_synack,
1473 	.send_ack	=	tcp_v4_reqsk_send_ack,
1474 	.destructor	=	tcp_v4_reqsk_destructor,
1475 	.send_reset	=	tcp_v4_send_reset,
1476 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1477 };
1478 
1479 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1480 	.mss_clamp	=	TCP_MSS_DEFAULT,
1481 #ifdef CONFIG_TCP_MD5SIG
1482 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1483 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1484 #endif
1485 	.init_req	=	tcp_v4_init_req,
1486 #ifdef CONFIG_SYN_COOKIES
1487 	.cookie_init_seq =	cookie_v4_init_sequence,
1488 #endif
1489 	.route_req	=	tcp_v4_route_req,
1490 	.init_seq	=	tcp_v4_init_seq,
1491 	.init_ts_off	=	tcp_v4_init_ts_off,
1492 	.send_synack	=	tcp_v4_send_synack,
1493 };
1494 
tcp_v4_conn_request(struct sock * sk,struct sk_buff * skb)1495 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1496 {
1497 	/* Never answer to SYNs send to broadcast or multicast */
1498 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1499 		goto drop;
1500 
1501 	return tcp_conn_request(&tcp_request_sock_ops,
1502 				&tcp_request_sock_ipv4_ops, sk, skb);
1503 
1504 drop:
1505 	tcp_listendrop(sk);
1506 	return 0;
1507 }
1508 EXPORT_SYMBOL(tcp_v4_conn_request);
1509 
1510 
1511 /*
1512  * The three way handshake has completed - we got a valid synack -
1513  * now create the new socket.
1514  */
tcp_v4_syn_recv_sock(const struct sock * sk,struct sk_buff * skb,struct request_sock * req,struct dst_entry * dst,struct request_sock * req_unhash,bool * own_req)1515 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1516 				  struct request_sock *req,
1517 				  struct dst_entry *dst,
1518 				  struct request_sock *req_unhash,
1519 				  bool *own_req)
1520 {
1521 	struct inet_request_sock *ireq;
1522 	bool found_dup_sk = false;
1523 	struct inet_sock *newinet;
1524 	struct tcp_sock *newtp;
1525 	struct sock *newsk;
1526 #ifdef CONFIG_TCP_MD5SIG
1527 	const union tcp_md5_addr *addr;
1528 	struct tcp_md5sig_key *key;
1529 	int l3index;
1530 #endif
1531 	struct ip_options_rcu *inet_opt;
1532 
1533 	if (sk_acceptq_is_full(sk))
1534 		goto exit_overflow;
1535 
1536 	newsk = tcp_create_openreq_child(sk, req, skb);
1537 	if (!newsk)
1538 		goto exit_nonewsk;
1539 
1540 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1541 	inet_sk_rx_dst_set(newsk, skb);
1542 
1543 	newtp		      = tcp_sk(newsk);
1544 	newinet		      = inet_sk(newsk);
1545 	ireq		      = inet_rsk(req);
1546 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1547 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1548 	newsk->sk_bound_dev_if = ireq->ir_iif;
1549 	newinet->inet_saddr   = ireq->ir_loc_addr;
1550 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1551 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1552 	newinet->mc_index     = inet_iif(skb);
1553 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1554 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1555 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1556 	if (inet_opt)
1557 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1558 	newinet->inet_id = prandom_u32();
1559 
1560 	/* Set ToS of the new socket based upon the value of incoming SYN.
1561 	 * ECT bits are set later in tcp_init_transfer().
1562 	 */
1563 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1564 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1565 
1566 	if (!dst) {
1567 		dst = inet_csk_route_child_sock(sk, newsk, req);
1568 		if (!dst)
1569 			goto put_and_exit;
1570 	} else {
1571 		/* syncookie case : see end of cookie_v4_check() */
1572 	}
1573 	sk_setup_caps(newsk, dst);
1574 
1575 	tcp_ca_openreq_child(newsk, dst);
1576 
1577 	tcp_sync_mss(newsk, dst_mtu(dst));
1578 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1579 
1580 	tcp_initialize_rcv_mss(newsk);
1581 
1582 #ifdef CONFIG_TCP_MD5SIG
1583 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1584 	/* Copy over the MD5 key from the original socket */
1585 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1586 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1587 	if (key) {
1588 		/*
1589 		 * We're using one, so create a matching key
1590 		 * on the newsk structure. If we fail to get
1591 		 * memory, then we end up not copying the key
1592 		 * across. Shucks.
1593 		 */
1594 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1595 			       key->key, key->keylen, GFP_ATOMIC);
1596 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1597 	}
1598 #endif
1599 
1600 	if (__inet_inherit_port(sk, newsk) < 0)
1601 		goto put_and_exit;
1602 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1603 				       &found_dup_sk);
1604 	if (likely(*own_req)) {
1605 		tcp_move_syn(newtp, req);
1606 		ireq->ireq_opt = NULL;
1607 	} else {
1608 		newinet->inet_opt = NULL;
1609 
1610 		if (!req_unhash && found_dup_sk) {
1611 			/* This code path should only be executed in the
1612 			 * syncookie case only
1613 			 */
1614 			bh_unlock_sock(newsk);
1615 			sock_put(newsk);
1616 			newsk = NULL;
1617 		}
1618 	}
1619 	return newsk;
1620 
1621 exit_overflow:
1622 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1623 exit_nonewsk:
1624 	dst_release(dst);
1625 exit:
1626 	tcp_listendrop(sk);
1627 	return NULL;
1628 put_and_exit:
1629 	newinet->inet_opt = NULL;
1630 	inet_csk_prepare_forced_close(newsk);
1631 	tcp_done(newsk);
1632 	goto exit;
1633 }
1634 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1635 
/* With CONFIG_SYN_COOKIES, pass any non-SYN segment arriving on @sk to
 * cookie_v4_check() and return its result; otherwise (or for a SYN)
 * return @sk unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
1646 
tcp_v4_get_syncookie(struct sock * sk,struct iphdr * iph,struct tcphdr * th,u32 * cookie)1647 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1648 			 struct tcphdr *th, u32 *cookie)
1649 {
1650 	u16 mss = 0;
1651 #ifdef CONFIG_SYN_COOKIES
1652 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1653 				    &tcp_request_sock_ipv4_ops, sk, th);
1654 	if (mss) {
1655 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1656 		tcp_synq_overflow(sk);
1657 	}
1658 #endif
1659 	return mss;
1660 }
1661 
1662 /* The socket must have it's spinlock held when we get
1663  * here, unless it is a TCP_LISTEN socket.
1664  *
1665  * We have a potential double-lock case here, so even when
1666  * doing backlog processing we use the BH locking scheme.
1667  * This is because we cannot sleep with the original spinlock
1668  * held.
1669  */
tcp_v4_do_rcv(struct sock * sk,struct sk_buff * skb)1670 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1671 {
1672 	struct sock *rsk;
1673 
1674 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1675 		struct dst_entry *dst;
1676 
1677 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1678 						lockdep_sock_is_held(sk));
1679 
1680 		sock_rps_save_rxhash(sk, skb);
1681 		sk_mark_napi_id(sk, skb);
1682 		if (dst) {
1683 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1684 			    !dst->ops->check(dst, 0)) {
1685 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1686 				dst_release(dst);
1687 			}
1688 		}
1689 		tcp_rcv_established(sk, skb);
1690 		return 0;
1691 	}
1692 
1693 	if (tcp_checksum_complete(skb))
1694 		goto csum_err;
1695 
1696 	if (sk->sk_state == TCP_LISTEN) {
1697 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1698 
1699 		if (!nsk)
1700 			goto discard;
1701 		if (nsk != sk) {
1702 			if (tcp_child_process(sk, nsk, skb)) {
1703 				rsk = nsk;
1704 				goto reset;
1705 			}
1706 			return 0;
1707 		}
1708 	} else
1709 		sock_rps_save_rxhash(sk, skb);
1710 
1711 	if (tcp_rcv_state_process(sk, skb)) {
1712 		rsk = sk;
1713 		goto reset;
1714 	}
1715 	return 0;
1716 
1717 reset:
1718 	tcp_v4_send_reset(rsk, skb);
1719 discard:
1720 	kfree_skb(skb);
1721 	/* Be careful here. If this function gets more complicated and
1722 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1723 	 * might be destroyed here. This current version compiles correctly,
1724 	 * but you have been warned.
1725 	 */
1726 	return 0;
1727 
1728 csum_err:
1729 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1730 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1731 	goto discard;
1732 }
1733 EXPORT_SYMBOL(tcp_v4_do_rcv);
1734 
tcp_v4_early_demux(struct sk_buff * skb)1735 int tcp_v4_early_demux(struct sk_buff *skb)
1736 {
1737 	const struct iphdr *iph;
1738 	const struct tcphdr *th;
1739 	struct sock *sk;
1740 
1741 	if (skb->pkt_type != PACKET_HOST)
1742 		return 0;
1743 
1744 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1745 		return 0;
1746 
1747 	iph = ip_hdr(skb);
1748 	th = tcp_hdr(skb);
1749 
1750 	if (th->doff < sizeof(struct tcphdr) / 4)
1751 		return 0;
1752 
1753 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1754 				       iph->saddr, th->source,
1755 				       iph->daddr, ntohs(th->dest),
1756 				       skb->skb_iif, inet_sdif(skb));
1757 	if (sk) {
1758 		skb->sk = sk;
1759 		skb->destructor = sock_edemux;
1760 		if (sk_fullsock(sk)) {
1761 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1762 
1763 			if (dst)
1764 				dst = dst_check(dst, 0);
1765 			if (dst &&
1766 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1767 				skb_dst_set_noref(skb, dst);
1768 		}
1769 	}
1770 	return 0;
1771 }
1772 
tcp_add_backlog(struct sock * sk,struct sk_buff * skb)1773 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1774 {
1775 	u32 limit, tail_gso_size, tail_gso_segs;
1776 	struct skb_shared_info *shinfo;
1777 	const struct tcphdr *th;
1778 	struct tcphdr *thtail;
1779 	struct sk_buff *tail;
1780 	unsigned int hdrlen;
1781 	bool fragstolen;
1782 	u32 gso_segs;
1783 	u32 gso_size;
1784 	int delta;
1785 
1786 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1787 	 * we can fix skb->truesize to its real value to avoid future drops.
1788 	 * This is valid because skb is not yet charged to the socket.
1789 	 * It has been noticed pure SACK packets were sometimes dropped
1790 	 * (if cooked by drivers without copybreak feature).
1791 	 */
1792 	skb_condense(skb);
1793 
1794 	skb_dst_drop(skb);
1795 
1796 	if (unlikely(tcp_checksum_complete(skb))) {
1797 		bh_unlock_sock(sk);
1798 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1799 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1800 		return true;
1801 	}
1802 
1803 	/* Attempt coalescing to last skb in backlog, even if we are
1804 	 * above the limits.
1805 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1806 	 */
1807 	th = (const struct tcphdr *)skb->data;
1808 	hdrlen = th->doff * 4;
1809 
1810 	tail = sk->sk_backlog.tail;
1811 	if (!tail)
1812 		goto no_coalesce;
1813 	thtail = (struct tcphdr *)tail->data;
1814 
1815 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1816 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1817 	    ((TCP_SKB_CB(tail)->tcp_flags |
1818 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1819 	    !((TCP_SKB_CB(tail)->tcp_flags &
1820 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1821 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1822 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1823 #ifdef CONFIG_TLS_DEVICE
1824 	    tail->decrypted != skb->decrypted ||
1825 #endif
1826 	    !mptcp_skb_can_collapse(tail, skb) ||
1827 	    thtail->doff != th->doff ||
1828 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1829 		goto no_coalesce;
1830 
1831 	__skb_pull(skb, hdrlen);
1832 
1833 	shinfo = skb_shinfo(skb);
1834 	gso_size = shinfo->gso_size ?: skb->len;
1835 	gso_segs = shinfo->gso_segs ?: 1;
1836 
1837 	shinfo = skb_shinfo(tail);
1838 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1839 	tail_gso_segs = shinfo->gso_segs ?: 1;
1840 
1841 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1842 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1843 
1844 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1845 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1846 			thtail->window = th->window;
1847 		}
1848 
1849 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1850 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1851 		 * is not entered if we append a packet with a FIN.
1852 		 * SYN, RST, URG are not present.
1853 		 * ACK is set on both packets.
1854 		 * PSH : we do not really care in TCP stack,
1855 		 *       at least for 'GRO' packets.
1856 		 */
1857 		thtail->fin |= th->fin;
1858 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1859 
1860 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1861 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1862 			tail->tstamp = skb->tstamp;
1863 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1864 		}
1865 
1866 		/* Not as strict as GRO. We only need to carry mss max value */
1867 		shinfo->gso_size = max(gso_size, tail_gso_size);
1868 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1869 
1870 		sk->sk_backlog.len += delta;
1871 		__NET_INC_STATS(sock_net(sk),
1872 				LINUX_MIB_TCPBACKLOGCOALESCE);
1873 		kfree_skb_partial(skb, fragstolen);
1874 		return false;
1875 	}
1876 	__skb_push(skb, hdrlen);
1877 
1878 no_coalesce:
1879 	limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1880 
1881 	/* Only socket owner can try to collapse/prune rx queues
1882 	 * to reduce memory overhead, so add a little headroom here.
1883 	 * Few sockets backlog are possibly concurrently non empty.
1884 	 */
1885 	limit += 64 * 1024;
1886 
1887 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1888 		bh_unlock_sock(sk);
1889 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1890 		return true;
1891 	}
1892 	return false;
1893 }
1894 EXPORT_SYMBOL(tcp_add_backlog);
1895 
tcp_filter(struct sock * sk,struct sk_buff * skb)1896 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1897 {
1898 	struct tcphdr *th = (struct tcphdr *)skb->data;
1899 
1900 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1901 }
1902 EXPORT_SYMBOL(tcp_filter);
1903 
tcp_v4_restore_cb(struct sk_buff * skb)1904 static void tcp_v4_restore_cb(struct sk_buff *skb)
1905 {
1906 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1907 		sizeof(struct inet_skb_parm));
1908 }
1909 
tcp_v4_fill_cb(struct sk_buff * skb,const struct iphdr * iph,const struct tcphdr * th)1910 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1911 			   const struct tcphdr *th)
1912 {
1913 	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1914 	 * barrier() makes sure compiler wont play fool^Waliasing games.
1915 	 */
1916 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1917 		sizeof(struct inet_skb_parm));
1918 	barrier();
1919 
1920 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1921 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1922 				    skb->len - th->doff * 4);
1923 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1924 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1925 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1926 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1927 	TCP_SKB_CB(skb)->sacked	 = 0;
1928 	TCP_SKB_CB(skb)->has_rxtstamp =
1929 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1930 }
1931 
1932 /*
1933  *	From tcp_input.c
1934  */
1935 
tcp_v4_rcv(struct sk_buff * skb)1936 int tcp_v4_rcv(struct sk_buff *skb)
1937 {
1938 	struct net *net = dev_net(skb->dev);
1939 	struct sk_buff *skb_to_free;
1940 	int sdif = inet_sdif(skb);
1941 	int dif = inet_iif(skb);
1942 	const struct iphdr *iph;
1943 	const struct tcphdr *th;
1944 	bool refcounted;
1945 	struct sock *sk;
1946 	int ret;
1947 
1948 	if (skb->pkt_type != PACKET_HOST)
1949 		goto discard_it;
1950 
1951 	/* Count it even if it's bad */
1952 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1953 
1954 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1955 		goto discard_it;
1956 
1957 	th = (const struct tcphdr *)skb->data;
1958 
1959 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1960 		goto bad_packet;
1961 	if (!pskb_may_pull(skb, th->doff * 4))
1962 		goto discard_it;
1963 
1964 	/* An explanation is required here, I think.
1965 	 * Packet length and doff are validated by header prediction,
1966 	 * provided case of th->doff==0 is eliminated.
1967 	 * So, we defer the checks. */
1968 
1969 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1970 		goto csum_error;
1971 
1972 	th = (const struct tcphdr *)skb->data;
1973 	iph = ip_hdr(skb);
1974 lookup:
1975 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1976 			       th->dest, sdif, &refcounted);
1977 	if (!sk)
1978 		goto no_tcp_socket;
1979 
1980 process:
1981 	if (sk->sk_state == TCP_TIME_WAIT)
1982 		goto do_time_wait;
1983 
1984 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1985 		struct request_sock *req = inet_reqsk(sk);
1986 		bool req_stolen = false;
1987 		struct sock *nsk;
1988 
1989 		sk = req->rsk_listener;
1990 		if (unlikely(!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb) ||
1991 			     tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1992 			sk_drops_add(sk, skb);
1993 			reqsk_put(req);
1994 			goto discard_it;
1995 		}
1996 		if (tcp_checksum_complete(skb)) {
1997 			reqsk_put(req);
1998 			goto csum_error;
1999 		}
2000 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2001 			inet_csk_reqsk_queue_drop_and_put(sk, req);
2002 			goto lookup;
2003 		}
2004 		/* We own a reference on the listener, increase it again
2005 		 * as we might lose it too soon.
2006 		 */
2007 		sock_hold(sk);
2008 		refcounted = true;
2009 		nsk = NULL;
2010 		if (!tcp_filter(sk, skb)) {
2011 			th = (const struct tcphdr *)skb->data;
2012 			iph = ip_hdr(skb);
2013 			tcp_v4_fill_cb(skb, iph, th);
2014 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2015 		}
2016 		if (!nsk) {
2017 			reqsk_put(req);
2018 			if (req_stolen) {
2019 				/* Another cpu got exclusive access to req
2020 				 * and created a full blown socket.
2021 				 * Try to feed this packet to this socket
2022 				 * instead of discarding it.
2023 				 */
2024 				tcp_v4_restore_cb(skb);
2025 				sock_put(sk);
2026 				goto lookup;
2027 			}
2028 			goto discard_and_relse;
2029 		}
2030 		nf_reset_ct(skb);
2031 		if (nsk == sk) {
2032 			reqsk_put(req);
2033 			tcp_v4_restore_cb(skb);
2034 		} else if (tcp_child_process(sk, nsk, skb)) {
2035 			tcp_v4_send_reset(nsk, skb);
2036 			goto discard_and_relse;
2037 		} else {
2038 			sock_put(sk);
2039 			return 0;
2040 		}
2041 	}
2042 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2043 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2044 		goto discard_and_relse;
2045 	}
2046 
2047 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2048 		goto discard_and_relse;
2049 
2050 	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2051 		goto discard_and_relse;
2052 
2053 	nf_reset_ct(skb);
2054 
2055 	if (tcp_filter(sk, skb))
2056 		goto discard_and_relse;
2057 	th = (const struct tcphdr *)skb->data;
2058 	iph = ip_hdr(skb);
2059 	tcp_v4_fill_cb(skb, iph, th);
2060 
2061 	skb->dev = NULL;
2062 
2063 	if (sk->sk_state == TCP_LISTEN) {
2064 		ret = tcp_v4_do_rcv(sk, skb);
2065 		goto put_and_return;
2066 	}
2067 
2068 	sk_incoming_cpu_update(sk);
2069 
2070 	bh_lock_sock_nested(sk);
2071 	tcp_segs_in(tcp_sk(sk), skb);
2072 	ret = 0;
2073 	if (!sock_owned_by_user(sk)) {
2074 		skb_to_free = sk->sk_rx_skb_cache;
2075 		sk->sk_rx_skb_cache = NULL;
2076 		ret = tcp_v4_do_rcv(sk, skb);
2077 	} else {
2078 		if (tcp_add_backlog(sk, skb))
2079 			goto discard_and_relse;
2080 		skb_to_free = NULL;
2081 	}
2082 	bh_unlock_sock(sk);
2083 	if (skb_to_free)
2084 		__kfree_skb(skb_to_free);
2085 
2086 put_and_return:
2087 	if (refcounted)
2088 		sock_put(sk);
2089 
2090 	return ret;
2091 
2092 no_tcp_socket:
2093 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2094 		goto discard_it;
2095 
2096 	tcp_v4_fill_cb(skb, iph, th);
2097 
2098 	if (tcp_checksum_complete(skb)) {
2099 csum_error:
2100 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2101 bad_packet:
2102 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2103 	} else {
2104 		tcp_v4_send_reset(NULL, skb);
2105 	}
2106 
2107 discard_it:
2108 	/* Discard frame. */
2109 	kfree_skb(skb);
2110 	return 0;
2111 
2112 discard_and_relse:
2113 	sk_drops_add(sk, skb);
2114 	if (refcounted)
2115 		sock_put(sk);
2116 	goto discard_it;
2117 
2118 do_time_wait:
2119 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2120 		inet_twsk_put(inet_twsk(sk));
2121 		goto discard_it;
2122 	}
2123 
2124 	tcp_v4_fill_cb(skb, iph, th);
2125 
2126 	if (tcp_checksum_complete(skb)) {
2127 		inet_twsk_put(inet_twsk(sk));
2128 		goto csum_error;
2129 	}
2130 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2131 	case TCP_TW_SYN: {
2132 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2133 							&tcp_hashinfo, skb,
2134 							__tcp_hdrlen(th),
2135 							iph->saddr, th->source,
2136 							iph->daddr, th->dest,
2137 							inet_iif(skb),
2138 							sdif);
2139 		if (sk2) {
2140 			inet_twsk_deschedule_put(inet_twsk(sk));
2141 			sk = sk2;
2142 			tcp_v4_restore_cb(skb);
2143 			refcounted = false;
2144 			goto process;
2145 		}
2146 	}
2147 		/* to ACK */
2148 		fallthrough;
2149 	case TCP_TW_ACK:
2150 		tcp_v4_timewait_ack(sk, skb);
2151 		break;
2152 	case TCP_TW_RST:
2153 		tcp_v4_send_reset(sk, skb);
2154 		inet_twsk_deschedule_put(inet_twsk(sk));
2155 		goto discard_it;
2156 	case TCP_TW_SUCCESS:;
2157 	}
2158 	goto discard_it;
2159 }
2160 
/* Time-wait socket operations for TCP: the object size plus the
 * uniqueness and destructor callbacks.  Referenced from tcp_prot.twsk_prot.
 */
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};
2166 
inet_sk_rx_dst_set(struct sock * sk,const struct sk_buff * skb)2167 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2168 {
2169 	struct dst_entry *dst = skb_dst(skb);
2170 
2171 	if (dst && dst_hold_safe(dst)) {
2172 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2173 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2174 	}
2175 }
2176 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2177 
2178 const struct inet_connection_sock_af_ops ipv4_specific = {
2179 	.queue_xmit	   = ip_queue_xmit,
2180 	.send_check	   = tcp_v4_send_check,
2181 	.rebuild_header	   = inet_sk_rebuild_header,
2182 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2183 	.conn_request	   = tcp_v4_conn_request,
2184 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2185 	.net_header_len	   = sizeof(struct iphdr),
2186 	.setsockopt	   = ip_setsockopt,
2187 	.getsockopt	   = ip_getsockopt,
2188 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2189 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2190 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2191 };
2192 EXPORT_SYMBOL(ipv4_specific);
2193 
2194 #ifdef CONFIG_TCP_MD5SIG
/* IPv4 flavour of the TCP MD5 signature hooks (key lookup, hash
 * computation over an skb, and key parsing); installed as af_specific by
 * tcp_v4_init_sock().
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
	.md5_parse		= tcp_v4_parse_md5_keys,
};
2200 #endif
2201 
2202 /* NOTE: A lot of things set to zero explicitly by call to
2203  *       sk_alloc() so need not be done here.
2204  */
tcp_v4_init_sock(struct sock * sk)2205 static int tcp_v4_init_sock(struct sock *sk)
2206 {
2207 	struct inet_connection_sock *icsk = inet_csk(sk);
2208 
2209 	tcp_init_sock(sk);
2210 
2211 	icsk->icsk_af_ops = &ipv4_specific;
2212 
2213 #ifdef CONFIG_TCP_MD5SIG
2214 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2215 #endif
2216 
2217 	return 0;
2218 }
2219 
tcp_v4_destroy_sock(struct sock * sk)2220 void tcp_v4_destroy_sock(struct sock *sk)
2221 {
2222 	struct tcp_sock *tp = tcp_sk(sk);
2223 
2224 	trace_tcp_destroy_sock(sk);
2225 
2226 	tcp_clear_xmit_timers(sk);
2227 
2228 	tcp_cleanup_congestion_control(sk);
2229 
2230 	tcp_cleanup_ulp(sk);
2231 
2232 	/* Cleanup up the write buffer. */
2233 	tcp_write_queue_purge(sk);
2234 
2235 	/* Check if we want to disable active TFO */
2236 	tcp_fastopen_active_disable_ofo_check(sk);
2237 
2238 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2239 	skb_rbtree_purge(&tp->out_of_order_queue);
2240 
2241 #ifdef CONFIG_TCP_MD5SIG
2242 	/* Clean up the MD5 key list, if any */
2243 	if (tp->md5sig_info) {
2244 		tcp_clear_md5_list(sk);
2245 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2246 		tp->md5sig_info = NULL;
2247 	}
2248 #endif
2249 
2250 	/* Clean up a referenced TCP bind bucket. */
2251 	if (inet_csk(sk)->icsk_bind_hash)
2252 		inet_put_port(sk);
2253 
2254 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2255 
2256 	/* If socket is aborted during connect operation */
2257 	tcp_free_fastopen_req(tp);
2258 	tcp_fastopen_destroy_cipher(sk);
2259 	tcp_saved_syn_free(tp);
2260 
2261 	sk_sockets_allocated_dec(sk);
2262 }
2263 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2264 
2265 #ifdef CONFIG_PROC_FS
2266 /* Proc filesystem TCP sock list dumping. */
2267 
2268 /*
2269  * Get next listener socket follow cur.  If cur is NULL, get first socket
2270  * starting from bucket given in st->bucket; when st->bucket is zero the
2271  * very first socket in the hash table is returned.
2272  */
listening_get_next(struct seq_file * seq,void * cur)2273 static void *listening_get_next(struct seq_file *seq, void *cur)
2274 {
2275 	struct tcp_seq_afinfo *afinfo;
2276 	struct tcp_iter_state *st = seq->private;
2277 	struct net *net = seq_file_net(seq);
2278 	struct inet_listen_hashbucket *ilb;
2279 	struct hlist_nulls_node *node;
2280 	struct sock *sk = cur;
2281 
2282 	if (st->bpf_seq_afinfo)
2283 		afinfo = st->bpf_seq_afinfo;
2284 	else
2285 		afinfo = PDE_DATA(file_inode(seq->file));
2286 
2287 	if (!sk) {
2288 get_head:
2289 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2290 		spin_lock(&ilb->lock);
2291 		sk = sk_nulls_head(&ilb->nulls_head);
2292 		st->offset = 0;
2293 		goto get_sk;
2294 	}
2295 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2296 	++st->num;
2297 	++st->offset;
2298 
2299 	sk = sk_nulls_next(sk);
2300 get_sk:
2301 	sk_nulls_for_each_from(sk, node) {
2302 		if (!net_eq(sock_net(sk), net))
2303 			continue;
2304 		if (afinfo->family == AF_UNSPEC ||
2305 		    sk->sk_family == afinfo->family)
2306 			return sk;
2307 	}
2308 	spin_unlock(&ilb->lock);
2309 	st->offset = 0;
2310 	if (++st->bucket < INET_LHTABLE_SIZE)
2311 		goto get_head;
2312 	return NULL;
2313 }
2314 
/* Position the iterator on the (*pos)-th listening socket, counting from
 * the first bucket.  *pos is decremented once per socket skipped, so on a
 * NULL return it holds the number of positions still to be consumed.
 */
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *cursor;

	st->bucket = 0;
	st->offset = 0;

	for (cursor = listening_get_next(seq, NULL); cursor && *pos; --*pos)
		cursor = listening_get_next(seq, cursor);

	return cursor;
}
2330 
empty_bucket(const struct tcp_iter_state * st)2331 static inline bool empty_bucket(const struct tcp_iter_state *st)
2332 {
2333 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2334 }
2335 
2336 /*
2337  * Get first established socket starting from bucket given in st->bucket.
2338  * If st->bucket is zero, the very first socket in the hash is returned.
2339  */
established_get_first(struct seq_file * seq)2340 static void *established_get_first(struct seq_file *seq)
2341 {
2342 	struct tcp_seq_afinfo *afinfo;
2343 	struct tcp_iter_state *st = seq->private;
2344 	struct net *net = seq_file_net(seq);
2345 	void *rc = NULL;
2346 
2347 	if (st->bpf_seq_afinfo)
2348 		afinfo = st->bpf_seq_afinfo;
2349 	else
2350 		afinfo = PDE_DATA(file_inode(seq->file));
2351 
2352 	st->offset = 0;
2353 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2354 		struct sock *sk;
2355 		struct hlist_nulls_node *node;
2356 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2357 
2358 		/* Lockless fast path for the common case of empty buckets */
2359 		if (empty_bucket(st))
2360 			continue;
2361 
2362 		spin_lock_bh(lock);
2363 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2364 			if ((afinfo->family != AF_UNSPEC &&
2365 			     sk->sk_family != afinfo->family) ||
2366 			    !net_eq(sock_net(sk), net)) {
2367 				continue;
2368 			}
2369 			rc = sk;
2370 			goto out;
2371 		}
2372 		spin_unlock_bh(lock);
2373 	}
2374 out:
2375 	return rc;
2376 }
2377 
established_get_next(struct seq_file * seq,void * cur)2378 static void *established_get_next(struct seq_file *seq, void *cur)
2379 {
2380 	struct tcp_seq_afinfo *afinfo;
2381 	struct sock *sk = cur;
2382 	struct hlist_nulls_node *node;
2383 	struct tcp_iter_state *st = seq->private;
2384 	struct net *net = seq_file_net(seq);
2385 
2386 	if (st->bpf_seq_afinfo)
2387 		afinfo = st->bpf_seq_afinfo;
2388 	else
2389 		afinfo = PDE_DATA(file_inode(seq->file));
2390 
2391 	++st->num;
2392 	++st->offset;
2393 
2394 	sk = sk_nulls_next(sk);
2395 
2396 	sk_nulls_for_each_from(sk, node) {
2397 		if ((afinfo->family == AF_UNSPEC ||
2398 		     sk->sk_family == afinfo->family) &&
2399 		    net_eq(sock_net(sk), net))
2400 			return sk;
2401 	}
2402 
2403 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2404 	++st->bucket;
2405 	return established_get_first(seq);
2406 }
2407 
/* Return the @pos-th matching socket of the established hash (0-based),
 * or NULL if there are fewer sockets than that.
 */
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *cursor;

	st->bucket = 0;

	for (cursor = established_get_first(seq); cursor && pos; --pos)
		cursor = established_get_next(seq, cursor);

	return cursor;
}
2422 
/* Return the @pos-th socket of the combined walk: listening sockets come
 * first, established ones after.  st->state records which table the
 * returned socket belongs to.
 */
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *found;

	st->state = TCP_SEQ_STATE_LISTENING;
	found = listening_get_idx(seq, &pos);
	if (found)
		return found;

	/* pos now holds what is left after skipping all listeners. */
	st->state = TCP_SEQ_STATE_ESTABLISHED;
	return established_get_idx(seq, pos);
}
2438 
tcp_seek_last_pos(struct seq_file * seq)2439 static void *tcp_seek_last_pos(struct seq_file *seq)
2440 {
2441 	struct tcp_iter_state *st = seq->private;
2442 	int bucket = st->bucket;
2443 	int offset = st->offset;
2444 	int orig_num = st->num;
2445 	void *rc = NULL;
2446 
2447 	switch (st->state) {
2448 	case TCP_SEQ_STATE_LISTENING:
2449 		if (st->bucket >= INET_LHTABLE_SIZE)
2450 			break;
2451 		st->state = TCP_SEQ_STATE_LISTENING;
2452 		rc = listening_get_next(seq, NULL);
2453 		while (offset-- && rc && bucket == st->bucket)
2454 			rc = listening_get_next(seq, rc);
2455 		if (rc)
2456 			break;
2457 		st->bucket = 0;
2458 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2459 		fallthrough;
2460 	case TCP_SEQ_STATE_ESTABLISHED:
2461 		if (st->bucket > tcp_hashinfo.ehash_mask)
2462 			break;
2463 		rc = established_get_first(seq);
2464 		while (offset-- && rc && bucket == st->bucket)
2465 			rc = established_get_next(seq, rc);
2466 	}
2467 
2468 	st->num = orig_num;
2469 
2470 	return rc;
2471 }
2472 
/* seq_file ->start() handler for the TCP socket tables.
 *
 * When *pos equals the position recorded by the previous call, the walk is
 * resumed with tcp_seek_last_pos().  Otherwise (or if resuming fails) the
 * iterator is reset and repositioned from scratch: *pos == 0 yields
 * SEQ_START_TOKEN, any other value yields socket number *pos - 1.
 */
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (*pos && *pos == st->last_pos)
		rc = tcp_seek_last_pos(seq);

	if (!rc) {
		st->state = TCP_SEQ_STATE_LISTENING;
		st->num = 0;
		st->bucket = 0;
		st->offset = 0;

		if (*pos)
			rc = tcp_get_idx(seq, *pos - 1);
		else
			rc = SEQ_START_TOKEN;
	}

	st->last_pos = *pos;
	return rc;
}
2494 EXPORT_SYMBOL(tcp_seq_start);
2495 
/* seq_file ->next() handler: step from @v to the following socket.
 *
 * After the start token the first socket overall is returned.  When the
 * listening table runs out, iteration switches to the established table.
 * *pos is always incremented and mirrored into st->last_pos.
 */
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
	} else if (st->state == TCP_SEQ_STATE_LISTENING) {
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
	} else if (st->state == TCP_SEQ_STATE_ESTABLISHED) {
		rc = established_get_next(seq, v);
	}

	++*pos;
	st->last_pos = *pos;
	return rc;
}
2525 EXPORT_SYMBOL(tcp_seq_next);
2526 
tcp_seq_stop(struct seq_file * seq,void * v)2527 void tcp_seq_stop(struct seq_file *seq, void *v)
2528 {
2529 	struct tcp_iter_state *st = seq->private;
2530 
2531 	switch (st->state) {
2532 	case TCP_SEQ_STATE_LISTENING:
2533 		if (v != SEQ_START_TOKEN)
2534 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2535 		break;
2536 	case TCP_SEQ_STATE_ESTABLISHED:
2537 		if (v)
2538 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2539 		break;
2540 	}
2541 }
2542 EXPORT_SYMBOL(tcp_seq_stop);
2543 
get_openreq4(const struct request_sock * req,struct seq_file * f,int i)2544 static void get_openreq4(const struct request_sock *req,
2545 			 struct seq_file *f, int i)
2546 {
2547 	const struct inet_request_sock *ireq = inet_rsk(req);
2548 	long delta = req->rsk_timer.expires - jiffies;
2549 
2550 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2551 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2552 		i,
2553 		ireq->ir_loc_addr,
2554 		ireq->ir_num,
2555 		ireq->ir_rmt_addr,
2556 		ntohs(ireq->ir_rmt_port),
2557 		TCP_SYN_RECV,
2558 		0, 0, /* could print option size, but that is af dependent. */
2559 		1,    /* timers active (only the expire timer) */
2560 		jiffies_delta_to_clock_t(delta),
2561 		req->num_timeout,
2562 		from_kuid_munged(seq_user_ns(f),
2563 				 sock_i_uid(req->rsk_listener)),
2564 		0,  /* non standard timer */
2565 		0, /* open_requests have no inode */
2566 		0,
2567 		req);
2568 }
2569 
get_tcp4_sock(struct sock * sk,struct seq_file * f,int i)2570 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2571 {
2572 	int timer_active;
2573 	unsigned long timer_expires;
2574 	const struct tcp_sock *tp = tcp_sk(sk);
2575 	const struct inet_connection_sock *icsk = inet_csk(sk);
2576 	const struct inet_sock *inet = inet_sk(sk);
2577 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2578 	__be32 dest = inet->inet_daddr;
2579 	__be32 src = inet->inet_rcv_saddr;
2580 	__u16 destp = ntohs(inet->inet_dport);
2581 	__u16 srcp = ntohs(inet->inet_sport);
2582 	int rx_queue;
2583 	int state;
2584 
2585 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2586 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2587 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2588 		timer_active	= 1;
2589 		timer_expires	= icsk->icsk_timeout;
2590 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2591 		timer_active	= 4;
2592 		timer_expires	= icsk->icsk_timeout;
2593 	} else if (timer_pending(&sk->sk_timer)) {
2594 		timer_active	= 2;
2595 		timer_expires	= sk->sk_timer.expires;
2596 	} else {
2597 		timer_active	= 0;
2598 		timer_expires = jiffies;
2599 	}
2600 
2601 	state = inet_sk_state_load(sk);
2602 	if (state == TCP_LISTEN)
2603 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2604 	else
2605 		/* Because we don't lock the socket,
2606 		 * we might find a transient negative value.
2607 		 */
2608 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2609 				      READ_ONCE(tp->copied_seq), 0);
2610 
2611 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2612 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2613 		i, src, srcp, dest, destp, state,
2614 		READ_ONCE(tp->write_seq) - tp->snd_una,
2615 		rx_queue,
2616 		timer_active,
2617 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2618 		icsk->icsk_retransmits,
2619 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2620 		icsk->icsk_probes_out,
2621 		sock_i_ino(sk),
2622 		refcount_read(&sk->sk_refcnt), sk,
2623 		jiffies_to_clock_t(icsk->icsk_rto),
2624 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2625 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2626 		tp->snd_cwnd,
2627 		state == TCP_LISTEN ?
2628 		    fastopenq->max_qlen :
2629 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2630 }
2631 
get_timewait4_sock(const struct inet_timewait_sock * tw,struct seq_file * f,int i)2632 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2633 			       struct seq_file *f, int i)
2634 {
2635 	long delta = tw->tw_timer.expires - jiffies;
2636 	__be32 dest, src;
2637 	__u16 destp, srcp;
2638 
2639 	dest  = tw->tw_daddr;
2640 	src   = tw->tw_rcv_saddr;
2641 	destp = ntohs(tw->tw_dport);
2642 	srcp  = ntohs(tw->tw_sport);
2643 
2644 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2645 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2646 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2647 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2648 		refcount_read(&tw->tw_refcnt), tw);
2649 }
2650 
2651 #define TMPSZ 150
2652 
tcp4_seq_show(struct seq_file * seq,void * v)2653 static int tcp4_seq_show(struct seq_file *seq, void *v)
2654 {
2655 	struct tcp_iter_state *st;
2656 	struct sock *sk = v;
2657 
2658 	seq_setwidth(seq, TMPSZ - 1);
2659 	if (v == SEQ_START_TOKEN) {
2660 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2661 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2662 			   "inode");
2663 		goto out;
2664 	}
2665 	st = seq->private;
2666 
2667 	if (sk->sk_state == TCP_TIME_WAIT)
2668 		get_timewait4_sock(v, seq, st->num);
2669 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2670 		get_openreq4(v, seq, st->num);
2671 	else
2672 		get_tcp4_sock(v, seq, st->num);
2673 out:
2674 	seq_pad(seq, '\n');
2675 	return 0;
2676 }
2677 
2678 #ifdef CONFIG_BPF_SYSCALL
/* Context handed to a BPF iterator program by tcp_prog_seq_show(). */
struct bpf_iter__tcp {
	/* iterator metadata for the current invocation */
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	/* socket being visited; NULL on the final call made from ->stop() */
	__bpf_md_ptr(struct sock_common *, sk_common);
	/* owning uid of the socket, as computed by the caller */
	uid_t uid __aligned(8);
};
2684 
/* Build the bpf_iter__tcp context for @sk_common / @uid and run @prog on
 * it.  Returns the result of bpf_iter_run_prog().
 */
static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			     struct sock_common *sk_common, uid_t uid)
{
	struct bpf_iter__tcp iter_ctx;

	/* skip SEQ_START_TOKEN */
	meta->seq_num--;

	iter_ctx.meta = meta;
	iter_ctx.sk_common = sk_common;
	iter_ctx.uid = uid;

	return bpf_iter_run_prog(prog, &iter_ctx);
}
2696 
bpf_iter_tcp_seq_show(struct seq_file * seq,void * v)2697 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2698 {
2699 	struct bpf_iter_meta meta;
2700 	struct bpf_prog *prog;
2701 	struct sock *sk = v;
2702 	uid_t uid;
2703 
2704 	if (v == SEQ_START_TOKEN)
2705 		return 0;
2706 
2707 	if (sk->sk_state == TCP_TIME_WAIT) {
2708 		uid = 0;
2709 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2710 		const struct request_sock *req = v;
2711 
2712 		uid = from_kuid_munged(seq_user_ns(seq),
2713 				       sock_i_uid(req->rsk_listener));
2714 	} else {
2715 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2716 	}
2717 
2718 	meta.seq = seq;
2719 	prog = bpf_iter_get_info(&meta, false);
2720 	return tcp_prog_seq_show(prog, &meta, v, uid);
2721 }
2722 
bpf_iter_tcp_seq_stop(struct seq_file * seq,void * v)2723 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2724 {
2725 	struct bpf_iter_meta meta;
2726 	struct bpf_prog *prog;
2727 
2728 	if (!v) {
2729 		meta.seq = seq;
2730 		prog = bpf_iter_get_info(&meta, true);
2731 		if (prog)
2732 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
2733 	}
2734 
2735 	tcp_seq_stop(seq, v);
2736 }
2737 
2738 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2739 	.show		= bpf_iter_tcp_seq_show,
2740 	.start		= tcp_seq_start,
2741 	.next		= tcp_seq_next,
2742 	.stop		= bpf_iter_tcp_seq_stop,
2743 };
2744 #endif
2745 
2746 static const struct seq_operations tcp4_seq_ops = {
2747 	.show		= tcp4_seq_show,
2748 	.start		= tcp_seq_start,
2749 	.next		= tcp_seq_next,
2750 	.stop		= tcp_seq_stop,
2751 };
2752 
/* Address-family filter attached to the /proc/net/tcp entry: the iterator
 * helpers only return sockets whose sk_family equals this family.
 */
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
2756 
tcp4_proc_init_net(struct net * net)2757 static int __net_init tcp4_proc_init_net(struct net *net)
2758 {
2759 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2760 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2761 		return -ENOMEM;
2762 	return 0;
2763 }
2764 
/* Remove the "tcp" entry created by tcp4_proc_init_net() from @net's
 * proc_net directory.
 */
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}
2769 
/* Per-netns hooks that create and remove the /proc/net/tcp entry. */
static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};
2774 
/* Register the per-netns /proc/net/tcp hooks.  Returns the result of
 * register_pernet_subsys().
 */
int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}
2779 
/* Unregister the per-netns /proc/net/tcp hooks registered by
 * tcp4_proc_init().
 */
void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
2784 #endif /* CONFIG_PROC_FS */
2785 
/* Protocol descriptor for IPv4 TCP sockets.
 *
 * Wires the generic struct proto hooks to the TCP implementations and
 * points at the shared TCP accounting counters, the per-netns rmem/wmem
 * sysctl offsets, the timewait/request sock operations and the global
 * tcp_hashinfo tables.
 */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	/* same handler tcp_v4_rcv() calls directly when the socket is not
	 * owned by user context
	 */
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
2828 EXPORT_SYMBOL(tcp_prot);
2829 
tcp_sk_exit(struct net * net)2830 static void __net_exit tcp_sk_exit(struct net *net)
2831 {
2832 	int cpu;
2833 
2834 	if (net->ipv4.tcp_congestion_control)
2835 		bpf_module_put(net->ipv4.tcp_congestion_control,
2836 			       net->ipv4.tcp_congestion_control->owner);
2837 
2838 	for_each_possible_cpu(cpu)
2839 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2840 	free_percpu(net->ipv4.tcp_sk);
2841 }
2842 
/* Per-netns TCP setup.
 *
 * Creates one socket per possible CPU with inet_ctl_sock_create() (stored
 * in net->ipv4.tcp_sk), fills in the default values of the per-netns TCP
 * sysctls and selects the congestion control for the namespace.
 *
 * Returns 0 on success, -ENOMEM if the per-cpu array cannot be allocated,
 * or the error from inet_ctl_sock_create(); in the latter case
 * tcp_sk_exit() is called to undo the partial setup.
 */
static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	/* ECN defaults */
	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	/* MSS and MTU probing defaults */
	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	/* Keepalive defaults */
	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	/* Retry, timeout and reuse defaults */
	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	/* Limits derived from the number of established-hash buckets */
	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	/* A non-initial netns starts from init_net's current rmem/wmem
	 * values; init_net's own arrays are not written here.
	 */
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	/* A non-initial netns inherits init_net's congestion control when a
	 * module reference can be taken; otherwise, and for init_net itself,
	 * tcp_reno is used.
	 */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	/* Undo the sockets created so far and free the per-cpu array. */
	tcp_sk_exit(net);

	return res;
}
2956 
tcp_sk_exit_batch(struct list_head * net_exit_list)2957 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2958 {
2959 	struct net *net;
2960 
2961 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2962 
2963 	list_for_each_entry(net, net_exit_list, exit_list)
2964 		tcp_fastopen_ctx_destroy(net);
2965 }
2966 
2967 static struct pernet_operations __net_initdata tcp_sk_ops = {
2968        .init	   = tcp_sk_init,
2969        .exit	   = tcp_sk_exit,
2970        .exit_batch = tcp_sk_exit_batch,
2971 };
2972 
2973 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/*
 * Instantiate DEFINE_BPF_ITER_FUNC for the "tcp" target with the
 * arguments (meta, sk_common, uid).
 * NOTE(review): presumably this argument list is meant to line up with
 * struct bpf_iter__tcp, whose sk_common member is referenced by
 * tcp_reg_info -- confirm against the macro and struct definitions.
 */
DEFINE_BPF_ITER_FUNC(tcp,
		     struct bpf_iter_meta *meta,
		     struct sock_common *sk_common,
		     uid_t uid)
2976 
2977 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
2978 {
2979 	struct tcp_iter_state *st = priv_data;
2980 	struct tcp_seq_afinfo *afinfo;
2981 	int ret;
2982 
2983 	afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
2984 	if (!afinfo)
2985 		return -ENOMEM;
2986 
2987 	afinfo->family = AF_UNSPEC;
2988 	st->bpf_seq_afinfo = afinfo;
2989 	ret = bpf_iter_init_seq_net(priv_data, aux);
2990 	if (ret)
2991 		kfree(afinfo);
2992 	return ret;
2993 }
2994 
bpf_iter_fini_tcp(void * priv_data)2995 static void bpf_iter_fini_tcp(void *priv_data)
2996 {
2997 	struct tcp_iter_state *st = priv_data;
2998 
2999 	kfree(st->bpf_seq_afinfo);
3000 	bpf_iter_fini_seq_net(priv_data);
3001 }
3002 
3003 static const struct bpf_iter_seq_info tcp_seq_info = {
3004 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3005 	.init_seq_private	= bpf_iter_init_tcp,
3006 	.fini_seq_private	= bpf_iter_fini_tcp,
3007 	.seq_priv_size		= sizeof(struct tcp_iter_state),
3008 };
3009 
3010 static struct bpf_iter_reg tcp_reg_info = {
3011 	.target			= "tcp",
3012 	.ctx_arg_info_size	= 1,
3013 	.ctx_arg_info		= {
3014 		{ offsetof(struct bpf_iter__tcp, sk_common),
3015 		  PTR_TO_BTF_ID_OR_NULL },
3016 	},
3017 	.seq_info		= &tcp_seq_info,
3018 };
3019 
bpf_iter_register(void)3020 static void __init bpf_iter_register(void)
3021 {
3022 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3023 	if (bpf_iter_reg_target(&tcp_reg_info))
3024 		pr_warn("Warning: could not register bpf iterator tcp\n");
3025 }
3026 
3027 #endif
3028 
tcp_v4_init(void)3029 void __init tcp_v4_init(void)
3030 {
3031 	if (register_pernet_subsys(&tcp_sk_ops))
3032 		panic("Failed to create the TCP control socket.\n");
3033 
3034 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3035 	bpf_iter_register();
3036 #endif
3037 }
3038