1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/netdma.h>
76 #include <net/secure_seq.h>
77 #include <net/tcp_memcontrol.h>
78 
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84 
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
87 
88 int sysctl_tcp_tw_reuse __read_mostly;
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
91 
92 
93 #ifdef CONFIG_TCP_MD5SIG
94 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
95 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
96 #endif
97 
98 struct inet_hashinfo tcp_hashinfo;
99 EXPORT_SYMBOL(tcp_hashinfo);
100 
101 static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
102 {
103 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
104 					  ip_hdr(skb)->saddr,
105 					  tcp_hdr(skb)->dest,
106 					  tcp_hdr(skb)->source);
107 }
108 
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112 	struct tcp_sock *tp = tcp_sk(sk);
113 
114 	/* With PAWS, it is safe from the viewpoint
115 	   of data integrity. Even without PAWS it is safe provided sequence
116 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
117 
118 	   Actually, the idea is close to VJ's: only the timestamp cache is
119 	   held not per host but per port pair, and the TW bucket is used as
120 	   the state holder.
121 
122 	   If TW bucket has been already destroyed we fall back to VJ's scheme
123 	   and use initial timestamp retrieved from peer table.
124 	 */
125 	if (tcptw->tw_ts_recent_stamp &&
126 	    (twp == NULL || (sysctl_tcp_tw_reuse &&
127 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
128 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
129 		if (tp->write_seq == 0)
130 			tp->write_seq = 1;
131 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
132 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
133 		sock_hold(sktw);
134 		return 1;
135 	}
136 
137 	return 0;
138 }
139 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
140 
141 /* This will initiate an outgoing connection. */
142 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
143 {
144 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
145 	struct inet_sock *inet = inet_sk(sk);
146 	struct tcp_sock *tp = tcp_sk(sk);
147 	__be16 orig_sport, orig_dport;
148 	__be32 daddr, nexthop;
149 	struct flowi4 *fl4;
150 	struct rtable *rt;
151 	int err;
152 	struct ip_options_rcu *inet_opt;
153 
154 	if (addr_len < sizeof(struct sockaddr_in))
155 		return -EINVAL;
156 
157 	if (usin->sin_family != AF_INET)
158 		return -EAFNOSUPPORT;
159 
160 	nexthop = daddr = usin->sin_addr.s_addr;
161 	inet_opt = rcu_dereference_protected(inet->inet_opt,
162 					     sock_owned_by_user(sk));
163 	if (inet_opt && inet_opt->opt.srr) {
164 		if (!daddr)
165 			return -EINVAL;
166 		nexthop = inet_opt->opt.faddr;
167 	}
168 
169 	orig_sport = inet->inet_sport;
170 	orig_dport = usin->sin_port;
171 	fl4 = &inet->cork.fl.u.ip4;
172 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
173 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
174 			      IPPROTO_TCP,
175 			      orig_sport, orig_dport, sk, true);
176 	if (IS_ERR(rt)) {
177 		err = PTR_ERR(rt);
178 		if (err == -ENETUNREACH)
179 			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
180 		return err;
181 	}
182 
183 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
184 		ip_rt_put(rt);
185 		return -ENETUNREACH;
186 	}
187 
188 	if (!inet_opt || !inet_opt->opt.srr)
189 		daddr = fl4->daddr;
190 
191 	if (!inet->inet_saddr)
192 		inet->inet_saddr = fl4->saddr;
193 	inet->inet_rcv_saddr = inet->inet_saddr;
194 
195 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
196 		/* Reset inherited state */
197 		tp->rx_opt.ts_recent	   = 0;
198 		tp->rx_opt.ts_recent_stamp = 0;
199 		if (likely(!tp->repair))
200 			tp->write_seq	   = 0;
201 	}
202 
203 	if (tcp_death_row.sysctl_tw_recycle &&
204 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
205 		tcp_fetch_timewait_stamp(sk, &rt->dst);
206 
207 	inet->inet_dport = usin->sin_port;
208 	inet->inet_daddr = daddr;
209 
210 	inet_csk(sk)->icsk_ext_hdr_len = 0;
211 	if (inet_opt)
212 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
213 
214 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
215 
216 	/* Socket identity is still unknown (sport may be zero).
217  * However we set the state to SYN-SENT and, without releasing the socket
218  * lock, select a source port, enter ourselves into the hash tables and
219 	 * complete initialization after this.
220 	 */
221 	tcp_set_state(sk, TCP_SYN_SENT);
222 	err = inet_hash_connect(&tcp_death_row, sk);
223 	if (err)
224 		goto failure;
225 
226 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227 			       inet->inet_sport, inet->inet_dport, sk);
228 	if (IS_ERR(rt)) {
229 		err = PTR_ERR(rt);
230 		rt = NULL;
231 		goto failure;
232 	}
233 	/* OK, now commit destination to socket.  */
234 	sk->sk_gso_type = SKB_GSO_TCPV4;
235 	sk_setup_caps(sk, &rt->dst);
236 
237 	if (!tp->write_seq && likely(!tp->repair))
238 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239 							   inet->inet_daddr,
240 							   inet->inet_sport,
241 							   usin->sin_port);
242 
243 	inet->inet_id = tp->write_seq ^ jiffies;
244 
245 	err = tcp_connect(sk);
246 
247 	rt = NULL;
248 	if (err)
249 		goto failure;
250 
251 	return 0;
252 
253 failure:
254 	/*
255 	 * This unhashes the socket and releases the local port,
256 	 * if necessary.
257 	 */
258 	tcp_set_state(sk, TCP_CLOSE);
259 	ip_rt_put(rt);
260 	sk->sk_route_caps = 0;
261 	inet->inet_dport = 0;
262 	return err;
263 }
264 EXPORT_SYMBOL(tcp_v4_connect);
265 
266 /*
267  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
268  * It can be called through tcp_release_cb() if socket was owned by user
269  * at the time tcp_v4_err() was called to handle ICMP message.
270  */
271 static void tcp_v4_mtu_reduced(struct sock *sk)
272 {
273 	struct dst_entry *dst;
274 	struct inet_sock *inet = inet_sk(sk);
275 	u32 mtu = tcp_sk(sk)->mtu_info;
276 
277 	dst = inet_csk_update_pmtu(sk, mtu);
278 	if (!dst)
279 		return;
280 
281 	/* Something is about to go wrong... Remember the soft error
282 	 * in case this connection is not able to recover.
283 	 */
284 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
285 		sk->sk_err_soft = EMSGSIZE;
286 
287 	mtu = dst_mtu(dst);
288 
289 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
290 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
291 		tcp_sync_mss(sk, mtu);
292 
293 		/* Resend the TCP packet because it's
294 		 * clear that the old packet has been
295 		 * dropped. This is the new "fast" path mtu
296 		 * discovery.
297 		 */
298 		tcp_simple_retransmit(sk);
299 	} /* else let the usual retransmit timer handle it */
300 }
301 
302 static void do_redirect(struct sk_buff *skb, struct sock *sk)
303 {
304 	struct dst_entry *dst = __sk_dst_check(sk, 0);
305 
306 	if (dst)
307 		dst->ops->redirect(dst, sk, skb);
308 }
309 
310 /*
311  * This routine is called by the ICMP module when it gets some
312  * sort of error condition.  If err < 0 then the socket should
313  * be closed and the error returned to the user.  If err > 0
314  * it's just the icmp type << 8 | icmp code.  After adjustment
315  * header points to the first 8 bytes of the tcp header.  We need
316  * to find the appropriate port.
317  *
318  * The locking strategy used here is very "optimistic". When
319  * someone else accesses the socket the ICMP is just dropped
320  * and for some paths there is no check at all.
321  * A more general error queue to queue errors for later handling
322  * is probably better.
323  *
324  */
325 
326 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
327 {
328 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
329 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
330 	struct inet_connection_sock *icsk;
331 	struct tcp_sock *tp;
332 	struct inet_sock *inet;
333 	const int type = icmp_hdr(icmp_skb)->type;
334 	const int code = icmp_hdr(icmp_skb)->code;
335 	struct sock *sk;
336 	struct sk_buff *skb;
337 	struct request_sock *req;
338 	__u32 seq;
339 	__u32 remaining;
340 	int err;
341 	struct net *net = dev_net(icmp_skb->dev);
342 
343 	if (icmp_skb->len < (iph->ihl << 2) + 8) {
344 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
345 		return;
346 	}
347 
348 	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
349 			iph->saddr, th->source, inet_iif(icmp_skb));
350 	if (!sk) {
351 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
352 		return;
353 	}
354 	if (sk->sk_state == TCP_TIME_WAIT) {
355 		inet_twsk_put(inet_twsk(sk));
356 		return;
357 	}
358 
359 	bh_lock_sock(sk);
360 	/* If too many ICMPs get dropped on busy
361 	 * servers this needs to be solved differently.
362 	 * We do take care of the PMTU discovery (RFC1191) special case:
363 	 * we can receive locally generated ICMP messages while the socket is held.
364 	 */
365 	if (sock_owned_by_user(sk)) {
366 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
367 			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
368 	}
369 	if (sk->sk_state == TCP_CLOSE)
370 		goto out;
371 
372 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
373 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
374 		goto out;
375 	}
376 
377 	icsk = inet_csk(sk);
378 	tp = tcp_sk(sk);
379 	req = tp->fastopen_rsk;
380 	seq = ntohl(th->seq);
381 	if (sk->sk_state != TCP_LISTEN &&
382 	    !between(seq, tp->snd_una, tp->snd_nxt) &&
383 	    (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
384 		/* For a Fast Open socket, allow seq to be snt_isn. */
385 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
386 		goto out;
387 	}
388 
389 	switch (type) {
390 	case ICMP_REDIRECT:
391 		do_redirect(icmp_skb, sk);
392 		goto out;
393 	case ICMP_SOURCE_QUENCH:
394 		/* Just silently ignore these. */
395 		goto out;
396 	case ICMP_PARAMETERPROB:
397 		err = EPROTO;
398 		break;
399 	case ICMP_DEST_UNREACH:
400 		if (code > NR_ICMP_UNREACH)
401 			goto out;
402 
403 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
404 			/* We are not interested in TCP_LISTEN and open_requests
405 			 * (SYN-ACKs sent out by Linux are always <576 bytes so
406 			 * they should go through unfragmented).
407 			 */
408 			if (sk->sk_state == TCP_LISTEN)
409 				goto out;
410 
411 			tp->mtu_info = info;
412 			if (!sock_owned_by_user(sk)) {
413 				tcp_v4_mtu_reduced(sk);
414 			} else {
415 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
416 					sock_hold(sk);
417 			}
418 			goto out;
419 		}
420 
421 		err = icmp_err_convert[code].errno;
422 		/* check if icmp_skb allows revert of backoff
423 		 * (see draft-zimmermann-tcp-lcd) */
424 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
425 			break;
426 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
427 		    !icsk->icsk_backoff)
428 			break;
429 
430 		/* XXX (TFO) - revisit the following logic for TFO */
431 
432 		if (sock_owned_by_user(sk))
433 			break;
434 
435 		icsk->icsk_backoff--;
436 		inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
437 			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
438 		tcp_bound_rto(sk);
439 
440 		skb = tcp_write_queue_head(sk);
441 		BUG_ON(!skb);
442 
443 		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
444 				tcp_time_stamp - TCP_SKB_CB(skb)->when);
445 
446 		if (remaining) {
447 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
448 						  remaining, TCP_RTO_MAX);
449 		} else {
450 			/* RTO revert clocked out retransmission.
451 			 * Will retransmit now */
452 			tcp_retransmit_timer(sk);
453 		}
454 
455 		break;
456 	case ICMP_TIME_EXCEEDED:
457 		err = EHOSTUNREACH;
458 		break;
459 	default:
460 		goto out;
461 	}
462 
463 	/* XXX (TFO) - if it's a TFO socket and has been accepted, rather
464 	 * than following the TCP_SYN_RECV case and closing the socket,
465 	 * we ignore the ICMP error and keep trying like a fully established
466 	 * socket. Is this the right thing to do?
467 	 */
468 	if (req && req->sk == NULL)
469 		goto out;
470 
471 	switch (sk->sk_state) {
472 		struct request_sock *req, **prev;
473 	case TCP_LISTEN:
474 		if (sock_owned_by_user(sk))
475 			goto out;
476 
477 		req = inet_csk_search_req(sk, &prev, th->dest,
478 					  iph->daddr, iph->saddr);
479 		if (!req)
480 			goto out;
481 
482 		/* ICMPs are not backlogged, hence we cannot get
483 		   an established socket here.
484 		 */
485 		WARN_ON(req->sk);
486 
487 		if (seq != tcp_rsk(req)->snt_isn) {
488 			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
489 			goto out;
490 		}
491 
492 		/*
493 		 * Still in SYN_RECV, just remove it silently.
494 		 * There is no good way to pass the error to the newly
495 		 * created socket, and POSIX does not want network
496 		 * errors returned from accept().
497 		 */
498 		inet_csk_reqsk_queue_drop(sk, req, prev);
499 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
500 		goto out;
501 
502 	case TCP_SYN_SENT:
503 	case TCP_SYN_RECV:  /* Cannot normally happen.
504 			       It can, e.g., if SYNs crossed,
505 			       or with Fast Open.
506 			     */
507 		if (!sock_owned_by_user(sk)) {
508 			sk->sk_err = err;
509 
510 			sk->sk_error_report(sk);
511 
512 			tcp_done(sk);
513 		} else {
514 			sk->sk_err_soft = err;
515 		}
516 		goto out;
517 	}
518 
519 	/* If we've already connected we will keep trying
520 	 * until we time out, or the user gives up.
521 	 *
522 	 * rfc1122 4.2.3.9 allows us to consider as hard errors
523 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
524 	 * but it is obsoleted by pmtu discovery).
525 	 *
526 	 * Note that in the modern internet, where routing is unreliable
527 	 * and broken firewalls sit in every dark corner sending random
528 	 * errors ordered by their masters, even these two messages finally lose
529 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
530 	 *
531 	 * Now we are in compliance with RFCs.
532 	 *							--ANK (980905)
533 	 */
534 
535 	inet = inet_sk(sk);
536 	if (!sock_owned_by_user(sk) && inet->recverr) {
537 		sk->sk_err = err;
538 		sk->sk_error_report(sk);
539 	} else	{ /* Only an error on timeout */
540 		sk->sk_err_soft = err;
541 	}
542 
543 out:
544 	bh_unlock_sock(sk);
545 	sock_put(sk);
546 }
547 
548 static void __tcp_v4_send_check(struct sk_buff *skb,
549 				__be32 saddr, __be32 daddr)
550 {
551 	struct tcphdr *th = tcp_hdr(skb);
552 
553 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
554 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
555 		skb->csum_start = skb_transport_header(skb) - skb->head;
556 		skb->csum_offset = offsetof(struct tcphdr, check);
557 	} else {
558 		th->check = tcp_v4_check(skb->len, saddr, daddr,
559 					 csum_partial(th,
560 						      th->doff << 2,
561 						      skb->csum));
562 	}
563 }
564 
565 /* This routine computes an IPv4 TCP checksum. */
566 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
567 {
568 	const struct inet_sock *inet = inet_sk(sk);
569 
570 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
571 }
572 EXPORT_SYMBOL(tcp_v4_send_check);
573 
574 int tcp_v4_gso_send_check(struct sk_buff *skb)
575 {
576 	const struct iphdr *iph;
577 	struct tcphdr *th;
578 
579 	if (!pskb_may_pull(skb, sizeof(*th)))
580 		return -EINVAL;
581 
582 	iph = ip_hdr(skb);
583 	th = tcp_hdr(skb);
584 
585 	th->check = 0;
586 	skb->ip_summed = CHECKSUM_PARTIAL;
587 	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
588 	return 0;
589 }
590 
591 /*
592  *	This routine will send an RST to the other tcp.
593  *
594  *	Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
595  *		      for the reset?
596  *	Answer: if a packet caused the RST, it is not for a socket
597  *		existing in our system; if it is matched to a socket,
598  *		it is just a duplicate segment or a bug in the other side's TCP.
599  *		So we build the reply based only on parameters that
600  *		arrived with the segment.
601  *	Exception: precedence violation. We do not implement it in any case.
602  */
603 
604 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
605 {
606 	const struct tcphdr *th = tcp_hdr(skb);
607 	struct {
608 		struct tcphdr th;
609 #ifdef CONFIG_TCP_MD5SIG
610 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
611 #endif
612 	} rep;
613 	struct ip_reply_arg arg;
614 #ifdef CONFIG_TCP_MD5SIG
615 	struct tcp_md5sig_key *key;
616 	const __u8 *hash_location = NULL;
617 	unsigned char newhash[16];
618 	int genhash;
619 	struct sock *sk1 = NULL;
620 #endif
621 	struct net *net;
622 
623 	/* Never send a reset in response to a reset. */
624 	if (th->rst)
625 		return;
626 
627 	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
628 		return;
629 
630 	/* Swap the send and the receive. */
631 	memset(&rep, 0, sizeof(rep));
632 	rep.th.dest   = th->source;
633 	rep.th.source = th->dest;
634 	rep.th.doff   = sizeof(struct tcphdr) / 4;
635 	rep.th.rst    = 1;
636 
637 	if (th->ack) {
638 		rep.th.seq = th->ack_seq;
639 	} else {
640 		rep.th.ack = 1;
641 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
642 				       skb->len - (th->doff << 2));
643 	}
644 
645 	memset(&arg, 0, sizeof(arg));
646 	arg.iov[0].iov_base = (unsigned char *)&rep;
647 	arg.iov[0].iov_len  = sizeof(rep.th);
648 
649 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
650 #ifdef CONFIG_TCP_MD5SIG
651 	hash_location = tcp_parse_md5sig_option(th);
652 	if (!sk && hash_location) {
653 		/*
654 		 * The active side is lost. Try to find the listening socket through
655 		 * the source port, and then find the md5 key through the listening socket.
656 		 * We do not lose security here:
657 		 * the incoming packet is checked against the md5 hash of the found key;
658 		 * no RST is generated if the md5 hash doesn't match.
659 		 */
660 		sk1 = __inet_lookup_listener(net,
661 					     &tcp_hashinfo, ip_hdr(skb)->saddr,
662 					     th->source, ip_hdr(skb)->daddr,
663 					     ntohs(th->source), inet_iif(skb));
664 		/* don't send rst if it can't find key */
665 		if (!sk1)
666 			return;
667 		rcu_read_lock();
668 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
669 					&ip_hdr(skb)->saddr, AF_INET);
670 		if (!key)
671 			goto release_sk1;
672 
673 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
674 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
675 			goto release_sk1;
676 	} else {
677 		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
678 					     &ip_hdr(skb)->saddr,
679 					     AF_INET) : NULL;
680 	}
681 
682 	if (key) {
683 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
684 				   (TCPOPT_NOP << 16) |
685 				   (TCPOPT_MD5SIG << 8) |
686 				   TCPOLEN_MD5SIG);
687 		/* Update length and the length the header thinks exists */
688 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
689 		rep.th.doff = arg.iov[0].iov_len / 4;
690 
691 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
692 				     key, ip_hdr(skb)->saddr,
693 				     ip_hdr(skb)->daddr, &rep.th);
694 	}
695 #endif
696 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
697 				      ip_hdr(skb)->saddr, /* XXX */
698 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
699 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
700 	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
701 	/* When socket is gone, all binding information is lost.
702 	 * Routing might fail in this case. No choice here: if we choose to force
703 	 * the input interface, we will misroute in case of an asymmetric route.
704 	 */
705 	if (sk)
706 		arg.bound_dev_if = sk->sk_bound_dev_if;
707 
708 	arg.tos = ip_hdr(skb)->tos;
709 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
710 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
711 			      skb, ip_hdr(skb)->saddr,
712 			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
713 
714 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
715 	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
716 
717 #ifdef CONFIG_TCP_MD5SIG
718 release_sk1:
719 	if (sk1) {
720 		rcu_read_unlock();
721 		sock_put(sk1);
722 	}
723 #endif
724 }
725 
726 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
727    outside socket context, is certainly ugly. What can I do?
728  */
729 
730 static void tcp_v4_send_ack(const struct sock *sk, struct sk_buff *skb,
731 			    u32 seq, u32 ack,
732 			    u32 win, u32 tsval, u32 tsecr, int oif,
733 			    struct tcp_md5sig_key *key,
734 			    int reply_flags, u8 tos)
735 {
736 	const struct tcphdr *th = tcp_hdr(skb);
737 	struct {
738 		struct tcphdr th;
739 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
740 #ifdef CONFIG_TCP_MD5SIG
741 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
742 #endif
743 			];
744 	} rep;
745 	struct ip_reply_arg arg;
746 	struct net *net = sock_net(sk);
747 
748 	memset(&rep.th, 0, sizeof(struct tcphdr));
749 	memset(&arg, 0, sizeof(arg));
750 
751 	arg.iov[0].iov_base = (unsigned char *)&rep;
752 	arg.iov[0].iov_len  = sizeof(rep.th);
753 	if (tsecr) {
754 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
755 				   (TCPOPT_TIMESTAMP << 8) |
756 				   TCPOLEN_TIMESTAMP);
757 		rep.opt[1] = htonl(tsval);
758 		rep.opt[2] = htonl(tsecr);
759 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
760 	}
761 
762 	/* Swap the send and the receive. */
763 	rep.th.dest    = th->source;
764 	rep.th.source  = th->dest;
765 	rep.th.doff    = arg.iov[0].iov_len / 4;
766 	rep.th.seq     = htonl(seq);
767 	rep.th.ack_seq = htonl(ack);
768 	rep.th.ack     = 1;
769 	rep.th.window  = htons(win);
770 
771 #ifdef CONFIG_TCP_MD5SIG
772 	if (key) {
773 		int offset = (tsecr) ? 3 : 0;
774 
775 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
776 					  (TCPOPT_NOP << 16) |
777 					  (TCPOPT_MD5SIG << 8) |
778 					  TCPOLEN_MD5SIG);
779 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
780 		rep.th.doff = arg.iov[0].iov_len/4;
781 
782 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
783 				    key, ip_hdr(skb)->saddr,
784 				    ip_hdr(skb)->daddr, &rep.th);
785 	}
786 #endif
787 	arg.flags = reply_flags;
788 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
789 				      ip_hdr(skb)->saddr, /* XXX */
790 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
791 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
792 	if (oif)
793 		arg.bound_dev_if = oif;
794 	arg.tos = tos;
795 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
796 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
797 			      skb, ip_hdr(skb)->saddr,
798 			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
799 
800 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
801 }
802 
803 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
804 {
805 	struct inet_timewait_sock *tw = inet_twsk(sk);
806 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
807 
808 	tcp_v4_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
809 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
810 			tcp_time_stamp + tcptw->tw_ts_offset,
811 			tcptw->tw_ts_recent,
812 			tw->tw_bound_dev_if,
813 			tcp_twsk_md5_key(tcptw),
814 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
815 			tw->tw_tos
816 			);
817 
818 	inet_twsk_put(tw);
819 }
820 
821 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
822 				  struct request_sock *req)
823 {
824 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
825 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
826 	 */
827 	tcp_v4_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ?
828 			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
829 			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
830 			tcp_time_stamp,
831 			req->ts_recent,
832 			0,
833 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
834 					  AF_INET),
835 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
836 			ip_hdr(skb)->tos);
837 }
838 
839 /*
840  *	Send a SYN-ACK after having received a SYN.
841  *	This still operates on a request_sock only, not on a big
842  *	socket.
843  */
844 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
845 			      struct request_sock *req,
846 			      u16 queue_mapping,
847 			      bool nocache)
848 {
849 	const struct inet_request_sock *ireq = inet_rsk(req);
850 	struct flowi4 fl4;
851 	int err = -1;
852 	struct sk_buff *skb;
853 
854 	/* First, grab a route. */
855 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
856 		return -1;
857 
858 	skb = tcp_make_synack(sk, dst, req, NULL);
859 
860 	if (skb) {
861 		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
862 
863 		skb_set_queue_mapping(skb, queue_mapping);
864 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
865 					    ireq->rmt_addr,
866 					    ireq->opt);
867 		err = net_xmit_eval(err);
868 		if (!tcp_rsk(req)->snt_synack && !err)
869 			tcp_rsk(req)->snt_synack = tcp_time_stamp;
870 	}
871 
872 	return err;
873 }
874 
875 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
876 {
877 	int res = tcp_v4_send_synack(sk, NULL, req, 0, false);
878 
879 	if (!res)
880 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
881 	return res;
882 }
883 
884 /*
885  *	IPv4 request_sock destructor.
886  */
887 static void tcp_v4_reqsk_destructor(struct request_sock *req)
888 {
889 	kfree(inet_rsk(req)->opt);
890 }
891 
892 /*
893  * Return true if a syncookie should be sent
894  */
895 bool tcp_syn_flood_action(struct sock *sk,
896 			 const struct sk_buff *skb,
897 			 const char *proto)
898 {
899 	const char *msg = "Dropping request";
900 	bool want_cookie = false;
901 	struct listen_sock *lopt;
902 
903 
904 
905 #ifdef CONFIG_SYN_COOKIES
906 	if (sysctl_tcp_syncookies) {
907 		msg = "Sending cookies";
908 		want_cookie = true;
909 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
910 	} else
911 #endif
912 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
913 
914 	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
915 	if (!lopt->synflood_warned) {
916 		lopt->synflood_warned = 1;
917 		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
918 			proto, ntohs(tcp_hdr(skb)->dest), msg);
919 	}
920 	return want_cookie;
921 }
922 EXPORT_SYMBOL(tcp_syn_flood_action);
923 
924 /*
925  * Save and compile IPv4 options into the request_sock if needed.
926  */
927 static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
928 {
929 	const struct ip_options *opt = &(IPCB(skb)->opt);
930 	struct ip_options_rcu *dopt = NULL;
931 
932 	if (opt && opt->optlen) {
933 		int opt_size = sizeof(*dopt) + opt->optlen;
934 
935 		dopt = kmalloc(opt_size, GFP_ATOMIC);
936 		if (dopt) {
937 			if (ip_options_echo(&dopt->opt, skb)) {
938 				kfree(dopt);
939 				dopt = NULL;
940 			}
941 		}
942 	}
943 	return dopt;
944 }
945 
946 #ifdef CONFIG_TCP_MD5SIG
947 /*
948  * RFC2385 MD5 checksumming requires a mapping of
949  * IP address->MD5 Key.
950  * We need to maintain these in the sk structure.
951  */
952 
953 /* Find the Key structure for an address.  */
954 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
955 					 const union tcp_md5_addr *addr,
956 					 int family)
957 {
958 	struct tcp_sock *tp = tcp_sk(sk);
959 	struct tcp_md5sig_key *key;
960 	unsigned int size = sizeof(struct in_addr);
961 	struct tcp_md5sig_info *md5sig;
962 
963 	/* caller either holds rcu_read_lock() or socket lock */
964 	md5sig = rcu_dereference_check(tp->md5sig_info,
965 				       sock_owned_by_user(sk) ||
966 				       lockdep_is_held(&sk->sk_lock.slock));
967 	if (!md5sig)
968 		return NULL;
969 #if IS_ENABLED(CONFIG_IPV6)
970 	if (family == AF_INET6)
971 		size = sizeof(struct in6_addr);
972 #endif
973 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
974 		if (key->family != family)
975 			continue;
976 		if (!memcmp(&key->addr, addr, size))
977 			return key;
978 	}
979 	return NULL;
980 }
981 EXPORT_SYMBOL(tcp_md5_do_lookup);
982 
983 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
984 					 struct sock *addr_sk)
985 {
986 	union tcp_md5_addr *addr;
987 
988 	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
989 	return tcp_md5_do_lookup(sk, addr, AF_INET);
990 }
991 EXPORT_SYMBOL(tcp_v4_md5_lookup);
992 
993 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
994 						      struct request_sock *req)
995 {
996 	union tcp_md5_addr *addr;
997 
998 	addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
999 	return tcp_md5_do_lookup(sk, addr, AF_INET);
1000 }
1001 
1002 /* This can be called on a newly created socket, from other files */
1003 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1004 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
1005 {
1006 	/* Add Key to the list */
1007 	struct tcp_md5sig_key *key;
1008 	struct tcp_sock *tp = tcp_sk(sk);
1009 	struct tcp_md5sig_info *md5sig;
1010 
1011 	key = tcp_md5_do_lookup(sk, addr, family);
1012 	if (key) {
1013 		/* Pre-existing entry - just update that one. */
1014 		memcpy(key->key, newkey, newkeylen);
1015 		key->keylen = newkeylen;
1016 		return 0;
1017 	}
1018 
1019 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1020 					   sock_owned_by_user(sk));
1021 	if (!md5sig) {
1022 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1023 		if (!md5sig)
1024 			return -ENOMEM;
1025 
1026 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1027 		INIT_HLIST_HEAD(&md5sig->head);
1028 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1029 	}
1030 
1031 	key = sock_kmalloc(sk, sizeof(*key), gfp);
1032 	if (!key)
1033 		return -ENOMEM;
1034 	if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1035 		sock_kfree_s(sk, key, sizeof(*key));
1036 		return -ENOMEM;
1037 	}
1038 
1039 	memcpy(key->key, newkey, newkeylen);
1040 	key->keylen = newkeylen;
1041 	key->family = family;
1042 	memcpy(&key->addr, addr,
1043 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1044 				      sizeof(struct in_addr));
1045 	hlist_add_head_rcu(&key->node, &md5sig->head);
1046 	return 0;
1047 }
1048 EXPORT_SYMBOL(tcp_md5_do_add);
1049 
1050 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1051 {
1052 	struct tcp_sock *tp = tcp_sk(sk);
1053 	struct tcp_md5sig_key *key;
1054 	struct tcp_md5sig_info *md5sig;
1055 
1056 	key = tcp_md5_do_lookup(sk, addr, family);
1057 	if (!key)
1058 		return -ENOENT;
1059 	hlist_del_rcu(&key->node);
1060 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1061 	kfree_rcu(key, rcu);
1062 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1063 					   sock_owned_by_user(sk));
1064 	if (hlist_empty(&md5sig->head))
1065 		tcp_free_md5sig_pool();
1066 	return 0;
1067 }
1068 EXPORT_SYMBOL(tcp_md5_do_del);
1069 
1070 static void tcp_clear_md5_list(struct sock *sk)
1071 {
1072 	struct tcp_sock *tp = tcp_sk(sk);
1073 	struct tcp_md5sig_key *key;
1074 	struct hlist_node *n;
1075 	struct tcp_md5sig_info *md5sig;
1076 
1077 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1078 
1079 	if (!hlist_empty(&md5sig->head))
1080 		tcp_free_md5sig_pool();
1081 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1082 		hlist_del_rcu(&key->node);
1083 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1084 		kfree_rcu(key, rcu);
1085 	}
1086 }
1087 
1088 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1089 				 int optlen)
1090 {
1091 	struct tcp_md5sig cmd;
1092 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1093 
1094 	if (optlen < sizeof(cmd))
1095 		return -EINVAL;
1096 
1097 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1098 		return -EFAULT;
1099 
1100 	if (sin->sin_family != AF_INET)
1101 		return -EINVAL;
1102 
1103 	if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1104 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1105 				      AF_INET);
1106 
1107 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1108 		return -EINVAL;
1109 
1110 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1111 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1112 			      GFP_KERNEL);
1113 }
1114 
1115 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1116 					__be32 daddr, __be32 saddr, int nbytes)
1117 {
1118 	struct tcp4_pseudohdr *bp;
1119 	struct scatterlist sg;
1120 
1121 	bp = &hp->md5_blk.ip4;
1122 
1123 	/*
1124 	 * 1. the TCP pseudo-header (in the order: source IP address,
1125 	 * destination IP address, zero-padded protocol number, and
1126 	 * segment length)
1127 	 */
1128 	bp->saddr = saddr;
1129 	bp->daddr = daddr;
1130 	bp->pad = 0;
1131 	bp->protocol = IPPROTO_TCP;
1132 	bp->len = cpu_to_be16(nbytes);
1133 
1134 	sg_init_one(&sg, bp, sizeof(*bp));
1135 	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1136 }
1137 
1138 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1139 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1140 {
1141 	struct tcp_md5sig_pool *hp;
1142 	struct hash_desc *desc;
1143 
1144 	hp = tcp_get_md5sig_pool();
1145 	if (!hp)
1146 		goto clear_hash_noput;
1147 	desc = &hp->md5_desc;
1148 
1149 	if (crypto_hash_init(desc))
1150 		goto clear_hash;
1151 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1152 		goto clear_hash;
1153 	if (tcp_md5_hash_header(hp, th))
1154 		goto clear_hash;
1155 	if (tcp_md5_hash_key(hp, key))
1156 		goto clear_hash;
1157 	if (crypto_hash_final(desc, md5_hash))
1158 		goto clear_hash;
1159 
1160 	tcp_put_md5sig_pool();
1161 	return 0;
1162 
1163 clear_hash:
1164 	tcp_put_md5sig_pool();
1165 clear_hash_noput:
1166 	memset(md5_hash, 0, 16);
1167 	return 1;
1168 }
1169 
1170 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1171 			const struct sock *sk, const struct request_sock *req,
1172 			const struct sk_buff *skb)
1173 {
1174 	struct tcp_md5sig_pool *hp;
1175 	struct hash_desc *desc;
1176 	const struct tcphdr *th = tcp_hdr(skb);
1177 	__be32 saddr, daddr;
1178 
1179 	if (sk) {
1180 		saddr = inet_sk(sk)->inet_saddr;
1181 		daddr = inet_sk(sk)->inet_daddr;
1182 	} else if (req) {
1183 		saddr = inet_rsk(req)->loc_addr;
1184 		daddr = inet_rsk(req)->rmt_addr;
1185 	} else {
1186 		const struct iphdr *iph = ip_hdr(skb);
1187 		saddr = iph->saddr;
1188 		daddr = iph->daddr;
1189 	}
1190 
1191 	hp = tcp_get_md5sig_pool();
1192 	if (!hp)
1193 		goto clear_hash_noput;
1194 	desc = &hp->md5_desc;
1195 
1196 	if (crypto_hash_init(desc))
1197 		goto clear_hash;
1198 
1199 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1200 		goto clear_hash;
1201 	if (tcp_md5_hash_header(hp, th))
1202 		goto clear_hash;
1203 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1204 		goto clear_hash;
1205 	if (tcp_md5_hash_key(hp, key))
1206 		goto clear_hash;
1207 	if (crypto_hash_final(desc, md5_hash))
1208 		goto clear_hash;
1209 
1210 	tcp_put_md5sig_pool();
1211 	return 0;
1212 
1213 clear_hash:
1214 	tcp_put_md5sig_pool();
1215 clear_hash_noput:
1216 	memset(md5_hash, 0, 16);
1217 	return 1;
1218 }
1219 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1220 
1221 static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1222 {
1223 	/*
1224 	 * This gets called for each TCP segment that arrives
1225 	 * so we want to be efficient.
1226 	 * We have 3 drop cases:
1227 	 * o No MD5 hash and one expected.
1228 	 * o MD5 hash and we're not expecting one.
1229 	 * o MD5 hash and it's wrong.
1230 	 */
1231 	const __u8 *hash_location = NULL;
1232 	struct tcp_md5sig_key *hash_expected;
1233 	const struct iphdr *iph = ip_hdr(skb);
1234 	const struct tcphdr *th = tcp_hdr(skb);
1235 	int genhash;
1236 	unsigned char newhash[16];
1237 
1238 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1239 					  AF_INET);
1240 	hash_location = tcp_parse_md5sig_option(th);
1241 
1242 	/* We've parsed the options - do we have a hash? */
1243 	if (!hash_expected && !hash_location)
1244 		return false;
1245 
1246 	if (hash_expected && !hash_location) {
1247 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1248 		return true;
1249 	}
1250 
1251 	if (!hash_expected && hash_location) {
1252 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1253 		return true;
1254 	}
1255 
1256 	/* Okay, so this is hash_expected and hash_location -
1257 	 * so we need to calculate the checksum.
1258 	 */
1259 	genhash = tcp_v4_md5_hash_skb(newhash,
1260 				      hash_expected,
1261 				      NULL, NULL, skb);
1262 
1263 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1264 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1265 				     &iph->saddr, ntohs(th->source),
1266 				     &iph->daddr, ntohs(th->dest),
1267 				     genhash ? " tcp_v4_calc_md5_hash failed"
1268 				     : "");
1269 		return true;
1270 	}
1271 	return false;
1272 }
1273 
1274 #endif
1275 
1276 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1277 	.family		=	PF_INET,
1278 	.obj_size	=	sizeof(struct tcp_request_sock),
1279 	.rtx_syn_ack	=	tcp_v4_rtx_synack,
1280 	.send_ack	=	tcp_v4_reqsk_send_ack,
1281 	.destructor	=	tcp_v4_reqsk_destructor,
1282 	.send_reset	=	tcp_v4_send_reset,
1283 	.syn_ack_timeout = 	tcp_syn_ack_timeout,
1284 };
1285 
1286 #ifdef CONFIG_TCP_MD5SIG
1287 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1288 	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1289 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1290 };
1291 #endif
1292 
1293 static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1294 			       struct request_sock *req,
1295 			       struct tcp_fastopen_cookie *foc,
1296 			       struct tcp_fastopen_cookie *valid_foc)
1297 {
1298 	bool skip_cookie = false;
1299 	struct fastopen_queue *fastopenq;
1300 
1301 	if (likely(!fastopen_cookie_present(foc))) {
1302 		/* See include/net/tcp.h for the meaning of these knobs */
1303 		if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1304 		    ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1305 		    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1306 			skip_cookie = true; /* no cookie to validate */
1307 		else
1308 			return false;
1309 	}
1310 	fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1311 	/* A FO option is present; bump the counter. */
1312 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1313 
1314 	/* Make sure the listener has enabled fastopen, and we don't
1315 	 * exceed the max # of pending TFO requests allowed before trying
1316 	 * to validate the cookie, in order to avoid burning CPU cycles
1317 	 * unnecessarily.
1318 	 *
1319 	 * XXX (TFO) - The implication of checking the max_qlen before
1320 	 * processing a cookie request is that clients can't differentiate
1321 	 * between qlen overflow causing Fast Open to be disabled
1322 	 * temporarily vs a server not supporting Fast Open at all.
1323 	 */
1324 	if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1325 	    fastopenq == NULL || fastopenq->max_qlen == 0)
1326 		return false;
1327 
1328 	if (fastopenq->qlen >= fastopenq->max_qlen) {
1329 		struct request_sock *req1;
1330 		spin_lock(&fastopenq->lock);
1331 		req1 = fastopenq->rskq_rst_head;
1332 		if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1333 			spin_unlock(&fastopenq->lock);
1334 			NET_INC_STATS_BH(sock_net(sk),
1335 			    LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1336 			/* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
1337 			foc->len = -1;
1338 			return false;
1339 		}
1340 		fastopenq->rskq_rst_head = req1->dl_next;
1341 		fastopenq->qlen--;
1342 		spin_unlock(&fastopenq->lock);
1343 		reqsk_free(req1);
1344 	}
1345 	if (skip_cookie) {
1346 		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1347 		return true;
1348 	}
1349 	if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1350 		if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1351 			tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1352 			if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1353 			    memcmp(&foc->val[0], &valid_foc->val[0],
1354 			    TCP_FASTOPEN_COOKIE_SIZE) != 0)
1355 				return false;
1356 			valid_foc->len = -1;
1357 		}
1358 		/* Acknowledge the data received from the peer. */
1359 		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1360 		return true;
1361 	} else if (foc->len == 0) { /* Client requesting a cookie */
1362 		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1363 		NET_INC_STATS_BH(sock_net(sk),
1364 		    LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1365 	} else {
1366 		/* Client sent a cookie with the wrong size. Treat it
1367 		 * the same as invalid and return a valid one.
1368 		 */
1369 		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1370 	}
1371 	return false;
1372 }
1373 
1374 static int tcp_v4_conn_req_fastopen(struct sock *sk,
1375 				    struct sk_buff *skb,
1376 				    struct sk_buff *skb_synack,
1377 				    struct request_sock *req)
1378 {
1379 	struct tcp_sock *tp = tcp_sk(sk);
1380 	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1381 	const struct inet_request_sock *ireq = inet_rsk(req);
1382 	struct sock *child;
1383 	int err;
1384 
1385 	req->num_retrans = 0;
1386 	req->num_timeout = 0;
1387 	req->sk = NULL;
1388 
1389 	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1390 	if (child == NULL) {
1391 		NET_INC_STATS_BH(sock_net(sk),
1392 				 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1393 		kfree_skb(skb_synack);
1394 		return -1;
1395 	}
1396 	err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1397 				    ireq->rmt_addr, ireq->opt);
1398 	err = net_xmit_eval(err);
1399 	if (!err)
1400 		tcp_rsk(req)->snt_synack = tcp_time_stamp;
1401 	/* XXX (TFO) - is it ok to ignore error and continue? */
1402 
1403 	spin_lock(&queue->fastopenq->lock);
1404 	queue->fastopenq->qlen++;
1405 	spin_unlock(&queue->fastopenq->lock);
1406 
1407 	/* Initialize the child socket. Have to fix some values to take
1408 	 * into account the child is a Fast Open socket and is created
1409 	 * only out of the bits carried in the SYN packet.
1410 	 */
1411 	tp = tcp_sk(child);
1412 
1413 	tp->fastopen_rsk = req;
1414 	/* Do a hold on the listener sk so that if the listener is being
1415 	 * closed, the child that has been accepted can live on and still
1416 	 * access listen_lock.
1417 	 */
1418 	sock_hold(sk);
1419 	tcp_rsk(req)->listener = sk;
1420 
1421 	/* RFC1323: The window in SYN & SYN/ACK segments is never
1422 	 * scaled. So correct it appropriately.
1423 	 */
1424 	tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1425 
1426 	/* Activate the retrans timer so that SYNACK can be retransmitted.
1427 	 * The request socket is not added to the SYN table of the parent
1428 	 * because it's been added to the accept queue directly.
1429 	 */
1430 	inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1431 	    TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1432 
1433 	/* Add the child socket directly into the accept queue */
1434 	inet_csk_reqsk_queue_add(sk, req, child);
1435 
1436 	/* Now finish processing the fastopen child socket. */
1437 	inet_csk(child)->icsk_af_ops->rebuild_header(child);
1438 	tcp_init_congestion_control(child);
1439 	tcp_mtup_init(child);
1440 	tcp_init_buffer_space(child);
1441 	tcp_init_metrics(child);
1442 
1443 	/* Queue the data carried in the SYN packet. We need to first
1444 	 * bump skb's refcnt because the caller will attempt to free it.
1445 	 *
1446 	 * XXX (TFO) - we honor a zero-payload TFO request for now.
1447 	 * (Any reason not to?)
1448 	 */
1449 	if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1450 		/* Don't queue the skb if there is no payload in SYN.
1451 		 * XXX (TFO) - How about SYN+FIN?
1452 		 */
1453 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1454 	} else {
1455 		skb = skb_get(skb);
1456 		skb_dst_drop(skb);
1457 		__skb_pull(skb, tcp_hdr(skb)->doff * 4);
1458 		skb_set_owner_r(skb, child);
1459 		__skb_queue_tail(&child->sk_receive_queue, skb);
1460 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1461 		tp->syn_data_acked = 1;
1462 	}
1463 	sk->sk_data_ready(sk, 0);
1464 	bh_unlock_sock(child);
1465 	sock_put(child);
1466 	WARN_ON(req->sk == NULL);
1467 	return 0;
1468 }
1469 
1470 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1471 {
1472 	struct tcp_options_received tmp_opt;
1473 	struct request_sock *req;
1474 	struct inet_request_sock *ireq;
1475 	struct tcp_sock *tp = tcp_sk(sk);
1476 	struct dst_entry *dst = NULL;
1477 	__be32 saddr = ip_hdr(skb)->saddr;
1478 	__be32 daddr = ip_hdr(skb)->daddr;
1479 	__u32 isn = TCP_SKB_CB(skb)->when;
1480 	bool want_cookie = false;
1481 	struct flowi4 fl4;
1482 	struct tcp_fastopen_cookie foc = { .len = -1 };
1483 	struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1484 	struct sk_buff *skb_synack;
1485 	int do_fastopen;
1486 
1487 	/* Never answer SYNs sent to broadcast or multicast */
1488 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1489 		goto drop;
1490 
1491 	/* TW buckets are converted to open requests without
1492 	 * limitations; they conserve resources and the peer is
1493 	 * evidently a real one.
1494 	 */
1495 	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1496 		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1497 		if (!want_cookie)
1498 			goto drop;
1499 	}
1500 
1501 	/* Accept backlog is full. If we have already queued enough
1502 	 * warm entries in the syn queue, drop the request. It is better than
1503 	 * clogging syn queue with openreqs with exponentially increasing
1504 	 * timeout.
1505 	 */
1506 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
1507 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1508 		goto drop;
1509 	}
1510 
1511 	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1512 	if (!req)
1513 		goto drop;
1514 
1515 #ifdef CONFIG_TCP_MD5SIG
1516 	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1517 #endif
1518 
1519 	tcp_clear_options(&tmp_opt);
1520 	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1521 	tmp_opt.user_mss  = tp->rx_opt.user_mss;
1522 	tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
1523 
1524 	if (want_cookie && !tmp_opt.saw_tstamp)
1525 		tcp_clear_options(&tmp_opt);
1526 
1527 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1528 	tcp_openreq_init(req, &tmp_opt, skb);
1529 
1530 	ireq = inet_rsk(req);
1531 	ireq->loc_addr = daddr;
1532 	ireq->rmt_addr = saddr;
1533 	ireq->no_srccheck = inet_sk(sk)->transparent;
1534 	ireq->opt = tcp_v4_save_options(skb);
1535 	ireq->ir_mark = inet_request_mark(sk, skb);
1536 
1537 	if (security_inet_conn_request(sk, skb, req))
1538 		goto drop_and_free;
1539 
1540 	if (!want_cookie || tmp_opt.tstamp_ok)
1541 		TCP_ECN_create_request(req, skb, sock_net(sk));
1542 
1543 	if (want_cookie) {
1544 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1545 		req->cookie_ts = tmp_opt.tstamp_ok;
1546 	} else if (!isn) {
1547 		/* VJ's idea. We save last timestamp seen
1548 		 * from the destination in peer table, when entering
1549 		 * state TIME-WAIT, and check against it before
1550 		 * accepting new connection request.
1551 		 *
1552 		 * If "isn" is not zero, this request hit alive
1553 		 * timewait bucket, so that all the necessary checks
1554 		 * are made in the function processing timewait state.
1555 		 */
1556 		if (tmp_opt.saw_tstamp &&
1557 		    tcp_death_row.sysctl_tw_recycle &&
1558 		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1559 		    fl4.daddr == saddr) {
1560 			if (!tcp_peer_is_proven(req, dst, true)) {
1561 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1562 				goto drop_and_release;
1563 			}
1564 		}
1565 		/* Kill the following clause, if you dislike this way. */
1566 		else if (!sysctl_tcp_syncookies &&
1567 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1568 			  (sysctl_max_syn_backlog >> 2)) &&
1569 			 !tcp_peer_is_proven(req, dst, false)) {
1570 			/* Without syncookies the last quarter of the
1571 			 * backlog is filled with destinations
1572 			 * proven to be alive.
1573 			 * It means that we continue to communicate only
1574 			 * with destinations already remembered
1575 			 * at the moment of the synflood.
1576 			 */
1577 			LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1578 				       &saddr, ntohs(tcp_hdr(skb)->source));
1579 			goto drop_and_release;
1580 		}
1581 
1582 		isn = tcp_v4_init_sequence(skb);
1583 	}
1584 	tcp_rsk(req)->snt_isn = isn;
1585 
1586 	if (dst == NULL) {
1587 		dst = inet_csk_route_req(sk, &fl4, req);
1588 		if (dst == NULL)
1589 			goto drop_and_free;
1590 	}
1591 	do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1592 
1593 	/* We don't call tcp_v4_send_synack() directly because we need
1594 	 * to make sure a child socket can be created successfully before
1595 	 * sending back synack!
1596 	 *
1597 	 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1598 	 * (or better yet, call tcp_send_synack() in the child context
1599 	 * directly, but we will have to fix a bunch of other code first)
1600 	 * after syn_recv_sock() except one will need to first fix the
1601 	 * latter to remove its dependency on the current implementation
1602 	 * of tcp_v4_send_synack()->tcp_select_initial_window().
1603 	 */
1604 	skb_synack = tcp_make_synack(sk, dst, req,
1605 	    fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1606 
1607 	if (skb_synack) {
1608 		__tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1609 		skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1610 	} else
1611 		goto drop_and_free;
1612 
1613 	if (likely(!do_fastopen)) {
1614 		int err;
1615 		err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1616 		     ireq->rmt_addr, ireq->opt);
1617 		err = net_xmit_eval(err);
1618 		if (err || want_cookie)
1619 			goto drop_and_free;
1620 
1621 		tcp_rsk(req)->snt_synack = tcp_time_stamp;
1622 		tcp_rsk(req)->listener = NULL;
1623 		/* Add the request_sock to the SYN table */
1624 		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1625 		if (fastopen_cookie_present(&foc) && foc.len != 0)
1626 			NET_INC_STATS_BH(sock_net(sk),
1627 			    LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1628 	} else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req))
1629 		goto drop_and_free;
1630 
1631 	return 0;
1632 
1633 drop_and_release:
1634 	dst_release(dst);
1635 drop_and_free:
1636 	reqsk_free(req);
1637 drop:
1638 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1639 	return 0;
1640 }
1641 EXPORT_SYMBOL(tcp_v4_conn_request);
1642 
1643 
1644 /*
1645  * The three way handshake has completed - we got a valid synack -
1646  * now create the new socket.
1647  */
1648 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1649 				  struct request_sock *req,
1650 				  struct dst_entry *dst)
1651 {
1652 	struct inet_request_sock *ireq;
1653 	struct inet_sock *newinet;
1654 	struct tcp_sock *newtp;
1655 	struct sock *newsk;
1656 #ifdef CONFIG_TCP_MD5SIG
1657 	struct tcp_md5sig_key *key;
1658 #endif
1659 	struct ip_options_rcu *inet_opt;
1660 
1661 	if (sk_acceptq_is_full(sk))
1662 		goto exit_overflow;
1663 
1664 	newsk = tcp_create_openreq_child(sk, req, skb);
1665 	if (!newsk)
1666 		goto exit_nonewsk;
1667 
1668 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1669 	inet_sk_rx_dst_set(newsk, skb);
1670 
1671 	newtp		      = tcp_sk(newsk);
1672 	newinet		      = inet_sk(newsk);
1673 	ireq		      = inet_rsk(req);
1674 	newinet->inet_daddr   = ireq->rmt_addr;
1675 	newinet->inet_rcv_saddr = ireq->loc_addr;
1676 	newinet->inet_saddr	      = ireq->loc_addr;
1677 	inet_opt	      = ireq->opt;
1678 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1679 	ireq->opt	      = NULL;
1680 	newinet->mc_index     = inet_iif(skb);
1681 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1682 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1683 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1684 	if (inet_opt)
1685 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1686 	newinet->inet_id = newtp->write_seq ^ jiffies;
1687 
1688 	if (!dst) {
1689 		dst = inet_csk_route_child_sock(sk, newsk, req);
1690 		if (!dst)
1691 			goto put_and_exit;
1692 	} else {
1693 		/* syncookie case : see end of cookie_v4_check() */
1694 	}
1695 	sk_setup_caps(newsk, dst);
1696 
1697 	tcp_mtup_init(newsk);
1698 	tcp_sync_mss(newsk, dst_mtu(dst));
1699 	newtp->advmss = dst_metric_advmss(dst);
1700 	if (tcp_sk(sk)->rx_opt.user_mss &&
1701 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1702 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1703 
1704 	tcp_initialize_rcv_mss(newsk);
1705 	tcp_synack_rtt_meas(newsk, req);
1706 	newtp->total_retrans = req->num_retrans;
1707 
1708 #ifdef CONFIG_TCP_MD5SIG
1709 	/* Copy over the MD5 key from the original socket */
1710 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1711 				AF_INET);
1712 	if (key != NULL) {
1713 		/*
1714 		 * We're using one, so create a matching key
1715 		 * on the newsk structure. If we fail to get
1716 		 * memory, then we end up not copying the key
1717 		 * across. Shucks.
1718 		 */
1719 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1720 			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1721 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1722 	}
1723 #endif
1724 
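	/* Bind the child to the listener's local port and insert it into
	 * the established hash so incoming segments can find it.
	 */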
1725 	if (__inet_inherit_port(sk, newsk) < 0)
1726 		goto put_and_exit;
1727 	__inet_hash_nolisten(newsk, NULL);
1728 
1729 	return newsk;
1730 
1731 exit_overflow:
1732 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1733 exit_nonewsk:
1734 	dst_release(dst);
1735 exit:
1736 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1737 	return NULL;
1738 put_and_exit:
1739 	inet_csk_prepare_forced_close(newsk);
1740 	tcp_done(newsk);
1741 	goto exit;
1742 }
1743 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1744 
1745 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1746 {
1747 	struct tcphdr *th = tcp_hdr(skb);
1748 	const struct iphdr *iph = ip_hdr(skb);
1749 	struct sock *nsk;
1750 	struct request_sock **prev;
1751 	/* Find possible connection requests. */
1752 	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1753 						       iph->saddr, iph->daddr);
1754 	if (req)
1755 		return tcp_check_req(sk, skb, req, prev, false);
1756 
1757 	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1758 			th->source, iph->daddr, th->dest, inet_iif(skb));
1759 
1760 	if (nsk) {
1761 		if (nsk->sk_state != TCP_TIME_WAIT) {
1762 			bh_lock_sock(nsk);
1763 			return nsk;
1764 		}
1765 		inet_twsk_put(inet_twsk(nsk));
1766 		return NULL;
1767 	}
1768 
1769 #ifdef CONFIG_SYN_COOKIES
1770 	if (!th->syn)
1771 		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1772 #endif
1773 	return sk;
1774 }
1775 
1776 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1777 {
1778 	const struct iphdr *iph = ip_hdr(skb);
1779 
1780 	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1781 		if (!tcp_v4_check(skb->len, iph->saddr,
1782 				  iph->daddr, skb->csum)) {
1783 			skb->ip_summed = CHECKSUM_UNNECESSARY;
1784 			return 0;
1785 		}
1786 	}
1787 
1788 	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1789 				       skb->len, IPPROTO_TCP, 0);
1790 
1791 	if (skb->len <= 76) {
1792 		return __skb_checksum_complete(skb);
1793 	}
1794 	return 0;
1795 }
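
/*
 * Illustrative user-space sketch (not part of this file): the checksum
 * that tcp_v4_check()/csum_tcpudp_nofold() above validate is the
 * standard RFC 793/1071 one's-complement sum over an IPv4 pseudo header
 * plus the TCP segment.  The helper names below (sum16,
 * tcp_v4_csum_sketch) are hypothetical and exist only for this sketch.
 */
#include <stdint.h>
#include <stddef.h>

/* Accumulate a byte buffer as big-endian 16-bit words (RFC 1071). */
static uint32_t sum16(const uint8_t *p, size_t len, uint32_t acc)
{
	while (len > 1) {
		acc += ((uint32_t)p[0] << 8) | p[1];
		p += 2;
		len -= 2;
	}
	if (len)			/* odd trailing byte, padded with zero */
		acc += (uint32_t)p[0] << 8;
	return acc;
}

/*
 * saddr/daddr in host byte order; seg points at a TCP header whose
 * checksum field is zero; returns the value to store in that field
 * (its two bytes go on the wire high byte first).
 */
static uint16_t tcp_v4_csum_sketch(uint32_t saddr, uint32_t daddr,
				   const uint8_t *seg, size_t seg_len)
{
	uint32_t acc = 0;

	acc += (saddr >> 16) + (saddr & 0xffff);	/* pseudo header */
	acc += (daddr >> 16) + (daddr & 0xffff);
	acc += 6 /* IPPROTO_TCP */ + (uint32_t)seg_len;
	acc  = sum16(seg, seg_len, acc);		/* TCP header + payload */
	while (acc >> 16)				/* fold the carries */
		acc = (acc & 0xffff) + (acc >> 16);
	return (uint16_t)~acc;
}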
1796 
1797 
1798 /* The socket must have its spinlock held when we get
1799  * here.
1800  *
1801  * We have a potential double-lock case here, so even when
1802  * doing backlog processing we use the BH locking scheme.
1803  * This is because we cannot sleep with the original spinlock
1804  * held.
1805  */
1806 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1807 {
1808 	struct sock *rsk;
1809 #ifdef CONFIG_TCP_MD5SIG
1810 	/*
1811 	 * We really want to reject the packet as early as possible
1812 	 * if:
1813 	 *  o We're expecting an MD5-signed packet and there is no MD5 TCP option
1814 	 *  o There is an MD5 option and we're not expecting one
1815 	 */
1816 	if (tcp_v4_inbound_md5_hash(sk, skb))
1817 		goto discard;
1818 #endif
1819 
1820 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1821 		struct dst_entry *dst = sk->sk_rx_dst;
1822 
1823 		sock_rps_save_rxhash(sk, skb);
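		/* Revalidate the cached input route: drop it if the packet
		 * arrived on a different interface or the dst entry has
		 * been invalidated.
		 */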
1824 		if (dst) {
1825 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1826 			    dst->ops->check(dst, 0) == NULL) {
1827 				dst_release(dst);
1828 				sk->sk_rx_dst = NULL;
1829 			}
1830 		}
1831 		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1832 			rsk = sk;
1833 			goto reset;
1834 		}
1835 		return 0;
1836 	}
1837 
1838 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1839 		goto csum_err;
1840 
1841 	if (sk->sk_state == TCP_LISTEN) {
1842 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1843 		if (!nsk)
1844 			goto discard;
1845 
1846 		if (nsk != sk) {
1847 			sock_rps_save_rxhash(nsk, skb);
1848 			if (tcp_child_process(sk, nsk, skb)) {
1849 				rsk = nsk;
1850 				goto reset;
1851 			}
1852 			return 0;
1853 		}
1854 	} else
1855 		sock_rps_save_rxhash(sk, skb);
1856 
1857 	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1858 		rsk = sk;
1859 		goto reset;
1860 	}
1861 	return 0;
1862 
1863 reset:
1864 	tcp_v4_send_reset(rsk, skb);
1865 discard:
1866 	kfree_skb(skb);
1867 	/* Be careful here. If this function gets more complicated and
1868 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1869 	 * might be destroyed here. This current version compiles correctly,
1870 	 * but you have been warned.
1871 	 */
1872 	return 0;
1873 
1874 csum_err:
1875 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1876 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1877 	goto discard;
1878 }
1879 EXPORT_SYMBOL(tcp_v4_do_rcv);
1880 
1881 void tcp_v4_early_demux(struct sk_buff *skb)
1882 {
1883 	const struct iphdr *iph;
1884 	const struct tcphdr *th;
1885 	struct sock *sk;
1886 
1887 	if (skb->pkt_type != PACKET_HOST)
1888 		return;
1889 
1890 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1891 		return;
1892 
1893 	iph = ip_hdr(skb);
1894 	th = tcp_hdr(skb);
1895 
1896 	if (th->doff < sizeof(struct tcphdr) / 4)
1897 		return;
1898 
1899 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1900 				       iph->saddr, th->source,
1901 				       iph->daddr, ntohs(th->dest),
1902 				       skb->skb_iif);
1903 	if (sk) {
1904 		skb->sk = sk;
1905 		skb->destructor = sock_edemux;
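		/* For non-TIME_WAIT sockets, also attach the socket's cached
		 * input route so the IP layer can skip its routing lookup.
		 */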
1906 		if (sk->sk_state != TCP_TIME_WAIT) {
1907 			struct dst_entry *dst = sk->sk_rx_dst;
1908 
1909 			if (dst)
1910 				dst = dst_check(dst, 0);
1911 			if (dst &&
1912 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1913 				skb_dst_set_noref(skb, dst);
1914 		}
1915 	}
1916 }
1917 
1918 /* Packet is added to VJ-style prequeue for processing in process
1919  * context, if a reader task is waiting. Apparently, this exciting
1920  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1921  * failed somewhere. Latency? Burstiness? Well, at least now we will
1922  * see why it failed. 8)8)				  --ANK
1923  *
1924  */
1925 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1926 {
1927 	struct tcp_sock *tp = tcp_sk(sk);
1928 
1929 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1930 		return false;
1931 
1932 	if (skb->len <= tcp_hdrlen(skb) &&
1933 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1934 		return false;
1935 
1936 	skb_dst_force(skb);
1937 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1938 	tp->ucopy.memory += skb->truesize;
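	/* If the prequeue has outgrown the receive buffer, flush every
	 * queued segment through the regular receive path and account
	 * them as prequeue drops.
	 */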
1939 	if (tp->ucopy.memory > sk->sk_rcvbuf) {
1940 		struct sk_buff *skb1;
1941 
1942 		BUG_ON(sock_owned_by_user(sk));
1943 
1944 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1945 			sk_backlog_rcv(sk, skb1);
1946 			NET_INC_STATS_BH(sock_net(sk),
1947 					 LINUX_MIB_TCPPREQUEUEDROPPED);
1948 		}
1949 
1950 		tp->ucopy.memory = 0;
1951 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1952 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1953 					   POLLIN | POLLRDNORM | POLLRDBAND);
1954 		if (!inet_csk_ack_scheduled(sk))
1955 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1956 						  (3 * tcp_rto_min(sk)) / 4,
1957 						  TCP_RTO_MAX);
1958 	}
1959 	return true;
1960 }
1961 EXPORT_SYMBOL(tcp_prequeue);
1962 
1963 /*
1964  *	From tcp_input.c
1965  */
1966 
1967 int tcp_v4_rcv(struct sk_buff *skb)
1968 {
1969 	const struct iphdr *iph;
1970 	const struct tcphdr *th;
1971 	struct sock *sk;
1972 	int ret;
1973 	struct net *net = dev_net(skb->dev);
1974 
1975 	if (skb->pkt_type != PACKET_HOST)
1976 		goto discard_it;
1977 
1978 	/* Count it even if it's bad */
1979 	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1980 
1981 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1982 		goto discard_it;
1983 
1984 	th = tcp_hdr(skb);
1985 
1986 	if (th->doff < sizeof(struct tcphdr) / 4)
1987 		goto bad_packet;
1988 	if (!pskb_may_pull(skb, th->doff * 4))
1989 		goto discard_it;
1990 
1991 	/* An explanation is required here, I think.
1992 	 * Packet length and doff are validated by header prediction,
1993 	 * provided the case of th->doff == 0 is eliminated.
1994 	 * So, we defer the checks. */
1995 	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1996 		goto csum_error;
1997 
1998 	th = tcp_hdr(skb);
1999 	iph = ip_hdr(skb);
2000 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2001 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2002 				    skb->len - th->doff * 4);
2003 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2004 	TCP_SKB_CB(skb)->when	 = 0;
2005 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2006 	TCP_SKB_CB(skb)->sacked	 = 0;
2007 
2008 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
2009 	if (!sk)
2010 		goto no_tcp_socket;
2011 
2012 process:
2013 	if (sk->sk_state == TCP_TIME_WAIT)
2014 		goto do_time_wait;
2015 
2016 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2017 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
2018 		goto discard_and_relse;
2019 	}
2020 
2021 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2022 		goto discard_and_relse;
2023 	nf_reset(skb);
2024 
2025 	if (sk_filter(sk, skb))
2026 		goto discard_and_relse;
2027 
2028 	skb->dev = NULL;
2029 
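	/* If no user context owns the socket, process the segment now
	 * (possibly via the prequeue); otherwise queue it on the backlog,
	 * bounded by rcvbuf + sndbuf.
	 */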
2030 	bh_lock_sock_nested(sk);
2031 	ret = 0;
2032 	if (!sock_owned_by_user(sk)) {
2033 #ifdef CONFIG_NET_DMA
2034 		struct tcp_sock *tp = tcp_sk(sk);
2035 		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
2036 			tp->ucopy.dma_chan = net_dma_find_channel();
2037 		if (tp->ucopy.dma_chan)
2038 			ret = tcp_v4_do_rcv(sk, skb);
2039 		else
2040 #endif
2041 		{
2042 			if (!tcp_prequeue(sk, skb))
2043 				ret = tcp_v4_do_rcv(sk, skb);
2044 		}
2045 	} else if (unlikely(sk_add_backlog(sk, skb,
2046 					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
2047 		bh_unlock_sock(sk);
2048 		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
2049 		goto discard_and_relse;
2050 	}
2051 	bh_unlock_sock(sk);
2052 
2053 	sock_put(sk);
2054 
2055 	return ret;
2056 
2057 no_tcp_socket:
2058 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2059 		goto discard_it;
2060 
2061 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2062 csum_error:
2063 		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
2064 bad_packet:
2065 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2066 	} else {
2067 		tcp_v4_send_reset(NULL, skb);
2068 	}
2069 
2070 discard_it:
2071 	/* Discard frame. */
2072 	kfree_skb(skb);
2073 	return 0;
2074 
2075 discard_and_relse:
2076 	sock_put(sk);
2077 	goto discard_it;
2078 
2079 do_time_wait:
2080 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2081 		inet_twsk_put(inet_twsk(sk));
2082 		goto discard_it;
2083 	}
2084 
2085 	if (skb->len < (th->doff << 2)) {
2086 		inet_twsk_put(inet_twsk(sk));
2087 		goto bad_packet;
2088 	}
2089 	if (tcp_checksum_complete(skb)) {
2090 		inet_twsk_put(inet_twsk(sk));
2091 		goto csum_error;
2092 	}
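	/* A SYN aimed at a matching listener may legitimately reopen a
	 * connection that is still in TIME_WAIT (TCP_TW_SYN below).
	 */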
2093 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2094 	case TCP_TW_SYN: {
2095 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2096 							&tcp_hashinfo,
2097 							iph->saddr, th->source,
2098 							iph->daddr, th->dest,
2099 							inet_iif(skb));
2100 		if (sk2) {
2101 			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
2102 			inet_twsk_put(inet_twsk(sk));
2103 			sk = sk2;
2104 			goto process;
2105 		}
2106 		/* Fall through to ACK */
2107 	}
2108 	case TCP_TW_ACK:
2109 		tcp_v4_timewait_ack(sk, skb);
2110 		break;
2111 	case TCP_TW_RST:
2112 		goto no_tcp_socket;
2113 	case TCP_TW_SUCCESS:;
2114 	}
2115 	goto discard_it;
2116 }
2117 
2118 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2119 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2120 	.twsk_unique	= tcp_twsk_unique,
2121 	.twsk_destructor= tcp_twsk_destructor,
2122 };
2123 
2124 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2125 {
2126 	struct dst_entry *dst = skb_dst(skb);
2127 
2128 	dst_hold(dst);
2129 	sk->sk_rx_dst = dst;
2130 	inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2131 }
2132 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2133 
2134 const struct inet_connection_sock_af_ops ipv4_specific = {
2135 	.queue_xmit	   = ip_queue_xmit,
2136 	.send_check	   = tcp_v4_send_check,
2137 	.rebuild_header	   = inet_sk_rebuild_header,
2138 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2139 	.conn_request	   = tcp_v4_conn_request,
2140 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2141 	.net_header_len	   = sizeof(struct iphdr),
2142 	.setsockopt	   = ip_setsockopt,
2143 	.getsockopt	   = ip_getsockopt,
2144 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2145 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2146 	.bind_conflict	   = inet_csk_bind_conflict,
2147 #ifdef CONFIG_COMPAT
2148 	.compat_setsockopt = compat_ip_setsockopt,
2149 	.compat_getsockopt = compat_ip_getsockopt,
2150 #endif
2151 };
2152 EXPORT_SYMBOL(ipv4_specific);
2153 
2154 #ifdef CONFIG_TCP_MD5SIG
2155 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2156 	.md5_lookup		= tcp_v4_md5_lookup,
2157 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2158 	.md5_parse		= tcp_v4_parse_md5_keys,
2159 };
2160 #endif
2161 
2162 /* NOTE: A lot of things are set to zero explicitly by the call to
2163  *       sk_alloc(), so they need not be done here.
2164  */
2165 static int tcp_v4_init_sock(struct sock *sk)
2166 {
2167 	struct inet_connection_sock *icsk = inet_csk(sk);
2168 
2169 	tcp_init_sock(sk);
2170 
2171 	icsk->icsk_af_ops = &ipv4_specific;
2172 
2173 #ifdef CONFIG_TCP_MD5SIG
2174 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2175 #endif
2176 
2177 	return 0;
2178 }
2179 
2180 void tcp_v4_destroy_sock(struct sock *sk)
2181 {
2182 	struct tcp_sock *tp = tcp_sk(sk);
2183 
2184 	tcp_clear_xmit_timers(sk);
2185 
2186 	tcp_cleanup_congestion_control(sk);
2187 
2188 	/* Clean up the write buffer. */
2189 	tcp_write_queue_purge(sk);
2190 
2191 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2192 	__skb_queue_purge(&tp->out_of_order_queue);
2193 
2194 #ifdef CONFIG_TCP_MD5SIG
2195 	/* Clean up the MD5 key list, if any */
2196 	if (tp->md5sig_info) {
2197 		tcp_clear_md5_list(sk);
2198 		kfree_rcu(tp->md5sig_info, rcu);
2199 		tp->md5sig_info = NULL;
2200 	}
2201 #endif
2202 
2203 #ifdef CONFIG_NET_DMA
2204 	/* Cleans up our sk_async_wait_queue */
2205 	__skb_queue_purge(&sk->sk_async_wait_queue);
2206 #endif
2207 
2208 	/* Clean up the prequeue; it really should be empty by now. */
2209 	__skb_queue_purge(&tp->ucopy.prequeue);
2210 
2211 	/* Clean up a referenced TCP bind bucket. */
2212 	if (inet_csk(sk)->icsk_bind_hash)
2213 		inet_put_port(sk);
2214 
2215 	BUG_ON(tp->fastopen_rsk != NULL);
2216 
2217 	/* If the socket was aborted during the connect operation */
2218 	tcp_free_fastopen_req(tp);
2219 
2220 	sk_sockets_allocated_dec(sk);
2221 	sock_release_memcg(sk);
2222 }
2223 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2224 
2225 #ifdef CONFIG_PROC_FS
2226 /* Proc filesystem TCP sock list dumping. */
2227 
2228 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
2229 {
2230 	return hlist_nulls_empty(head) ? NULL :
2231 		list_entry(head->first, struct inet_timewait_sock, tw_node);
2232 }
2233 
2234 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2235 {
2236 	return !is_a_nulls(tw->tw_node.next) ?
2237 		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2238 }
2239 
2240 /*
2241  * Get the next listener socket following cur.  If cur is NULL, get the first socket
2242  * starting from bucket given in st->bucket; when st->bucket is zero the
2243  * very first socket in the hash table is returned.
2244  */
2245 static void *listening_get_next(struct seq_file *seq, void *cur)
2246 {
2247 	struct inet_connection_sock *icsk;
2248 	struct hlist_nulls_node *node;
2249 	struct sock *sk = cur;
2250 	struct inet_listen_hashbucket *ilb;
2251 	struct tcp_iter_state *st = seq->private;
2252 	struct net *net = seq_file_net(seq);
2253 
2254 	if (!sk) {
2255 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2256 		spin_lock_bh(&ilb->lock);
2257 		sk = sk_nulls_head(&ilb->head);
2258 		st->offset = 0;
2259 		goto get_sk;
2260 	}
2261 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2262 	++st->num;
2263 	++st->offset;
2264 
2265 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
2266 		struct request_sock *req = cur;
2267 
2268 		icsk = inet_csk(st->syn_wait_sk);
2269 		req = req->dl_next;
2270 		while (1) {
2271 			while (req) {
2272 				if (req->rsk_ops->family == st->family) {
2273 					cur = req;
2274 					goto out;
2275 				}
2276 				req = req->dl_next;
2277 			}
2278 			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2279 				break;
2280 get_req:
2281 			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2282 		}
2283 		sk	  = sk_nulls_next(st->syn_wait_sk);
2284 		st->state = TCP_SEQ_STATE_LISTENING;
2285 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2286 	} else {
2287 		icsk = inet_csk(sk);
2288 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2289 		if (reqsk_queue_len(&icsk->icsk_accept_queue))
2290 			goto start_req;
2291 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2292 		sk = sk_nulls_next(sk);
2293 	}
2294 get_sk:
2295 	sk_nulls_for_each_from(sk, node) {
2296 		if (!net_eq(sock_net(sk), net))
2297 			continue;
2298 		if (sk->sk_family == st->family) {
2299 			cur = sk;
2300 			goto out;
2301 		}
2302 		icsk = inet_csk(sk);
2303 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2304 		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2305 start_req:
2306 			st->uid		= sock_i_uid(sk);
2307 			st->syn_wait_sk = sk;
2308 			st->state	= TCP_SEQ_STATE_OPENREQ;
2309 			st->sbucket	= 0;
2310 			goto get_req;
2311 		}
2312 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2313 	}
2314 	spin_unlock_bh(&ilb->lock);
2315 	st->offset = 0;
2316 	if (++st->bucket < INET_LHTABLE_SIZE) {
2317 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2318 		spin_lock_bh(&ilb->lock);
2319 		sk = sk_nulls_head(&ilb->head);
2320 		goto get_sk;
2321 	}
2322 	cur = NULL;
2323 out:
2324 	return cur;
2325 }
2326 
2327 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2328 {
2329 	struct tcp_iter_state *st = seq->private;
2330 	void *rc;
2331 
2332 	st->bucket = 0;
2333 	st->offset = 0;
2334 	rc = listening_get_next(seq, NULL);
2335 
2336 	while (rc && *pos) {
2337 		rc = listening_get_next(seq, rc);
2338 		--*pos;
2339 	}
2340 	return rc;
2341 }
2342 
2343 static inline bool empty_bucket(struct tcp_iter_state *st)
2344 {
2345 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2346 		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2347 }
2348 
2349 /*
2350  * Get first established socket starting from bucket given in st->bucket.
2351  * If st->bucket is zero, the very first socket in the hash is returned.
2352  */
2353 static void *established_get_first(struct seq_file *seq)
2354 {
2355 	struct tcp_iter_state *st = seq->private;
2356 	struct net *net = seq_file_net(seq);
2357 	void *rc = NULL;
2358 
2359 	st->offset = 0;
2360 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2361 		struct sock *sk;
2362 		struct hlist_nulls_node *node;
2363 		struct inet_timewait_sock *tw;
2364 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2365 
2366 		/* Lockless fast path for the common case of empty buckets */
2367 		if (empty_bucket(st))
2368 			continue;
2369 
2370 		spin_lock_bh(lock);
2371 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2372 			if (sk->sk_family != st->family ||
2373 			    !net_eq(sock_net(sk), net)) {
2374 				continue;
2375 			}
2376 			rc = sk;
2377 			goto out;
2378 		}
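		/* No matching established socket in this bucket; walk its
		 * TIME_WAIT chain under the same bucket lock.
		 */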
2379 		st->state = TCP_SEQ_STATE_TIME_WAIT;
2380 		inet_twsk_for_each(tw, node,
2381 				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2382 			if (tw->tw_family != st->family ||
2383 			    !net_eq(twsk_net(tw), net)) {
2384 				continue;
2385 			}
2386 			rc = tw;
2387 			goto out;
2388 		}
2389 		spin_unlock_bh(lock);
2390 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2391 	}
2392 out:
2393 	return rc;
2394 }
2395 
2396 static void *established_get_next(struct seq_file *seq, void *cur)
2397 {
2398 	struct sock *sk = cur;
2399 	struct inet_timewait_sock *tw;
2400 	struct hlist_nulls_node *node;
2401 	struct tcp_iter_state *st = seq->private;
2402 	struct net *net = seq_file_net(seq);
2403 
2404 	++st->num;
2405 	++st->offset;
2406 
2407 	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2408 		tw = cur;
2409 		tw = tw_next(tw);
2410 get_tw:
2411 		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2412 			tw = tw_next(tw);
2413 		}
2414 		if (tw) {
2415 			cur = tw;
2416 			goto out;
2417 		}
2418 		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2419 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2420 
2421 		/* Look for the next non-empty bucket */
2422 		st->offset = 0;
2423 		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2424 				empty_bucket(st))
2425 			;
2426 		if (st->bucket > tcp_hashinfo.ehash_mask)
2427 			return NULL;
2428 
2429 		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2430 		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2431 	} else
2432 		sk = sk_nulls_next(sk);
2433 
2434 	sk_nulls_for_each_from(sk, node) {
2435 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2436 			goto found;
2437 	}
2438 
2439 	st->state = TCP_SEQ_STATE_TIME_WAIT;
2440 	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2441 	goto get_tw;
2442 found:
2443 	cur = sk;
2444 out:
2445 	return cur;
2446 }
2447 
2448 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2449 {
2450 	struct tcp_iter_state *st = seq->private;
2451 	void *rc;
2452 
2453 	st->bucket = 0;
2454 	rc = established_get_first(seq);
2455 
2456 	while (rc && pos) {
2457 		rc = established_get_next(seq, rc);
2458 		--pos;
2459 	}
2460 	return rc;
2461 }
2462 
2463 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2464 {
2465 	void *rc;
2466 	struct tcp_iter_state *st = seq->private;
2467 
2468 	st->state = TCP_SEQ_STATE_LISTENING;
2469 	rc	  = listening_get_idx(seq, &pos);
2470 
2471 	if (!rc) {
2472 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2473 		rc	  = established_get_idx(seq, pos);
2474 	}
2475 
2476 	return rc;
2477 }
2478 
2479 static void *tcp_seek_last_pos(struct seq_file *seq)
2480 {
2481 	struct tcp_iter_state *st = seq->private;
2482 	int offset = st->offset;
2483 	int orig_num = st->num;
2484 	void *rc = NULL;
2485 
2486 	switch (st->state) {
2487 	case TCP_SEQ_STATE_OPENREQ:
2488 	case TCP_SEQ_STATE_LISTENING:
2489 		if (st->bucket >= INET_LHTABLE_SIZE)
2490 			break;
2491 		st->state = TCP_SEQ_STATE_LISTENING;
2492 		rc = listening_get_next(seq, NULL);
2493 		while (offset-- && rc)
2494 			rc = listening_get_next(seq, rc);
2495 		if (rc)
2496 			break;
2497 		st->bucket = 0;
2498 		/* Fallthrough */
2499 	case TCP_SEQ_STATE_ESTABLISHED:
2500 	case TCP_SEQ_STATE_TIME_WAIT:
2501 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2502 		if (st->bucket > tcp_hashinfo.ehash_mask)
2503 			break;
2504 		rc = established_get_first(seq);
2505 		while (offset-- && rc)
2506 			rc = established_get_next(seq, rc);
2507 	}
2508 
2509 	st->num = orig_num;
2510 
2511 	return rc;
2512 }
2513 
2514 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2515 {
2516 	struct tcp_iter_state *st = seq->private;
2517 	void *rc;
2518 
2519 	if (*pos && *pos == st->last_pos) {
2520 		rc = tcp_seek_last_pos(seq);
2521 		if (rc)
2522 			goto out;
2523 	}
2524 
2525 	st->state = TCP_SEQ_STATE_LISTENING;
2526 	st->num = 0;
2527 	st->bucket = 0;
2528 	st->offset = 0;
2529 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2530 
2531 out:
2532 	st->last_pos = *pos;
2533 	return rc;
2534 }
2535 
2536 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2537 {
2538 	struct tcp_iter_state *st = seq->private;
2539 	void *rc = NULL;
2540 
2541 	if (v == SEQ_START_TOKEN) {
2542 		rc = tcp_get_idx(seq, 0);
2543 		goto out;
2544 	}
2545 
2546 	switch (st->state) {
2547 	case TCP_SEQ_STATE_OPENREQ:
2548 	case TCP_SEQ_STATE_LISTENING:
2549 		rc = listening_get_next(seq, v);
2550 		if (!rc) {
2551 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2552 			st->bucket = 0;
2553 			st->offset = 0;
2554 			rc	  = established_get_first(seq);
2555 		}
2556 		break;
2557 	case TCP_SEQ_STATE_ESTABLISHED:
2558 	case TCP_SEQ_STATE_TIME_WAIT:
2559 		rc = established_get_next(seq, v);
2560 		break;
2561 	}
2562 out:
2563 	++*pos;
2564 	st->last_pos = *pos;
2565 	return rc;
2566 }
2567 
2568 static void tcp_seq_stop(struct seq_file *seq, void *v)
2569 {
2570 	struct tcp_iter_state *st = seq->private;
2571 
2572 	switch (st->state) {
2573 	case TCP_SEQ_STATE_OPENREQ:
2574 		if (v) {
2575 			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2576 			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2577 		}
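		/* fall through: the listening bucket lock is released below */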
2578 	case TCP_SEQ_STATE_LISTENING:
2579 		if (v != SEQ_START_TOKEN)
2580 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2581 		break;
2582 	case TCP_SEQ_STATE_TIME_WAIT:
2583 	case TCP_SEQ_STATE_ESTABLISHED:
2584 		if (v)
2585 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2586 		break;
2587 	}
2588 }
2589 
2590 int tcp_seq_open(struct inode *inode, struct file *file)
2591 {
2592 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2593 	struct tcp_iter_state *s;
2594 	int err;
2595 
2596 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2597 			  sizeof(struct tcp_iter_state));
2598 	if (err < 0)
2599 		return err;
2600 
2601 	s = ((struct seq_file *)file->private_data)->private;
2602 	s->family		= afinfo->family;
2603 	s->last_pos 		= 0;
2604 	return 0;
2605 }
2606 EXPORT_SYMBOL(tcp_seq_open);
2607 
2608 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2609 {
2610 	int rc = 0;
2611 	struct proc_dir_entry *p;
2612 
2613 	afinfo->seq_ops.start		= tcp_seq_start;
2614 	afinfo->seq_ops.next		= tcp_seq_next;
2615 	afinfo->seq_ops.stop		= tcp_seq_stop;
2616 
2617 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2618 			     afinfo->seq_fops, afinfo);
2619 	if (!p)
2620 		rc = -ENOMEM;
2621 	return rc;
2622 }
2623 EXPORT_SYMBOL(tcp_proc_register);
2624 
2625 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2626 {
2627 	remove_proc_entry(afinfo->name, net->proc_net);
2628 }
2629 EXPORT_SYMBOL(tcp_proc_unregister);
2630 
2631 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2632 			 struct seq_file *f, int i, kuid_t uid, int *len)
2633 {
2634 	const struct inet_request_sock *ireq = inet_rsk(req);
2635 	long delta = req->expires - jiffies;
2636 
2637 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2638 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2639 		i,
2640 		ireq->loc_addr,
2641 		ntohs(inet_sk(sk)->inet_sport),
2642 		ireq->rmt_addr,
2643 		ntohs(ireq->rmt_port),
2644 		TCP_SYN_RECV,
2645 		0, 0, /* could print option size, but that is af dependent. */
2646 		1,    /* timers active (only the expire timer) */
2647 		jiffies_delta_to_clock_t(delta),
2648 		req->num_timeout,
2649 		from_kuid_munged(seq_user_ns(f), uid),
2650 		0,  /* non standard timer */
2651 		0, /* open_requests have no inode */
2652 		atomic_read(&sk->sk_refcnt),
2653 		req,
2654 		len);
2655 }
2656 
2657 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2658 {
2659 	int timer_active;
2660 	unsigned long timer_expires;
2661 	const struct tcp_sock *tp = tcp_sk(sk);
2662 	const struct inet_connection_sock *icsk = inet_csk(sk);
2663 	const struct inet_sock *inet = inet_sk(sk);
2664 	struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2665 	__be32 dest = inet->inet_daddr;
2666 	__be32 src = inet->inet_rcv_saddr;
2667 	__u16 destp = ntohs(inet->inet_dport);
2668 	__u16 srcp = ntohs(inet->inet_sport);
2669 	int rx_queue;
2670 
2671 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2672 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2673 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2674 		timer_active	= 1;
2675 		timer_expires	= icsk->icsk_timeout;
2676 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2677 		timer_active	= 4;
2678 		timer_expires	= icsk->icsk_timeout;
2679 	} else if (timer_pending(&sk->sk_timer)) {
2680 		timer_active	= 2;
2681 		timer_expires	= sk->sk_timer.expires;
2682 	} else {
2683 		timer_active	= 0;
2684 		timer_expires = jiffies;
2685 	}
2686 
2687 	if (sk->sk_state == TCP_LISTEN)
2688 		rx_queue = sk->sk_ack_backlog;
2689 	else
2690 		/*
2691 		 * Because we don't lock the socket, we might find a transient negative value.
2692 		 */
2693 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2694 
2695 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2696 			"%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2697 		i, src, srcp, dest, destp, sk->sk_state,
2698 		tp->write_seq - tp->snd_una,
2699 		rx_queue,
2700 		timer_active,
2701 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2702 		icsk->icsk_retransmits,
2703 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2704 		icsk->icsk_probes_out,
2705 		sock_i_ino(sk),
2706 		atomic_read(&sk->sk_refcnt), sk,
2707 		jiffies_to_clock_t(icsk->icsk_rto),
2708 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2709 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2710 		tp->snd_cwnd,
2711 		sk->sk_state == TCP_LISTEN ?
2712 		    (fastopenq ? fastopenq->max_qlen : 0) :
2713 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
2714 		len);
2715 }
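
/*
 * Illustrative user-space sketch (not part of this file): reading back
 * one row produced by get_tcp4_sock() above from /proc/net/tcp.  The
 * addresses are printed with "%08X" straight from __be32 values, so
 * re-parsing them with "%X" on the same machine recovers the original
 * network-byte-order word.  The helper name print_tcp4_line() is
 * hypothetical.  Typical use: skip the header line emitted by
 * tcp4_seq_show(), then call this for every remaining line.
 */
#include <stdio.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static void print_tcp4_line(const char *line)
{
	unsigned int sl, laddr, lport, raddr, rport, state;
	struct in_addr a;

	/* "  sl  local_address rem_address   st ..." */
	if (sscanf(line, " %u: %8X:%4X %8X:%4X %2X",
		   &sl, &laddr, &lport, &raddr, &rport, &state) != 6)
		return;

	a.s_addr = laddr;	/* same bit pattern as the kernel's __be32 */
	printf("slot %u  %s:%u  state %#04x\n",
	       sl, inet_ntoa(a), lport, state);
}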
2716 
2717 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2718 			       struct seq_file *f, int i, int *len)
2719 {
2720 	__be32 dest, src;
2721 	__u16 destp, srcp;
2722 	long delta = tw->tw_ttd - jiffies;
2723 
2724 	dest  = tw->tw_daddr;
2725 	src   = tw->tw_rcv_saddr;
2726 	destp = ntohs(tw->tw_dport);
2727 	srcp  = ntohs(tw->tw_sport);
2728 
2729 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2730 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2731 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2732 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2733 		atomic_read(&tw->tw_refcnt), tw, len);
2734 }
2735 
2736 #define TMPSZ 150
2737 
2738 static int tcp4_seq_show(struct seq_file *seq, void *v)
2739 {
2740 	struct tcp_iter_state *st;
2741 	int len;
2742 
2743 	if (v == SEQ_START_TOKEN) {
2744 		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2745 			   "  sl  local_address rem_address   st tx_queue "
2746 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2747 			   "inode");
2748 		goto out;
2749 	}
2750 	st = seq->private;
2751 
2752 	switch (st->state) {
2753 	case TCP_SEQ_STATE_LISTENING:
2754 	case TCP_SEQ_STATE_ESTABLISHED:
2755 		get_tcp4_sock(v, seq, st->num, &len);
2756 		break;
2757 	case TCP_SEQ_STATE_OPENREQ:
2758 		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2759 		break;
2760 	case TCP_SEQ_STATE_TIME_WAIT:
2761 		get_timewait4_sock(v, seq, st->num, &len);
2762 		break;
2763 	}
2764 	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2765 out:
2766 	return 0;
2767 }
2768 
2769 static const struct file_operations tcp_afinfo_seq_fops = {
2770 	.owner   = THIS_MODULE,
2771 	.open    = tcp_seq_open,
2772 	.read    = seq_read,
2773 	.llseek  = seq_lseek,
2774 	.release = seq_release_net
2775 };
2776 
2777 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2778 	.name		= "tcp",
2779 	.family		= AF_INET,
2780 	.seq_fops	= &tcp_afinfo_seq_fops,
2781 	.seq_ops	= {
2782 		.show		= tcp4_seq_show,
2783 	},
2784 };
2785 
2786 static int __net_init tcp4_proc_init_net(struct net *net)
2787 {
2788 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2789 }
2790 
2791 static void __net_exit tcp4_proc_exit_net(struct net *net)
2792 {
2793 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2794 }
2795 
2796 static struct pernet_operations tcp4_net_ops = {
2797 	.init = tcp4_proc_init_net,
2798 	.exit = tcp4_proc_exit_net,
2799 };
2800 
2801 int __init tcp4_proc_init(void)
2802 {
2803 	return register_pernet_subsys(&tcp4_net_ops);
2804 }
2805 
2806 void tcp4_proc_exit(void)
2807 {
2808 	unregister_pernet_subsys(&tcp4_net_ops);
2809 }
2810 #endif /* CONFIG_PROC_FS */
2811 
2812 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2813 {
2814 	const struct iphdr *iph = skb_gro_network_header(skb);
2815 	__wsum wsum;
2816 	__sum16 sum;
2817 
2818 	switch (skb->ip_summed) {
2819 	case CHECKSUM_COMPLETE:
2820 		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2821 				  skb->csum)) {
2822 			skb->ip_summed = CHECKSUM_UNNECESSARY;
2823 			break;
2824 		}
2825 flush:
2826 		NAPI_GRO_CB(skb)->flush = 1;
2827 		return NULL;
2828 
2829 	case CHECKSUM_NONE:
2830 		wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
2831 					  skb_gro_len(skb), IPPROTO_TCP, 0);
2832 		sum = csum_fold(skb_checksum(skb,
2833 					     skb_gro_offset(skb),
2834 					     skb_gro_len(skb),
2835 					     wsum));
2836 		if (sum)
2837 			goto flush;
2838 
2839 		skb->ip_summed = CHECKSUM_UNNECESSARY;
2840 		break;
2841 	}
2842 
2843 	return tcp_gro_receive(head, skb);
2844 }
2845 
2846 int tcp4_gro_complete(struct sk_buff *skb)
2847 {
2848 	const struct iphdr *iph = ip_hdr(skb);
2849 	struct tcphdr *th = tcp_hdr(skb);
2850 
2851 	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2852 				  iph->saddr, iph->daddr, 0);
2853 	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2854 
2855 	return tcp_gro_complete(skb);
2856 }
2857 
2858 struct proto tcp_prot = {
2859 	.name			= "TCP",
2860 	.owner			= THIS_MODULE,
2861 	.close			= tcp_close,
2862 	.connect		= tcp_v4_connect,
2863 	.disconnect		= tcp_disconnect,
2864 	.accept			= inet_csk_accept,
2865 	.ioctl			= tcp_ioctl,
2866 	.init			= tcp_v4_init_sock,
2867 	.destroy		= tcp_v4_destroy_sock,
2868 	.shutdown		= tcp_shutdown,
2869 	.setsockopt		= tcp_setsockopt,
2870 	.getsockopt		= tcp_getsockopt,
2871 	.recvmsg		= tcp_recvmsg,
2872 	.sendmsg		= tcp_sendmsg,
2873 	.sendpage		= tcp_sendpage,
2874 	.backlog_rcv		= tcp_v4_do_rcv,
2875 	.release_cb		= tcp_release_cb,
2876 	.mtu_reduced		= tcp_v4_mtu_reduced,
2877 	.hash			= inet_hash,
2878 	.unhash			= inet_unhash,
2879 	.get_port		= inet_csk_get_port,
2880 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2881 	.sockets_allocated	= &tcp_sockets_allocated,
2882 	.orphan_count		= &tcp_orphan_count,
2883 	.memory_allocated	= &tcp_memory_allocated,
2884 	.memory_pressure	= &tcp_memory_pressure,
2885 	.sysctl_wmem		= sysctl_tcp_wmem,
2886 	.sysctl_rmem		= sysctl_tcp_rmem,
2887 	.max_header		= MAX_TCP_HEADER,
2888 	.obj_size		= sizeof(struct tcp_sock),
2889 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2890 	.twsk_prot		= &tcp_timewait_sock_ops,
2891 	.rsk_prot		= &tcp_request_sock_ops,
2892 	.h.hashinfo		= &tcp_hashinfo,
2893 	.no_autobind		= true,
2894 #ifdef CONFIG_COMPAT
2895 	.compat_setsockopt	= compat_tcp_setsockopt,
2896 	.compat_getsockopt	= compat_tcp_getsockopt,
2897 #endif
2898 #ifdef CONFIG_MEMCG_KMEM
2899 	.init_cgroup		= tcp_init_cgroup,
2900 	.destroy_cgroup		= tcp_destroy_cgroup,
2901 	.proto_cgroup		= tcp_proto_cgroup,
2902 #endif
2903 	.diag_destroy		= tcp_abort,
2904 };
2905 EXPORT_SYMBOL(tcp_prot);
2906 
2907 static void __net_exit tcp_sk_exit(struct net *net)
2908 {
2909 	int cpu;
2910 
2911 	for_each_possible_cpu(cpu)
2912 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2913 	free_percpu(net->ipv4.tcp_sk);
2914 }
2915 
2916 static int __net_init tcp_sk_init(struct net *net)
2917 {
2918 	int res, cpu;
2919 
2920 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2921 	if (!net->ipv4.tcp_sk)
2922 		return -ENOMEM;
2923 
2924 	for_each_possible_cpu(cpu) {
2925 		struct sock *sk;
2926 
2927 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2928 					   IPPROTO_TCP, net);
2929 		if (res)
2930 			goto fail;
2931 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2932 	}
2933 	net->ipv4.sysctl_tcp_ecn = 2;
2934 	return 0;
2935 
2936 fail:
2937 	tcp_sk_exit(net);
2938 
2939 	return res;
2940 }
2941 
2942 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2943 {
2944 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2945 }
2946 
2947 static struct pernet_operations __net_initdata tcp_sk_ops = {
2948        .init	   = tcp_sk_init,
2949        .exit	   = tcp_sk_exit,
2950        .exit_batch = tcp_sk_exit_batch,
2951 };
2952 
2953 void __init tcp_v4_init(void)
2954 {
2955 	inet_hashinfo_init(&tcp_hashinfo);
2956 	if (register_pernet_subsys(&tcp_sk_ops))
2957 		panic("Failed to create the TCP control socket.\n");
2958 }
2959