1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * IPv4 specific functions
9 *
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24 /*
25 * Changes:
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an
34 * ACK bit.
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
37 * request_sock handling and moved
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
40 * Added new listen semantics.
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after year
45 * coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
51 */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
87
88 #ifdef CONFIG_TCP_MD5SIG
89 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
90 __be32 daddr, __be32 saddr, const struct tcphdr *th);
91 #endif
92
93 struct inet_hashinfo tcp_hashinfo;
94 EXPORT_SYMBOL(tcp_hashinfo);
95
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98 return secure_tcp_seq(ip_hdr(skb)->daddr,
99 ip_hdr(skb)->saddr,
100 tcp_hdr(skb)->dest,
101 tcp_hdr(skb)->source);
102 }
103
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112 struct tcp_sock *tp = tcp_sk(sk);
113
114 /* With PAWS, it is safe from the viewpoint
115 of data integrity. Even without PAWS it is safe provided sequence
116 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
117
118 Actually, the idea is close to VJ's: only the timestamp cache is
119 held not per host but per port pair, and the TW bucket is used as the
120 state holder.
121
122 If the TW bucket has already been destroyed, we fall back to VJ's scheme
123 and use the initial timestamp retrieved from the peer table.
124 */
125 if (tcptw->tw_ts_recent_stamp &&
126 (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
127 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
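/* Start the new incarnation's write sequence past the old connection's
 * send space (roughly one maximum window plus the SYN/FIN), so stray
 * old segments cannot be confused with new data.
 */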
128 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
129 if (tp->write_seq == 0)
130 tp->write_seq = 1;
131 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
132 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
133 sock_hold(sktw);
134 return 1;
135 }
136
137 return 0;
138 }
139 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
140
141 /* This will initiate an outgoing connection. */
142 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
143 {
144 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
145 struct inet_sock *inet = inet_sk(sk);
146 struct tcp_sock *tp = tcp_sk(sk);
147 __be16 orig_sport, orig_dport;
148 __be32 daddr, nexthop;
149 struct flowi4 *fl4;
150 struct rtable *rt;
151 int err;
152 struct ip_options_rcu *inet_opt;
153 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
154
155 if (addr_len < sizeof(struct sockaddr_in))
156 return -EINVAL;
157
158 if (usin->sin_family != AF_INET)
159 return -EAFNOSUPPORT;
160
161 nexthop = daddr = usin->sin_addr.s_addr;
162 inet_opt = rcu_dereference_protected(inet->inet_opt,
163 lockdep_sock_is_held(sk));
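/* With a source-route (SRR) option the first hop we actually route to
 * is the option's first-hop address, not the final destination.
 */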
164 if (inet_opt && inet_opt->opt.srr) {
165 if (!daddr)
166 return -EINVAL;
167 nexthop = inet_opt->opt.faddr;
168 }
169
170 orig_sport = inet->inet_sport;
171 orig_dport = usin->sin_port;
172 fl4 = &inet->cork.fl.u.ip4;
173 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
174 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
175 IPPROTO_TCP,
176 orig_sport, orig_dport, sk);
177 if (IS_ERR(rt)) {
178 err = PTR_ERR(rt);
179 if (err == -ENETUNREACH)
180 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
181 return err;
182 }
183
184 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
185 ip_rt_put(rt);
186 return -ENETUNREACH;
187 }
188
189 if (!inet_opt || !inet_opt->opt.srr)
190 daddr = fl4->daddr;
191
192 if (!inet->inet_saddr)
193 inet->inet_saddr = fl4->saddr;
194 sk_rcv_saddr_set(sk, inet->inet_saddr);
195
196 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
197 /* Reset inherited state */
198 tp->rx_opt.ts_recent = 0;
199 tp->rx_opt.ts_recent_stamp = 0;
200 if (likely(!tp->repair))
201 tp->write_seq = 0;
202 }
203
204 inet->inet_dport = usin->sin_port;
205 sk_daddr_set(sk, daddr);
206
207 inet_csk(sk)->icsk_ext_hdr_len = 0;
208 if (inet_opt)
209 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
210
211 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
212
213 /* Socket identity is still unknown (sport may be zero).
214 * However we set the state to SYN-SENT and, without releasing the socket
215 * lock, select a source port, enter ourselves into the hash tables and
216 * complete initialization after this.
217 */
218 tcp_set_state(sk, TCP_SYN_SENT);
219 err = inet_hash_connect(tcp_death_row, sk);
220 if (err)
221 goto failure;
222
223 sk_set_txhash(sk);
224
225 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
226 inet->inet_sport, inet->inet_dport, sk);
227 if (IS_ERR(rt)) {
228 err = PTR_ERR(rt);
229 rt = NULL;
230 goto failure;
231 }
232 /* OK, now commit destination to socket. */
233 sk->sk_gso_type = SKB_GSO_TCPV4;
234 sk_setup_caps(sk, &rt->dst);
235 rt = NULL;
236
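/* Unless this is a repair (checkpoint/restore) socket, derive the
 * initial sequence number and timestamp offset from their secure
 * per-connection hashes.
 */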
237 if (likely(!tp->repair)) {
238 if (!tp->write_seq)
239 tp->write_seq = secure_tcp_seq(inet->inet_saddr,
240 inet->inet_daddr,
241 inet->inet_sport,
242 usin->sin_port);
243 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
244 inet->inet_saddr,
245 inet->inet_daddr);
246 }
247
248 inet->inet_id = prandom_u32();
249
250 if (tcp_fastopen_defer_connect(sk, &err))
251 return err;
252 if (err)
253 goto failure;
254
255 err = tcp_connect(sk);
256
257 if (err)
258 goto failure;
259
260 return 0;
261
262 failure:
263 /*
264 * This unhashes the socket and releases the local port,
265 * if necessary.
266 */
267 tcp_set_state(sk, TCP_CLOSE);
268 ip_rt_put(rt);
269 sk->sk_route_caps = 0;
270 inet->inet_dport = 0;
271 return err;
272 }
273 EXPORT_SYMBOL(tcp_v4_connect);
274
275 /*
276 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
277 * It can be called through tcp_release_cb() if socket was owned by user
278 * at the time tcp_v4_err() was called to handle ICMP message.
279 */
280 void tcp_v4_mtu_reduced(struct sock *sk)
281 {
282 struct inet_sock *inet = inet_sk(sk);
283 struct dst_entry *dst;
284 u32 mtu;
285
286 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
287 return;
288 mtu = tcp_sk(sk)->mtu_info;
289 dst = inet_csk_update_pmtu(sk, mtu);
290 if (!dst)
291 return;
292
293 /* Something is about to go wrong... Remember the soft error
294 * for the case that this connection will not be able to recover.
295 */
296 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
297 sk->sk_err_soft = EMSGSIZE;
298
299 mtu = dst_mtu(dst);
300
301 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
302 ip_sk_accept_pmtu(sk) &&
303 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
304 tcp_sync_mss(sk, mtu);
305
306 /* Resend the TCP packet because it's
307 * clear that the old packet has been
308 * dropped. This is the new "fast" path mtu
309 * discovery.
310 */
311 tcp_simple_retransmit(sk);
312 } /* else let the usual retransmit timer handle it */
313 }
314 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
315
316 static void do_redirect(struct sk_buff *skb, struct sock *sk)
317 {
318 struct dst_entry *dst = __sk_dst_check(sk, 0);
319
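/* __sk_dst_check() returns NULL if the cached route is obsolete;
 * only a still-valid dst is asked to handle the redirect.
 */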
320 if (dst)
321 dst->ops->redirect(dst, sk, skb);
322 }
323
324
325 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
326 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
327 {
328 struct request_sock *req = inet_reqsk(sk);
329 struct net *net = sock_net(sk);
330
331 /* ICMPs are not backlogged, hence we cannot get
332 * an established socket here.
333 */
334 if (seq != tcp_rsk(req)->snt_isn) {
335 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
336 } else if (abort) {
337 /*
338 * Still in SYN_RECV, just remove it silently.
339 * There is no good way to pass the error to the newly
340 * created socket, and POSIX does not want network
341 * errors returned from accept().
342 */
343 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
344 tcp_listendrop(req->rsk_listener);
345 }
346 reqsk_put(req);
347 }
348 EXPORT_SYMBOL(tcp_req_err);
349
350 /*
351 * This routine is called by the ICMP module when it gets some
352 * sort of error condition. If err < 0 then the socket should
353 * be closed and the error returned to the user. If err > 0
354 * it's just the icmp type << 8 | icmp code. After adjustment
355 * header points to the first 8 bytes of the tcp header. We need
356 * to find the appropriate port.
357 *
358 * The locking strategy used here is very "optimistic". When
359 * someone else accesses the socket the ICMP is just dropped
360 * and for some paths there is no check at all.
361 * A more general error queue to queue errors for later handling
362 * is probably better.
363 *
364 */
365
366 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
367 {
368 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
369 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
370 struct inet_connection_sock *icsk;
371 struct tcp_sock *tp;
372 struct inet_sock *inet;
373 const int type = icmp_hdr(icmp_skb)->type;
374 const int code = icmp_hdr(icmp_skb)->code;
375 struct sock *sk;
376 struct sk_buff *skb;
377 struct request_sock *fastopen;
378 u32 seq, snd_una;
379 s32 remaining;
380 u32 delta_us;
381 int err;
382 struct net *net = dev_net(icmp_skb->dev);
383
384 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
385 th->dest, iph->saddr, ntohs(th->source),
386 inet_iif(icmp_skb), 0);
387 if (!sk) {
388 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
389 return;
390 }
391 if (sk->sk_state == TCP_TIME_WAIT) {
392 inet_twsk_put(inet_twsk(sk));
393 return;
394 }
395 seq = ntohl(th->seq);
396 if (sk->sk_state == TCP_NEW_SYN_RECV)
397 return tcp_req_err(sk, seq,
398 type == ICMP_PARAMETERPROB ||
399 type == ICMP_TIME_EXCEEDED ||
400 (type == ICMP_DEST_UNREACH &&
401 (code == ICMP_NET_UNREACH ||
402 code == ICMP_HOST_UNREACH)));
403
404 bh_lock_sock(sk);
405 /* If too many ICMPs get dropped on busy
406 * servers this needs to be solved differently.
407 * We do take care of PMTU discovery (RFC1191) special case :
408 * we can receive locally generated ICMP messages while socket is held.
409 */
410 if (sock_owned_by_user(sk)) {
411 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
412 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
413 }
414 if (sk->sk_state == TCP_CLOSE)
415 goto out;
416
417 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
418 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
419 goto out;
420 }
421
422 icsk = inet_csk(sk);
423 tp = tcp_sk(sk);
424 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
425 fastopen = tp->fastopen_rsk;
426 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
427 if (sk->sk_state != TCP_LISTEN &&
428 !between(seq, snd_una, tp->snd_nxt)) {
429 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
430 goto out;
431 }
432
433 switch (type) {
434 case ICMP_REDIRECT:
435 if (!sock_owned_by_user(sk))
436 do_redirect(icmp_skb, sk);
437 goto out;
438 case ICMP_SOURCE_QUENCH:
439 /* Just silently ignore these. */
440 goto out;
441 case ICMP_PARAMETERPROB:
442 err = EPROTO;
443 break;
444 case ICMP_DEST_UNREACH:
445 if (code > NR_ICMP_UNREACH)
446 goto out;
447
448 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
449 /* We are not interested in TCP_LISTEN and open_requests
450 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
451 * they should go through unfragmented).
452 */
453 if (sk->sk_state == TCP_LISTEN)
454 goto out;
455
456 tp->mtu_info = info;
457 if (!sock_owned_by_user(sk)) {
458 tcp_v4_mtu_reduced(sk);
459 } else {
460 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
461 sock_hold(sk);
462 }
463 goto out;
464 }
465
466 err = icmp_err_convert[code].errno;
467 /* check if icmp_skb allows revert of backoff
468 * (see draft-zimmermann-tcp-lcd) */
469 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
470 break;
471 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
472 !icsk->icsk_backoff || fastopen)
473 break;
474
475 if (sock_owned_by_user(sk))
476 break;
477
478 skb = tcp_write_queue_head(sk);
479 if (WARN_ON_ONCE(!skb))
480 break;
481
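/* The unreachable error hints at a routing problem rather than
 * congestion (see draft-zimmermann-tcp-lcd): undo one backoff step
 * and recompute the RTO from the current srtt.
 */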
482 icsk->icsk_backoff--;
483 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
484 TCP_TIMEOUT_INIT;
485 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
486
487 tcp_mstamp_refresh(tp);
488 delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
489 remaining = icsk->icsk_rto -
490 usecs_to_jiffies(delta_us);
491
492 if (remaining > 0) {
493 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
494 remaining, TCP_RTO_MAX);
495 } else {
496 /* RTO revert clocked out retransmission.
497 * Will retransmit now */
498 tcp_retransmit_timer(sk);
499 }
500
501 break;
502 case ICMP_TIME_EXCEEDED:
503 err = EHOSTUNREACH;
504 break;
505 default:
506 goto out;
507 }
508
509 switch (sk->sk_state) {
510 case TCP_SYN_SENT:
511 case TCP_SYN_RECV:
512 /* Only in fast or simultaneous open. If a fast open socket is
513 * already accepted it is treated as a connected one below.
514 */
515 if (fastopen && !fastopen->sk)
516 break;
517
518 if (!sock_owned_by_user(sk)) {
519 sk->sk_err = err;
520
521 sk->sk_error_report(sk);
522
523 tcp_done(sk);
524 } else {
525 sk->sk_err_soft = err;
526 }
527 goto out;
528 }
529
530 /* If we've already connected we will keep trying
531 * until we time out, or the user gives up.
532 *
533 * rfc1122 4.2.3.9 allows to consider as hard errors
534 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
535 * but it is obsoleted by pmtu discovery).
536 *
537 * Note that in the modern internet, where routing is unreliable
538 * and broken firewalls sit in every dark corner sending random
539 * errors ordered by their masters, even these two messages finally lose
540 * their original sense (even Linux sends invalid PORT_UNREACHs)
541 *
542 * Now we are in compliance with RFCs.
543 * --ANK (980905)
544 */
545
546 inet = inet_sk(sk);
547 if (!sock_owned_by_user(sk) && inet->recverr) {
548 sk->sk_err = err;
549 sk->sk_error_report(sk);
550 } else { /* Only an error on timeout */
551 sk->sk_err_soft = err;
552 }
553
554 out:
555 bh_unlock_sock(sk);
556 sock_put(sk);
557 }
558
559 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
560 {
561 struct tcphdr *th = tcp_hdr(skb);
562
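/* With CHECKSUM_PARTIAL the device completes the checksum: store only
 * the pseudo-header sum and record where the result must be written.
 * Otherwise fold the full checksum in software right here.
 */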
563 if (skb->ip_summed == CHECKSUM_PARTIAL) {
564 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
565 skb->csum_start = skb_transport_header(skb) - skb->head;
566 skb->csum_offset = offsetof(struct tcphdr, check);
567 } else {
568 th->check = tcp_v4_check(skb->len, saddr, daddr,
569 csum_partial(th,
570 th->doff << 2,
571 skb->csum));
572 }
573 }
574
575 /* This routine computes an IPv4 TCP checksum. */
576 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
577 {
578 const struct inet_sock *inet = inet_sk(sk);
579
580 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
581 }
582 EXPORT_SYMBOL(tcp_v4_send_check);
583
584 /*
585 * This routine will send an RST to the other tcp.
586 *
587 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
588 * for the reset?
589 * Answer: if a packet caused an RST, it is not for a socket
590 * existing in our system; if it is matched to a socket,
591 * it is just a duplicate segment or a bug in the other side's TCP.
592 * So we build the reply based only on the parameters
593 * that arrived with the segment.
594 * Exception: precedence violation. We do not implement it in any case.
595 */
596
597 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
598 {
599 const struct tcphdr *th = tcp_hdr(skb);
600 struct {
601 struct tcphdr th;
602 #ifdef CONFIG_TCP_MD5SIG
603 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
604 #endif
605 } rep;
606 struct ip_reply_arg arg;
607 #ifdef CONFIG_TCP_MD5SIG
608 struct tcp_md5sig_key *key = NULL;
609 const __u8 *hash_location = NULL;
610 unsigned char newhash[16];
611 int genhash;
612 struct sock *sk1 = NULL;
613 #endif
614 struct net *net;
615
616 /* Never send a reset in response to a reset. */
617 if (th->rst)
618 return;
619
620 /* If sk not NULL, it means we did a successful lookup and incoming
621 * route had to be correct. prequeue might have dropped our dst.
622 */
623 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
624 return;
625
626 /* Swap the send and the receive. */
627 memset(&rep, 0, sizeof(rep));
628 rep.th.dest = th->source;
629 rep.th.source = th->dest;
630 rep.th.doff = sizeof(struct tcphdr) / 4;
631 rep.th.rst = 1;
632
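/* Per RFC 793: if the offending segment carried an ACK, the RST uses
 * that ACK value as its sequence number; otherwise send SEQ=0 with an
 * ACK acknowledging everything the segment occupied.
 */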
633 if (th->ack) {
634 rep.th.seq = th->ack_seq;
635 } else {
636 rep.th.ack = 1;
637 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
638 skb->len - (th->doff << 2));
639 }
640
641 memset(&arg, 0, sizeof(arg));
642 arg.iov[0].iov_base = (unsigned char *)&rep;
643 arg.iov[0].iov_len = sizeof(rep.th);
644
645 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
646 #ifdef CONFIG_TCP_MD5SIG
647 rcu_read_lock();
648 hash_location = tcp_parse_md5sig_option(th);
649 if (sk && sk_fullsock(sk)) {
650 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
651 &ip_hdr(skb)->saddr, AF_INET);
652 } else if (hash_location) {
653 /*
654 * The active side is lost. Try to find the listening socket through
655 * the source port, and then find the md5 key through the listening socket.
656 * We do not lose security here:
657 * the incoming packet is checked against the md5 hash of the found key;
658 * no RST is generated if the md5 hash doesn't match.
659 */
660 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
661 ip_hdr(skb)->saddr,
662 th->source, ip_hdr(skb)->daddr,
663 ntohs(th->source), inet_iif(skb),
664 tcp_v4_sdif(skb));
665 /* don't send rst if it can't find key */
666 if (!sk1)
667 goto out;
668
669 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
670 &ip_hdr(skb)->saddr, AF_INET);
671 if (!key)
672 goto out;
673
674
675 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
676 if (genhash || memcmp(hash_location, newhash, 16) != 0)
677 goto out;
678
679 }
680
681 if (key) {
682 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
683 (TCPOPT_NOP << 16) |
684 (TCPOPT_MD5SIG << 8) |
685 TCPOLEN_MD5SIG);
686 /* Update length and the length the header thinks exists */
687 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
688 rep.th.doff = arg.iov[0].iov_len / 4;
689
690 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
691 key, ip_hdr(skb)->saddr,
692 ip_hdr(skb)->daddr, &rep.th);
693 }
694 #endif
695 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
696 ip_hdr(skb)->saddr, /* XXX */
697 arg.iov[0].iov_len, IPPROTO_TCP, 0);
698 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
699 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
700
701 /* When the socket is gone, all binding information is lost, and
702 * routing might fail in this case. No choice here: if we chose to force the
703 * input interface, we would misroute in the case of an asymmetric route.
704 */
705 if (sk)
706 arg.bound_dev_if = sk->sk_bound_dev_if;
707
708 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
709 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
710
711 arg.tos = ip_hdr(skb)->tos;
712 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
713 local_bh_disable();
714 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
715 skb, &TCP_SKB_CB(skb)->header.h4.opt,
716 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
717 &arg, arg.iov[0].iov_len);
718
719 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
720 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
721 local_bh_enable();
722
723 #ifdef CONFIG_TCP_MD5SIG
724 out:
725 rcu_read_unlock();
726 #endif
727 }
728
729 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
730 outside of socket context, is certainly ugly. What can I do?
731 */
732
733 static void tcp_v4_send_ack(const struct sock *sk,
734 struct sk_buff *skb, u32 seq, u32 ack,
735 u32 win, u32 tsval, u32 tsecr, int oif,
736 struct tcp_md5sig_key *key,
737 int reply_flags, u8 tos)
738 {
739 const struct tcphdr *th = tcp_hdr(skb);
740 struct {
741 struct tcphdr th;
742 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
743 #ifdef CONFIG_TCP_MD5SIG
744 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
745 #endif
746 ];
747 } rep;
748 struct net *net = sock_net(sk);
749 struct ip_reply_arg arg;
750
751 memset(&rep.th, 0, sizeof(struct tcphdr));
752 memset(&arg, 0, sizeof(arg));
753
754 arg.iov[0].iov_base = (unsigned char *)&rep;
755 arg.iov[0].iov_len = sizeof(rep.th);
756 if (tsecr) {
757 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
758 (TCPOPT_TIMESTAMP << 8) |
759 TCPOLEN_TIMESTAMP);
760 rep.opt[1] = htonl(tsval);
761 rep.opt[2] = htonl(tsecr);
762 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
763 }
764
765 /* Swap the send and the receive. */
766 rep.th.dest = th->source;
767 rep.th.source = th->dest;
768 rep.th.doff = arg.iov[0].iov_len / 4;
769 rep.th.seq = htonl(seq);
770 rep.th.ack_seq = htonl(ack);
771 rep.th.ack = 1;
772 rep.th.window = htons(win);
773
774 #ifdef CONFIG_TCP_MD5SIG
775 if (key) {
776 int offset = (tsecr) ? 3 : 0;
777
778 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
779 (TCPOPT_NOP << 16) |
780 (TCPOPT_MD5SIG << 8) |
781 TCPOLEN_MD5SIG);
782 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
783 rep.th.doff = arg.iov[0].iov_len/4;
784
785 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
786 key, ip_hdr(skb)->saddr,
787 ip_hdr(skb)->daddr, &rep.th);
788 }
789 #endif
790 arg.flags = reply_flags;
791 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
792 ip_hdr(skb)->saddr, /* XXX */
793 arg.iov[0].iov_len, IPPROTO_TCP, 0);
794 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
795 if (oif)
796 arg.bound_dev_if = oif;
797 arg.tos = tos;
798 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
799 local_bh_disable();
800 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
801 skb, &TCP_SKB_CB(skb)->header.h4.opt,
802 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
803 &arg, arg.iov[0].iov_len);
804
805 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
806 local_bh_enable();
807 }
808
809 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
810 {
811 struct inet_timewait_sock *tw = inet_twsk(sk);
812 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
813
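/* Build the ACK purely from TIME-WAIT state: the last snd_nxt/rcv_nxt,
 * the scaled receive window and the saved timestamps.
 */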
814 tcp_v4_send_ack(sk, skb,
815 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
816 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
817 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
818 tcptw->tw_ts_recent,
819 tw->tw_bound_dev_if,
820 tcp_twsk_md5_key(tcptw),
821 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
822 tw->tw_tos
823 );
824
825 inet_twsk_put(tw);
826 }
827
828 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
829 struct request_sock *req)
830 {
831 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
832 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
833 */
834 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
835 tcp_sk(sk)->snd_nxt;
836
837 /* RFC 7323 2.3
838 * The window field (SEG.WND) of every outgoing segment, with the
839 * exception of <SYN> segments, MUST be right-shifted by
840 * Rcv.Wind.Shift bits:
841 */
842 tcp_v4_send_ack(sk, skb, seq,
843 tcp_rsk(req)->rcv_nxt,
844 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
845 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
846 req->ts_recent,
847 0,
848 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
849 AF_INET),
850 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
851 ip_hdr(skb)->tos);
852 }
853
854 /*
855 * Send a SYN-ACK after having received a SYN.
856 * This still operates on a request_sock only, not on a big
857 * socket.
858 */
859 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
860 struct flowi *fl,
861 struct request_sock *req,
862 struct tcp_fastopen_cookie *foc,
863 enum tcp_synack_type synack_type)
864 {
865 const struct inet_request_sock *ireq = inet_rsk(req);
866 struct flowi4 fl4;
867 int err = -1;
868 struct sk_buff *skb;
869
870 /* First, grab a route. */
871 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
872 return -1;
873
874 skb = tcp_make_synack(sk, dst, req, foc, synack_type);
875
876 if (skb) {
877 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
878
879 rcu_read_lock();
880 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
881 ireq->ir_rmt_addr,
882 rcu_dereference(ireq->ireq_opt));
883 rcu_read_unlock();
884 err = net_xmit_eval(err);
885 }
886
887 return err;
888 }
889
890 /*
891 * IPv4 request_sock destructor.
892 */
893 static void tcp_v4_reqsk_destructor(struct request_sock *req)
894 {
895 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
896 }
897
898 #ifdef CONFIG_TCP_MD5SIG
899 /*
900 * RFC2385 MD5 checksumming requires a mapping of
901 * IP address->MD5 Key.
902 * We need to maintain these in the sk structure.
903 */
904
905 /* Find the Key structure for an address. */
906 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
907 const union tcp_md5_addr *addr,
908 int family)
909 {
910 const struct tcp_sock *tp = tcp_sk(sk);
911 struct tcp_md5sig_key *key;
912 const struct tcp_md5sig_info *md5sig;
913 __be32 mask;
914 struct tcp_md5sig_key *best_match = NULL;
915 bool match;
916
917 /* caller either holds rcu_read_lock() or socket lock */
918 md5sig = rcu_dereference_check(tp->md5sig_info,
919 lockdep_sock_is_held(sk));
920 if (!md5sig)
921 return NULL;
922
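/* Walk every configured key and keep the most specific match
 * (longest prefix) for this address family.
 */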
923 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
924 if (key->family != family)
925 continue;
926
927 if (family == AF_INET) {
928 mask = inet_make_mask(key->prefixlen);
929 match = (key->addr.a4.s_addr & mask) ==
930 (addr->a4.s_addr & mask);
931 #if IS_ENABLED(CONFIG_IPV6)
932 } else if (family == AF_INET6) {
933 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
934 key->prefixlen);
935 #endif
936 } else {
937 match = false;
938 }
939
940 if (match && (!best_match ||
941 key->prefixlen > best_match->prefixlen))
942 best_match = key;
943 }
944 return best_match;
945 }
946 EXPORT_SYMBOL(tcp_md5_do_lookup);
947
948 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
949 const union tcp_md5_addr *addr,
950 int family, u8 prefixlen)
951 {
952 const struct tcp_sock *tp = tcp_sk(sk);
953 struct tcp_md5sig_key *key;
954 unsigned int size = sizeof(struct in_addr);
955 const struct tcp_md5sig_info *md5sig;
956
957 /* caller either holds rcu_read_lock() or socket lock */
958 md5sig = rcu_dereference_check(tp->md5sig_info,
959 lockdep_sock_is_held(sk));
960 if (!md5sig)
961 return NULL;
962 #if IS_ENABLED(CONFIG_IPV6)
963 if (family == AF_INET6)
964 size = sizeof(struct in6_addr);
965 #endif
966 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
967 if (key->family != family)
968 continue;
969 if (!memcmp(&key->addr, addr, size) &&
970 key->prefixlen == prefixlen)
971 return key;
972 }
973 return NULL;
974 }
975
976 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
977 const struct sock *addr_sk)
978 {
979 const union tcp_md5_addr *addr;
980
981 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
982 return tcp_md5_do_lookup(sk, addr, AF_INET);
983 }
984 EXPORT_SYMBOL(tcp_v4_md5_lookup);
985
986 /* This can be called on a newly created socket, from other files */
987 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
988 int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
989 gfp_t gfp)
990 {
991 /* Add Key to the list */
992 struct tcp_md5sig_key *key;
993 struct tcp_sock *tp = tcp_sk(sk);
994 struct tcp_md5sig_info *md5sig;
995
996 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
997 if (key) {
998 /* Pre-existing entry - just update that one. */
999 memcpy(key->key, newkey, newkeylen);
1000 key->keylen = newkeylen;
1001 return 0;
1002 }
1003
1004 md5sig = rcu_dereference_protected(tp->md5sig_info,
1005 lockdep_sock_is_held(sk));
1006 if (!md5sig) {
1007 md5sig = kmalloc(sizeof(*md5sig), gfp);
1008 if (!md5sig)
1009 return -ENOMEM;
1010
1011 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1012 INIT_HLIST_HEAD(&md5sig->head);
1013 rcu_assign_pointer(tp->md5sig_info, md5sig);
1014 }
1015
1016 key = sock_kmalloc(sk, sizeof(*key), gfp);
1017 if (!key)
1018 return -ENOMEM;
1019 if (!tcp_alloc_md5sig_pool()) {
1020 sock_kfree_s(sk, key, sizeof(*key));
1021 return -ENOMEM;
1022 }
1023
1024 memcpy(key->key, newkey, newkeylen);
1025 key->keylen = newkeylen;
1026 key->family = family;
1027 key->prefixlen = prefixlen;
1028 memcpy(&key->addr, addr,
1029 (family == AF_INET6) ? sizeof(struct in6_addr) :
1030 sizeof(struct in_addr));
1031 hlist_add_head_rcu(&key->node, &md5sig->head);
1032 return 0;
1033 }
1034 EXPORT_SYMBOL(tcp_md5_do_add);
1035
1036 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1037 u8 prefixlen)
1038 {
1039 struct tcp_md5sig_key *key;
1040
1041 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1042 if (!key)
1043 return -ENOENT;
1044 hlist_del_rcu(&key->node);
1045 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1046 kfree_rcu(key, rcu);
1047 return 0;
1048 }
1049 EXPORT_SYMBOL(tcp_md5_do_del);
1050
1051 static void tcp_clear_md5_list(struct sock *sk)
1052 {
1053 struct tcp_sock *tp = tcp_sk(sk);
1054 struct tcp_md5sig_key *key;
1055 struct hlist_node *n;
1056 struct tcp_md5sig_info *md5sig;
1057
1058 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1059
1060 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1061 hlist_del_rcu(&key->node);
1062 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1063 kfree_rcu(key, rcu);
1064 }
1065 }
1066
1067 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1068 char __user *optval, int optlen)
1069 {
1070 struct tcp_md5sig cmd;
1071 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1072 u8 prefixlen = 32;
1073
1074 if (optlen < sizeof(cmd))
1075 return -EINVAL;
1076
1077 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1078 return -EFAULT;
1079
1080 if (sin->sin_family != AF_INET)
1081 return -EINVAL;
1082
1083 if (optname == TCP_MD5SIG_EXT &&
1084 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1085 prefixlen = cmd.tcpm_prefixlen;
1086 if (prefixlen > 32)
1087 return -EINVAL;
1088 }
1089
1090 if (!cmd.tcpm_keylen)
1091 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1092 AF_INET, prefixlen);
1093
1094 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1095 return -EINVAL;
1096
1097 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1098 AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1099 GFP_KERNEL);
1100 }
1101
1102 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1103 __be32 daddr, __be32 saddr,
1104 const struct tcphdr *th, int nbytes)
1105 {
1106 struct tcp4_pseudohdr *bp;
1107 struct scatterlist sg;
1108 struct tcphdr *_th;
1109
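/* Feed the MD5 transform the RFC 2385 pseudo-header (addresses,
 * protocol, length) followed by a copy of the TCP header with its
 * checksum field zeroed.
 */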
1110 bp = hp->scratch;
1111 bp->saddr = saddr;
1112 bp->daddr = daddr;
1113 bp->pad = 0;
1114 bp->protocol = IPPROTO_TCP;
1115 bp->len = cpu_to_be16(nbytes);
1116
1117 _th = (struct tcphdr *)(bp + 1);
1118 memcpy(_th, th, sizeof(*th));
1119 _th->check = 0;
1120
1121 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1122 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1123 sizeof(*bp) + sizeof(*th));
1124 return crypto_ahash_update(hp->md5_req);
1125 }
1126
1127 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1128 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1129 {
1130 struct tcp_md5sig_pool *hp;
1131 struct ahash_request *req;
1132
1133 hp = tcp_get_md5sig_pool();
1134 if (!hp)
1135 goto clear_hash_noput;
1136 req = hp->md5_req;
1137
1138 if (crypto_ahash_init(req))
1139 goto clear_hash;
1140 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1141 goto clear_hash;
1142 if (tcp_md5_hash_key(hp, key))
1143 goto clear_hash;
1144 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1145 if (crypto_ahash_final(req))
1146 goto clear_hash;
1147
1148 tcp_put_md5sig_pool();
1149 return 0;
1150
1151 clear_hash:
1152 tcp_put_md5sig_pool();
1153 clear_hash_noput:
1154 memset(md5_hash, 0, 16);
1155 return 1;
1156 }
1157
1158 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1159 const struct sock *sk,
1160 const struct sk_buff *skb)
1161 {
1162 struct tcp_md5sig_pool *hp;
1163 struct ahash_request *req;
1164 const struct tcphdr *th = tcp_hdr(skb);
1165 __be32 saddr, daddr;
1166
1167 if (sk) { /* valid for establish/request sockets */
1168 saddr = sk->sk_rcv_saddr;
1169 daddr = sk->sk_daddr;
1170 } else {
1171 const struct iphdr *iph = ip_hdr(skb);
1172 saddr = iph->saddr;
1173 daddr = iph->daddr;
1174 }
1175
1176 hp = tcp_get_md5sig_pool();
1177 if (!hp)
1178 goto clear_hash_noput;
1179 req = hp->md5_req;
1180
1181 if (crypto_ahash_init(req))
1182 goto clear_hash;
1183
1184 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1185 goto clear_hash;
1186 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1187 goto clear_hash;
1188 if (tcp_md5_hash_key(hp, key))
1189 goto clear_hash;
1190 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1191 if (crypto_ahash_final(req))
1192 goto clear_hash;
1193
1194 tcp_put_md5sig_pool();
1195 return 0;
1196
1197 clear_hash:
1198 tcp_put_md5sig_pool();
1199 clear_hash_noput:
1200 memset(md5_hash, 0, 16);
1201 return 1;
1202 }
1203 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1204
1205 #endif
1206
1207 /* Called with rcu_read_lock() */
1208 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1209 const struct sk_buff *skb)
1210 {
1211 #ifdef CONFIG_TCP_MD5SIG
1212 /*
1213 * This gets called for each TCP segment that arrives
1214 * so we want to be efficient.
1215 * We have 3 drop cases:
1216 * o No MD5 hash and one expected.
1217 * o MD5 hash and we're not expecting one.
1218 * o MD5 hash and it's wrong.
1219 */
1220 const __u8 *hash_location = NULL;
1221 struct tcp_md5sig_key *hash_expected;
1222 const struct iphdr *iph = ip_hdr(skb);
1223 const struct tcphdr *th = tcp_hdr(skb);
1224 int genhash;
1225 unsigned char newhash[16];
1226
1227 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1228 AF_INET);
1229 hash_location = tcp_parse_md5sig_option(th);
1230
1231 /* We've parsed the options - do we have a hash? */
1232 if (!hash_expected && !hash_location)
1233 return false;
1234
1235 if (hash_expected && !hash_location) {
1236 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1237 return true;
1238 }
1239
1240 if (!hash_expected && hash_location) {
1241 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1242 return true;
1243 }
1244
1245 /* Okay, so this is hash_expected and hash_location -
1246 * so we need to calculate the checksum.
1247 */
1248 genhash = tcp_v4_md5_hash_skb(newhash,
1249 hash_expected,
1250 NULL, skb);
1251
1252 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1253 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1254 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1255 &iph->saddr, ntohs(th->source),
1256 &iph->daddr, ntohs(th->dest),
1257 genhash ? " tcp_v4_calc_md5_hash failed"
1258 : "");
1259 return true;
1260 }
1261 return false;
1262 #endif
1263 return false;
1264 }
1265
1266 static void tcp_v4_init_req(struct request_sock *req,
1267 const struct sock *sk_listener,
1268 struct sk_buff *skb)
1269 {
1270 struct inet_request_sock *ireq = inet_rsk(req);
1271 struct net *net = sock_net(sk_listener);
1272
1273 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1274 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1275 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1276 }
1277
1278 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1279 struct flowi *fl,
1280 const struct request_sock *req)
1281 {
1282 return inet_csk_route_req(sk, &fl->u.ip4, req);
1283 }
1284
1285 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1286 .family = PF_INET,
1287 .obj_size = sizeof(struct tcp_request_sock),
1288 .rtx_syn_ack = tcp_rtx_synack,
1289 .send_ack = tcp_v4_reqsk_send_ack,
1290 .destructor = tcp_v4_reqsk_destructor,
1291 .send_reset = tcp_v4_send_reset,
1292 .syn_ack_timeout = tcp_syn_ack_timeout,
1293 };
1294
1295 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1296 .mss_clamp = TCP_MSS_DEFAULT,
1297 #ifdef CONFIG_TCP_MD5SIG
1298 .req_md5_lookup = tcp_v4_md5_lookup,
1299 .calc_md5_hash = tcp_v4_md5_hash_skb,
1300 #endif
1301 .init_req = tcp_v4_init_req,
1302 #ifdef CONFIG_SYN_COOKIES
1303 .cookie_init_seq = cookie_v4_init_sequence,
1304 #endif
1305 .route_req = tcp_v4_route_req,
1306 .init_seq = tcp_v4_init_seq,
1307 .init_ts_off = tcp_v4_init_ts_off,
1308 .send_synack = tcp_v4_send_synack,
1309 };
1310
1311 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1312 {
1313 /* Never answer SYNs sent to broadcast or multicast */
1314 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1315 goto drop;
1316
1317 return tcp_conn_request(&tcp_request_sock_ops,
1318 &tcp_request_sock_ipv4_ops, sk, skb);
1319
1320 drop:
1321 tcp_listendrop(sk);
1322 return 0;
1323 }
1324 EXPORT_SYMBOL(tcp_v4_conn_request);
1325
1326
1327 /*
1328 * The three way handshake has completed - we got a valid synack -
1329 * now create the new socket.
1330 */
1331 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1332 struct request_sock *req,
1333 struct dst_entry *dst,
1334 struct request_sock *req_unhash,
1335 bool *own_req)
1336 {
1337 struct inet_request_sock *ireq;
1338 struct inet_sock *newinet;
1339 struct tcp_sock *newtp;
1340 struct sock *newsk;
1341 #ifdef CONFIG_TCP_MD5SIG
1342 struct tcp_md5sig_key *key;
1343 #endif
1344 struct ip_options_rcu *inet_opt;
1345
1346 if (sk_acceptq_is_full(sk))
1347 goto exit_overflow;
1348
1349 newsk = tcp_create_openreq_child(sk, req, skb);
1350 if (!newsk)
1351 goto exit_nonewsk;
1352
1353 newsk->sk_gso_type = SKB_GSO_TCPV4;
1354 inet_sk_rx_dst_set(newsk, skb);
1355
1356 newtp = tcp_sk(newsk);
1357 newinet = inet_sk(newsk);
1358 ireq = inet_rsk(req);
1359 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1360 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1361 newsk->sk_bound_dev_if = ireq->ir_iif;
1362 newinet->inet_saddr = ireq->ir_loc_addr;
1363 inet_opt = rcu_dereference(ireq->ireq_opt);
1364 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1365 newinet->mc_index = inet_iif(skb);
1366 newinet->mc_ttl = ip_hdr(skb)->ttl;
1367 newinet->rcv_tos = ip_hdr(skb)->tos;
1368 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1369 if (inet_opt)
1370 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1371 newinet->inet_id = prandom_u32();
1372
1373 if (!dst) {
1374 dst = inet_csk_route_child_sock(sk, newsk, req);
1375 if (!dst)
1376 goto put_and_exit;
1377 } else {
1378 /* syncookie case : see end of cookie_v4_check() */
1379 }
1380 sk_setup_caps(newsk, dst);
1381
1382 tcp_ca_openreq_child(newsk, dst);
1383
1384 tcp_sync_mss(newsk, dst_mtu(dst));
1385 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1386
1387 tcp_initialize_rcv_mss(newsk);
1388
1389 #ifdef CONFIG_TCP_MD5SIG
1390 /* Copy over the MD5 key from the original socket */
1391 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1392 AF_INET);
1393 if (key) {
1394 /*
1395 * We're using one, so create a matching key
1396 * on the newsk structure. If we fail to get
1397 * memory, then we end up not copying the key
1398 * across. Shucks.
1399 */
1400 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1401 AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1402 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1403 }
1404 #endif
1405
1406 if (__inet_inherit_port(sk, newsk) < 0)
1407 goto put_and_exit;
1408 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1409 if (likely(*own_req)) {
1410 tcp_move_syn(newtp, req);
1411 ireq->ireq_opt = NULL;
1412 } else {
1413 newinet->inet_opt = NULL;
1414 }
1415 return newsk;
1416
1417 exit_overflow:
1418 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1419 exit_nonewsk:
1420 dst_release(dst);
1421 exit:
1422 tcp_listendrop(sk);
1423 return NULL;
1424 put_and_exit:
1425 newinet->inet_opt = NULL;
1426 inet_csk_prepare_forced_close(newsk);
1427 tcp_done(newsk);
1428 goto exit;
1429 }
1430 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1431
1432 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1433 {
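/* Without CONFIG_SYN_COOKIES this is a no-op. With it, a non-SYN
 * segment reaching a listener may be the ACK of a SYN cookie and is
 * validated by cookie_v4_check().
 */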
1434 #ifdef CONFIG_SYN_COOKIES
1435 const struct tcphdr *th = tcp_hdr(skb);
1436
1437 if (!th->syn)
1438 sk = cookie_v4_check(sk, skb);
1439 #endif
1440 return sk;
1441 }
1442
1443 /* The socket must have its spinlock held when we get
1444 * here, unless it is a TCP_LISTEN socket.
1445 *
1446 * We have a potential double-lock case here, so even when
1447 * doing backlog processing we use the BH locking scheme.
1448 * This is because we cannot sleep with the original spinlock
1449 * held.
1450 */
1451 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1452 {
1453 struct sock *rsk;
1454
1455 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1456 struct dst_entry *dst = sk->sk_rx_dst;
1457
1458 sock_rps_save_rxhash(sk, skb);
1459 sk_mark_napi_id(sk, skb);
1460 if (dst) {
1461 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1462 !dst->ops->check(dst, 0)) {
1463 dst_release(dst);
1464 sk->sk_rx_dst = NULL;
1465 }
1466 }
1467 tcp_rcv_established(sk, skb, tcp_hdr(skb));
1468 return 0;
1469 }
1470
1471 if (tcp_checksum_complete(skb))
1472 goto csum_err;
1473
1474 if (sk->sk_state == TCP_LISTEN) {
1475 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1476
1477 if (!nsk)
1478 goto discard;
1479 if (nsk != sk) {
1480 if (tcp_child_process(sk, nsk, skb)) {
1481 rsk = nsk;
1482 goto reset;
1483 }
1484 return 0;
1485 }
1486 } else
1487 sock_rps_save_rxhash(sk, skb);
1488
1489 if (tcp_rcv_state_process(sk, skb)) {
1490 rsk = sk;
1491 goto reset;
1492 }
1493 return 0;
1494
1495 reset:
1496 tcp_v4_send_reset(rsk, skb);
1497 discard:
1498 kfree_skb(skb);
1499 /* Be careful here. If this function gets more complicated and
1500 * gcc suffers from register pressure on the x86, sk (in %ebx)
1501 * might be destroyed here. This current version compiles correctly,
1502 * but you have been warned.
1503 */
1504 return 0;
1505
1506 csum_err:
1507 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1508 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1509 goto discard;
1510 }
1511 EXPORT_SYMBOL(tcp_v4_do_rcv);
1512
1513 int tcp_v4_early_demux(struct sk_buff *skb)
1514 {
1515 const struct iphdr *iph;
1516 const struct tcphdr *th;
1517 struct sock *sk;
1518
1519 if (skb->pkt_type != PACKET_HOST)
1520 return 0;
1521
1522 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1523 return 0;
1524
1525 iph = ip_hdr(skb);
1526 th = tcp_hdr(skb);
1527
1528 if (th->doff < sizeof(struct tcphdr) / 4)
1529 return 0;
1530
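/* Look up an established socket early so that its cached rx dst,
 * if still valid, can be attached to this packet.
 */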
1531 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1532 iph->saddr, th->source,
1533 iph->daddr, ntohs(th->dest),
1534 skb->skb_iif, inet_sdif(skb));
1535 if (sk) {
1536 skb->sk = sk;
1537 skb->destructor = sock_edemux;
1538 if (sk_fullsock(sk)) {
1539 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1540
1541 if (dst)
1542 dst = dst_check(dst, 0);
1543 if (dst &&
1544 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1545 skb_dst_set_noref(skb, dst);
1546 }
1547 }
1548 return 0;
1549 }
1550
1551 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1552 {
1553 u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1554
1555 /* Only the socket owner can try to collapse/prune rx queues
1556 * to reduce memory overhead, so add a little headroom here.
1557 * Few socket backlogs are likely to be concurrently non-empty.
1558 */
1559 limit += 64*1024;
1560
1561 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1562 * we can fix skb->truesize to its real value to avoid future drops.
1563 * This is valid because skb is not yet charged to the socket.
1564 * It has been noticed pure SACK packets were sometimes dropped
1565 * (if cooked by drivers without copybreak feature).
1566 */
1567 skb_condense(skb);
1568
1569 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1570 bh_unlock_sock(sk);
1571 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1572 return true;
1573 }
1574 return false;
1575 }
1576 EXPORT_SYMBOL(tcp_add_backlog);
1577
1578 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1579 {
1580 struct tcphdr *th = (struct tcphdr *)skb->data;
1581
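/* Run the attached socket filter; it may trim the skb, but never
 * below the TCP header itself (th->doff * 4 bytes).
 */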
1582 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1583 }
1584 EXPORT_SYMBOL(tcp_filter);
1585
1586 static void tcp_v4_restore_cb(struct sk_buff *skb)
1587 {
1588 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1589 sizeof(struct inet_skb_parm));
1590 }
1591
1592 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1593 const struct tcphdr *th)
1594 {
1595 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1596 * barrier() makes sure the compiler won't play fool^Waliasing games.
1597 */
1598 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1599 sizeof(struct inet_skb_parm));
1600 barrier();
1601
1602 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1603 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1604 skb->len - th->doff * 4);
1605 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1606 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1607 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1608 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1609 TCP_SKB_CB(skb)->sacked = 0;
1610 TCP_SKB_CB(skb)->has_rxtstamp =
1611 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1612 }
1613
1614 /*
1615 * From tcp_input.c
1616 */
1617
1618 int tcp_v4_rcv(struct sk_buff *skb)
1619 {
1620 struct net *net = dev_net(skb->dev);
1621 int sdif = inet_sdif(skb);
1622 const struct iphdr *iph;
1623 const struct tcphdr *th;
1624 bool refcounted;
1625 struct sock *sk;
1626 int ret;
1627
1628 if (skb->pkt_type != PACKET_HOST)
1629 goto discard_it;
1630
1631 /* Count it even if it's bad */
1632 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1633
1634 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1635 goto discard_it;
1636
1637 th = (const struct tcphdr *)skb->data;
1638
1639 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1640 goto bad_packet;
1641 if (!pskb_may_pull(skb, th->doff * 4))
1642 goto discard_it;
1643
1644 /* An explanation is required here, I think.
1645 * Packet length and doff are validated by header prediction,
1646 * provided case of th->doff==0 is eliminated.
1647 * So, we defer the checks. */
1648
1649 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1650 goto csum_error;
1651
1652 th = (const struct tcphdr *)skb->data;
1653 iph = ip_hdr(skb);
1654 lookup:
1655 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1656 th->dest, sdif, &refcounted);
1657 if (!sk)
1658 goto no_tcp_socket;
1659
1660 process:
1661 if (sk->sk_state == TCP_TIME_WAIT)
1662 goto do_time_wait;
1663
1664 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1665 struct request_sock *req = inet_reqsk(sk);
1666 struct sock *nsk;
1667
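/* sk is a request socket: validate MD5 and checksum against its
 * listener, then let tcp_check_req() finish the handshake and
 * create (or reject) the child socket.
 */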
1668 sk = req->rsk_listener;
1669 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1670 sk_drops_add(sk, skb);
1671 reqsk_put(req);
1672 goto discard_it;
1673 }
1674 if (tcp_checksum_complete(skb)) {
1675 reqsk_put(req);
1676 goto csum_error;
1677 }
1678 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1679 inet_csk_reqsk_queue_drop_and_put(sk, req);
1680 goto lookup;
1681 }
1682 /* We own a reference on the listener, increase it again
1683 * as we might lose it too soon.
1684 */
1685 sock_hold(sk);
1686 refcounted = true;
1687 nsk = NULL;
1688 if (!tcp_filter(sk, skb)) {
1689 th = (const struct tcphdr *)skb->data;
1690 iph = ip_hdr(skb);
1691 tcp_v4_fill_cb(skb, iph, th);
1692 nsk = tcp_check_req(sk, skb, req, false);
1693 }
1694 if (!nsk) {
1695 reqsk_put(req);
1696 goto discard_and_relse;
1697 }
1698 if (nsk == sk) {
1699 reqsk_put(req);
1700 tcp_v4_restore_cb(skb);
1701 } else if (tcp_child_process(sk, nsk, skb)) {
1702 tcp_v4_send_reset(nsk, skb);
1703 goto discard_and_relse;
1704 } else {
1705 sock_put(sk);
1706 return 0;
1707 }
1708 }
1709 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1710 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1711 goto discard_and_relse;
1712 }
1713
1714 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1715 goto discard_and_relse;
1716
1717 if (tcp_v4_inbound_md5_hash(sk, skb))
1718 goto discard_and_relse;
1719
1720 nf_reset(skb);
1721
1722 if (tcp_filter(sk, skb))
1723 goto discard_and_relse;
1724 th = (const struct tcphdr *)skb->data;
1725 iph = ip_hdr(skb);
1726 tcp_v4_fill_cb(skb, iph, th);
1727
1728 skb->dev = NULL;
1729
1730 if (sk->sk_state == TCP_LISTEN) {
1731 ret = tcp_v4_do_rcv(sk, skb);
1732 goto put_and_return;
1733 }
1734
1735 sk_incoming_cpu_update(sk);
1736
1737 bh_lock_sock_nested(sk);
1738 tcp_segs_in(tcp_sk(sk), skb);
1739 ret = 0;
1740 if (!sock_owned_by_user(sk)) {
1741 ret = tcp_v4_do_rcv(sk, skb);
1742 } else if (tcp_add_backlog(sk, skb)) {
1743 goto discard_and_relse;
1744 }
1745 bh_unlock_sock(sk);
1746
1747 put_and_return:
1748 if (refcounted)
1749 sock_put(sk);
1750
1751 return ret;
1752
1753 no_tcp_socket:
1754 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1755 goto discard_it;
1756
1757 tcp_v4_fill_cb(skb, iph, th);
1758
1759 if (tcp_checksum_complete(skb)) {
1760 csum_error:
1761 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1762 bad_packet:
1763 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1764 } else {
1765 tcp_v4_send_reset(NULL, skb);
1766 }
1767
1768 discard_it:
1769 /* Discard frame. */
1770 kfree_skb(skb);
1771 return 0;
1772
1773 discard_and_relse:
1774 sk_drops_add(sk, skb);
1775 if (refcounted)
1776 sock_put(sk);
1777 goto discard_it;
1778
1779 do_time_wait:
1780 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1781 inet_twsk_put(inet_twsk(sk));
1782 goto discard_it;
1783 }
1784
1785 tcp_v4_fill_cb(skb, iph, th);
1786
1787 if (tcp_checksum_complete(skb)) {
1788 inet_twsk_put(inet_twsk(sk));
1789 goto csum_error;
1790 }
1791 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1792 case TCP_TW_SYN: {
1793 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1794 &tcp_hashinfo, skb,
1795 __tcp_hdrlen(th),
1796 iph->saddr, th->source,
1797 iph->daddr, th->dest,
1798 inet_iif(skb),
1799 sdif);
1800 if (sk2) {
1801 inet_twsk_deschedule_put(inet_twsk(sk));
1802 sk = sk2;
1803 tcp_v4_restore_cb(skb);
1804 refcounted = false;
1805 goto process;
1806 }
1807 /* Fall through to ACK */
1808 }
1809 case TCP_TW_ACK:
1810 tcp_v4_timewait_ack(sk, skb);
1811 break;
1812 case TCP_TW_RST:
1813 tcp_v4_send_reset(sk, skb);
1814 inet_twsk_deschedule_put(inet_twsk(sk));
1815 goto discard_it;
1816 case TCP_TW_SUCCESS:;
1817 }
1818 goto discard_it;
1819 }
1820
1821 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1822 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1823 .twsk_unique = tcp_twsk_unique,
1824 .twsk_destructor= tcp_twsk_destructor,
1825 };
1826
1827 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1828 {
1829 struct dst_entry *dst = skb_dst(skb);
1830
1831 if (dst && dst_hold_safe(dst)) {
1832 sk->sk_rx_dst = dst;
1833 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1834 }
1835 }
1836 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1837
1838 const struct inet_connection_sock_af_ops ipv4_specific = {
1839 .queue_xmit = ip_queue_xmit,
1840 .send_check = tcp_v4_send_check,
1841 .rebuild_header = inet_sk_rebuild_header,
1842 .sk_rx_dst_set = inet_sk_rx_dst_set,
1843 .conn_request = tcp_v4_conn_request,
1844 .syn_recv_sock = tcp_v4_syn_recv_sock,
1845 .net_header_len = sizeof(struct iphdr),
1846 .setsockopt = ip_setsockopt,
1847 .getsockopt = ip_getsockopt,
1848 .addr2sockaddr = inet_csk_addr2sockaddr,
1849 .sockaddr_len = sizeof(struct sockaddr_in),
1850 #ifdef CONFIG_COMPAT
1851 .compat_setsockopt = compat_ip_setsockopt,
1852 .compat_getsockopt = compat_ip_getsockopt,
1853 #endif
1854 .mtu_reduced = tcp_v4_mtu_reduced,
1855 };
1856 EXPORT_SYMBOL(ipv4_specific);
1857
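/* When CONFIG_TCP_MD5SIG is enabled, tcp_sock_ipv4_specific supplies the
 * IPv4 flavour of the TCP-MD5 (RFC 2385) operations: looking up the key for
 * a peer, computing the MD5 option over segments, and parsing the
 * TCP_MD5SIG socket option.
 */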
1858 #ifdef CONFIG_TCP_MD5SIG
1859 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1860 .md5_lookup = tcp_v4_md5_lookup,
1861 .calc_md5_hash = tcp_v4_md5_hash_skb,
1862 .md5_parse = tcp_v4_parse_md5_keys,
1863 };
1864 #endif
1865
1866 /* NOTE: A lot of things are set to zero explicitly by the call to
1867  * sk_alloc(), so they need not be done here.
1868  */
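/* Only the IPv4 operation tables are installed here; the protocol-generic
 * initialisation is done by tcp_init_sock().
 */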
1869 static int tcp_v4_init_sock(struct sock *sk)
1870 {
1871 struct inet_connection_sock *icsk = inet_csk(sk);
1872
1873 tcp_init_sock(sk);
1874
1875 icsk->icsk_af_ops = &ipv4_specific;
1876
1877 #ifdef CONFIG_TCP_MD5SIG
1878 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1879 #endif
1880
1881 return 0;
1882 }
1883
1884 void tcp_v4_destroy_sock(struct sock *sk)
1885 {
1886 struct tcp_sock *tp = tcp_sk(sk);
1887
1888 tcp_clear_xmit_timers(sk);
1889
1890 tcp_cleanup_congestion_control(sk);
1891
1892 tcp_cleanup_ulp(sk);
1893
1894 	/* Clean up the write buffer. */
1895 tcp_write_queue_purge(sk);
1896
1897 /* Check if we want to disable active TFO */
1898 tcp_fastopen_active_disable_ofo_check(sk);
1899
1900 /* Cleans up our, hopefully empty, out_of_order_queue. */
1901 skb_rbtree_purge(&tp->out_of_order_queue);
1902
1903 #ifdef CONFIG_TCP_MD5SIG
1904 /* Clean up the MD5 key list, if any */
1905 if (tp->md5sig_info) {
1906 tcp_clear_md5_list(sk);
1907 kfree_rcu(tp->md5sig_info, rcu);
1908 tp->md5sig_info = NULL;
1909 }
1910 #endif
1911
1912 /* Clean up a referenced TCP bind bucket. */
1913 if (inet_csk(sk)->icsk_bind_hash)
1914 inet_put_port(sk);
1915
1916 BUG_ON(tp->fastopen_rsk);
1917
1918 /* If socket is aborted during connect operation */
1919 tcp_free_fastopen_req(tp);
1920 tcp_saved_syn_free(tp);
1921
1922 sk_sockets_allocated_dec(sk);
1923 }
1924 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1925
1926 #ifdef CONFIG_PROC_FS
1927 /* Proc filesystem TCP sock list dumping. */
1928
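/* The /proc/net/tcp iterator walks the listening hash first and then the
 * established/timewait hash (TCP_SEQ_STATE_LISTENING followed by
 * TCP_SEQ_STATE_ESTABLISHED).  st->bucket is the current hash bucket,
 * st->offset the position within that bucket and st->num the running record
 * count; together with st->last_pos they let a read that continues where the
 * previous one stopped resume without rescanning both tables from the start.
 */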
1929 /*
1930  * Get the next listener socket following cur.  If cur is NULL, get the
1931  * first socket starting from the bucket given in st->bucket; when
1932  * st->bucket is zero, the very first socket in the hash table is returned.
1933  */
1934 static void *listening_get_next(struct seq_file *seq, void *cur)
1935 {
1936 struct tcp_iter_state *st = seq->private;
1937 struct net *net = seq_file_net(seq);
1938 struct inet_listen_hashbucket *ilb;
1939 struct hlist_nulls_node *node;
1940 struct sock *sk = cur;
1941
1942 if (!sk) {
1943 get_head:
1944 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1945 spin_lock(&ilb->lock);
1946 sk = sk_nulls_head(&ilb->nulls_head);
1947 st->offset = 0;
1948 goto get_sk;
1949 }
1950 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1951 ++st->num;
1952 ++st->offset;
1953
1954 sk = sk_nulls_next(sk);
1955 get_sk:
1956 sk_nulls_for_each_from(sk, node) {
1957 if (!net_eq(sock_net(sk), net))
1958 continue;
1959 if (sk->sk_family == st->family)
1960 return sk;
1961 }
1962 spin_unlock(&ilb->lock);
1963 st->offset = 0;
1964 if (++st->bucket < INET_LHTABLE_SIZE)
1965 goto get_head;
1966 return NULL;
1967 }
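/* Note that the ilb->lock taken at get_head above is deliberately left held
 * while a socket is returned to the caller; it is dropped only when the walk
 * moves past the bucket, or by tcp_seq_stop().
 */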
1968
1969 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1970 {
1971 struct tcp_iter_state *st = seq->private;
1972 void *rc;
1973
1974 st->bucket = 0;
1975 st->offset = 0;
1976 rc = listening_get_next(seq, NULL);
1977
1978 while (rc && *pos) {
1979 rc = listening_get_next(seq, rc);
1980 --*pos;
1981 }
1982 return rc;
1983 }
1984
1985 static inline bool empty_bucket(const struct tcp_iter_state *st)
1986 {
1987 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1988 }
1989
1990 /*
1991  * Get the first established socket, starting from the bucket given in st->bucket.
1992  * If st->bucket is zero, the very first socket in the hash table is returned.
1993  */
1994 static void *established_get_first(struct seq_file *seq)
1995 {
1996 struct tcp_iter_state *st = seq->private;
1997 struct net *net = seq_file_net(seq);
1998 void *rc = NULL;
1999
2000 st->offset = 0;
2001 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2002 struct sock *sk;
2003 struct hlist_nulls_node *node;
2004 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2005
2006 /* Lockless fast path for the common case of empty buckets */
2007 if (empty_bucket(st))
2008 continue;
2009
2010 spin_lock_bh(lock);
2011 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2012 if (sk->sk_family != st->family ||
2013 !net_eq(sock_net(sk), net)) {
2014 continue;
2015 }
2016 rc = sk;
2017 goto out;
2018 }
2019 spin_unlock_bh(lock);
2020 }
2021 out:
2022 return rc;
2023 }
2024
2025 static void *established_get_next(struct seq_file *seq, void *cur)
2026 {
2027 struct sock *sk = cur;
2028 struct hlist_nulls_node *node;
2029 struct tcp_iter_state *st = seq->private;
2030 struct net *net = seq_file_net(seq);
2031
2032 ++st->num;
2033 ++st->offset;
2034
2035 sk = sk_nulls_next(sk);
2036
2037 sk_nulls_for_each_from(sk, node) {
2038 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2039 return sk;
2040 }
2041
2042 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2043 ++st->bucket;
2044 return established_get_first(seq);
2045 }
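/* As in the listening walk, the ehash bucket lock taken in
 * established_get_first() remains held while entries from that bucket are
 * being returned; it is released above when moving on to the next bucket,
 * or by tcp_seq_stop().
 */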
2046
2047 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2048 {
2049 struct tcp_iter_state *st = seq->private;
2050 void *rc;
2051
2052 st->bucket = 0;
2053 rc = established_get_first(seq);
2054
2055 while (rc && pos) {
2056 rc = established_get_next(seq, rc);
2057 --pos;
2058 }
2059 return rc;
2060 }
2061
2062 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2063 {
2064 void *rc;
2065 struct tcp_iter_state *st = seq->private;
2066
2067 st->state = TCP_SEQ_STATE_LISTENING;
2068 rc = listening_get_idx(seq, &pos);
2069
2070 if (!rc) {
2071 st->state = TCP_SEQ_STATE_ESTABLISHED;
2072 rc = established_get_idx(seq, pos);
2073 }
2074
2075 return rc;
2076 }
2077
2078 static void *tcp_seek_last_pos(struct seq_file *seq)
2079 {
2080 struct tcp_iter_state *st = seq->private;
2081 int offset = st->offset;
2082 int orig_num = st->num;
2083 void *rc = NULL;
2084
2085 switch (st->state) {
2086 case TCP_SEQ_STATE_LISTENING:
2087 if (st->bucket >= INET_LHTABLE_SIZE)
2088 break;
2089 st->state = TCP_SEQ_STATE_LISTENING;
2090 rc = listening_get_next(seq, NULL);
2091 while (offset-- && rc)
2092 rc = listening_get_next(seq, rc);
2093 if (rc)
2094 break;
2095 st->bucket = 0;
2096 st->state = TCP_SEQ_STATE_ESTABLISHED;
2097 /* Fallthrough */
2098 case TCP_SEQ_STATE_ESTABLISHED:
2099 if (st->bucket > tcp_hashinfo.ehash_mask)
2100 break;
2101 rc = established_get_first(seq);
2102 while (offset-- && rc)
2103 rc = established_get_next(seq, rc);
2104 }
2105
2106 st->num = orig_num;
2107
2108 return rc;
2109 }
2110
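/* tcp_seq_start() remembers the last position handed out in st->last_pos;
 * when the next read continues at exactly that offset, tcp_seek_last_pos()
 * restarts the walk from the saved bucket/offset instead of from the
 * beginning of both hash tables.
 */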
2111 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2112 {
2113 struct tcp_iter_state *st = seq->private;
2114 void *rc;
2115
2116 if (*pos && *pos == st->last_pos) {
2117 rc = tcp_seek_last_pos(seq);
2118 if (rc)
2119 goto out;
2120 }
2121
2122 st->state = TCP_SEQ_STATE_LISTENING;
2123 st->num = 0;
2124 st->bucket = 0;
2125 st->offset = 0;
2126 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2127
2128 out:
2129 st->last_pos = *pos;
2130 return rc;
2131 }
2132
2133 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2134 {
2135 struct tcp_iter_state *st = seq->private;
2136 void *rc = NULL;
2137
2138 if (v == SEQ_START_TOKEN) {
2139 rc = tcp_get_idx(seq, 0);
2140 goto out;
2141 }
2142
2143 switch (st->state) {
2144 case TCP_SEQ_STATE_LISTENING:
2145 rc = listening_get_next(seq, v);
2146 if (!rc) {
2147 st->state = TCP_SEQ_STATE_ESTABLISHED;
2148 st->bucket = 0;
2149 st->offset = 0;
2150 rc = established_get_first(seq);
2151 }
2152 break;
2153 case TCP_SEQ_STATE_ESTABLISHED:
2154 rc = established_get_next(seq, v);
2155 break;
2156 }
2157 out:
2158 ++*pos;
2159 st->last_pos = *pos;
2160 return rc;
2161 }
2162
2163 static void tcp_seq_stop(struct seq_file *seq, void *v)
2164 {
2165 struct tcp_iter_state *st = seq->private;
2166
2167 switch (st->state) {
2168 case TCP_SEQ_STATE_LISTENING:
2169 if (v != SEQ_START_TOKEN)
2170 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2171 break;
2172 case TCP_SEQ_STATE_ESTABLISHED:
2173 if (v)
2174 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2175 break;
2176 }
2177 }
2178
2179 int tcp_seq_open(struct inode *inode, struct file *file)
2180 {
2181 struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2182 struct tcp_iter_state *s;
2183 int err;
2184
2185 err = seq_open_net(inode, file, &afinfo->seq_ops,
2186 sizeof(struct tcp_iter_state));
2187 if (err < 0)
2188 return err;
2189
2190 s = ((struct seq_file *)file->private_data)->private;
2191 s->family = afinfo->family;
2192 s->last_pos = 0;
2193 return 0;
2194 }
2195 EXPORT_SYMBOL(tcp_seq_open);
2196
2197 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2198 {
2199 int rc = 0;
2200 struct proc_dir_entry *p;
2201
2202 afinfo->seq_ops.start = tcp_seq_start;
2203 afinfo->seq_ops.next = tcp_seq_next;
2204 afinfo->seq_ops.stop = tcp_seq_stop;
2205
2206 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2207 afinfo->seq_fops, afinfo);
2208 if (!p)
2209 rc = -ENOMEM;
2210 return rc;
2211 }
2212 EXPORT_SYMBOL(tcp_proc_register);
2213
2214 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2215 {
2216 remove_proc_entry(afinfo->name, net->proc_net);
2217 }
2218 EXPORT_SYMBOL(tcp_proc_unregister);
2219
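/* The three helpers below format one /proc/net/tcp record each: pending
 * connection requests (get_openreq4), full sockets (get_tcp4_sock) and
 * TIME_WAIT sockets (get_timewait4_sock).  Addresses and ports are printed
 * as raw hex in host byte order, so on a little-endian machine a listener
 * bound to 127.0.0.1:8080 would appear roughly as (illustrative values only):
 *
 *	 0: 0100007F:1F90 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000 0 12345 ...
 */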
2220 static void get_openreq4(const struct request_sock *req,
2221 struct seq_file *f, int i)
2222 {
2223 const struct inet_request_sock *ireq = inet_rsk(req);
2224 long delta = req->rsk_timer.expires - jiffies;
2225
2226 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2227 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2228 i,
2229 ireq->ir_loc_addr,
2230 ireq->ir_num,
2231 ireq->ir_rmt_addr,
2232 ntohs(ireq->ir_rmt_port),
2233 TCP_SYN_RECV,
2234 0, 0, /* could print option size, but that is af dependent. */
2235 1, /* timers active (only the expire timer) */
2236 jiffies_delta_to_clock_t(delta),
2237 req->num_timeout,
2238 from_kuid_munged(seq_user_ns(f),
2239 sock_i_uid(req->rsk_listener)),
2240 0, /* non standard timer */
2241 0, /* open_requests have no inode */
2242 0,
2243 req);
2244 }
2245
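/* The "tr" column reported below encodes which timer is pending:
 * 0 - none, 1 - retransmit/loss-probe/reorder timer, 2 - keepalive
 * (sk_timer), 4 - zero window probe; get_timewait4_sock() reports 3 for the
 * TIME_WAIT timer.
 */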
2246 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2247 {
2248 int timer_active;
2249 unsigned long timer_expires;
2250 const struct tcp_sock *tp = tcp_sk(sk);
2251 const struct inet_connection_sock *icsk = inet_csk(sk);
2252 const struct inet_sock *inet = inet_sk(sk);
2253 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2254 __be32 dest = inet->inet_daddr;
2255 __be32 src = inet->inet_rcv_saddr;
2256 __u16 destp = ntohs(inet->inet_dport);
2257 __u16 srcp = ntohs(inet->inet_sport);
2258 int rx_queue;
2259 int state;
2260
2261 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2262 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2263 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2264 timer_active = 1;
2265 timer_expires = icsk->icsk_timeout;
2266 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2267 timer_active = 4;
2268 timer_expires = icsk->icsk_timeout;
2269 } else if (timer_pending(&sk->sk_timer)) {
2270 timer_active = 2;
2271 timer_expires = sk->sk_timer.expires;
2272 } else {
2273 timer_active = 0;
2274 timer_expires = jiffies;
2275 }
2276
2277 state = sk_state_load(sk);
2278 if (state == TCP_LISTEN)
2279 rx_queue = sk->sk_ack_backlog;
2280 else
2281 /* Because we don't lock the socket,
2282 * we might find a transient negative value.
2283 */
2284 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2285
2286 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2287 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2288 i, src, srcp, dest, destp, state,
2289 tp->write_seq - tp->snd_una,
2290 rx_queue,
2291 timer_active,
2292 jiffies_delta_to_clock_t(timer_expires - jiffies),
2293 icsk->icsk_retransmits,
2294 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2295 icsk->icsk_probes_out,
2296 sock_i_ino(sk),
2297 refcount_read(&sk->sk_refcnt), sk,
2298 jiffies_to_clock_t(icsk->icsk_rto),
2299 jiffies_to_clock_t(icsk->icsk_ack.ato),
2300 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2301 tp->snd_cwnd,
2302 state == TCP_LISTEN ?
2303 fastopenq->max_qlen :
2304 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2305 }
2306
2307 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2308 struct seq_file *f, int i)
2309 {
2310 long delta = tw->tw_timer.expires - jiffies;
2311 __be32 dest, src;
2312 __u16 destp, srcp;
2313
2314 dest = tw->tw_daddr;
2315 src = tw->tw_rcv_saddr;
2316 destp = ntohs(tw->tw_dport);
2317 srcp = ntohs(tw->tw_sport);
2318
2319 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2320 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2321 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2322 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2323 refcount_read(&tw->tw_refcnt), tw);
2324 }
2325
2326 #define TMPSZ 150
2327
2328 static int tcp4_seq_show(struct seq_file *seq, void *v)
2329 {
2330 struct tcp_iter_state *st;
2331 struct sock *sk = v;
2332
2333 seq_setwidth(seq, TMPSZ - 1);
2334 if (v == SEQ_START_TOKEN) {
2335 seq_puts(seq, " sl local_address rem_address st tx_queue "
2336 "rx_queue tr tm->when retrnsmt uid timeout "
2337 "inode");
2338 goto out;
2339 }
2340 st = seq->private;
2341
2342 if (sk->sk_state == TCP_TIME_WAIT)
2343 get_timewait4_sock(v, seq, st->num);
2344 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2345 get_openreq4(v, seq, st->num);
2346 else
2347 get_tcp4_sock(v, seq, st->num);
2348 out:
2349 seq_pad(seq, '\n');
2350 return 0;
2351 }
2352
2353 static const struct file_operations tcp_afinfo_seq_fops = {
2354 .owner = THIS_MODULE,
2355 .open = tcp_seq_open,
2356 .read = seq_read,
2357 .llseek = seq_lseek,
2358 .release = seq_release_net
2359 };
2360
2361 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2362 .name = "tcp",
2363 .family = AF_INET,
2364 .seq_fops = &tcp_afinfo_seq_fops,
2365 .seq_ops = {
2366 .show = tcp4_seq_show,
2367 },
2368 };
2369
2370 static int __net_init tcp4_proc_init_net(struct net *net)
2371 {
2372 return tcp_proc_register(net, &tcp4_seq_afinfo);
2373 }
2374
2375 static void __net_exit tcp4_proc_exit_net(struct net *net)
2376 {
2377 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2378 }
2379
2380 static struct pernet_operations tcp4_net_ops = {
2381 .init = tcp4_proc_init_net,
2382 .exit = tcp4_proc_exit_net,
2383 };
2384
2385 int __init tcp4_proc_init(void)
2386 {
2387 return register_pernet_subsys(&tcp4_net_ops);
2388 }
2389
2390 void tcp4_proc_exit(void)
2391 {
2392 unregister_pernet_subsys(&tcp4_net_ops);
2393 }
2394 #endif /* CONFIG_PROC_FS */
2395
2396 struct proto tcp_prot = {
2397 .name = "TCP",
2398 .owner = THIS_MODULE,
2399 .close = tcp_close,
2400 .connect = tcp_v4_connect,
2401 .disconnect = tcp_disconnect,
2402 .accept = inet_csk_accept,
2403 .ioctl = tcp_ioctl,
2404 .init = tcp_v4_init_sock,
2405 .destroy = tcp_v4_destroy_sock,
2406 .shutdown = tcp_shutdown,
2407 .setsockopt = tcp_setsockopt,
2408 .getsockopt = tcp_getsockopt,
2409 .keepalive = tcp_set_keepalive,
2410 .recvmsg = tcp_recvmsg,
2411 .sendmsg = tcp_sendmsg,
2412 .sendpage = tcp_sendpage,
2413 .backlog_rcv = tcp_v4_do_rcv,
2414 .release_cb = tcp_release_cb,
2415 .hash = inet_hash,
2416 .unhash = inet_unhash,
2417 .get_port = inet_csk_get_port,
2418 .enter_memory_pressure = tcp_enter_memory_pressure,
2419 .leave_memory_pressure = tcp_leave_memory_pressure,
2420 .stream_memory_free = tcp_stream_memory_free,
2421 .sockets_allocated = &tcp_sockets_allocated,
2422 .orphan_count = &tcp_orphan_count,
2423 .memory_allocated = &tcp_memory_allocated,
2424 .memory_pressure = &tcp_memory_pressure,
2425 .sysctl_mem = sysctl_tcp_mem,
2426 .sysctl_wmem = sysctl_tcp_wmem,
2427 .sysctl_rmem = sysctl_tcp_rmem,
2428 .max_header = MAX_TCP_HEADER,
2429 .obj_size = sizeof(struct tcp_sock),
2430 .slab_flags = SLAB_TYPESAFE_BY_RCU,
2431 .twsk_prot = &tcp_timewait_sock_ops,
2432 .rsk_prot = &tcp_request_sock_ops,
2433 .h.hashinfo = &tcp_hashinfo,
2434 .no_autobind = true,
2435 #ifdef CONFIG_COMPAT
2436 .compat_setsockopt = compat_tcp_setsockopt,
2437 .compat_getsockopt = compat_tcp_getsockopt,
2438 #endif
2439 .diag_destroy = tcp_abort,
2440 };
2441 EXPORT_SYMBOL(tcp_prot);
2442
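/* Each network namespace owns one kernel control socket per possible CPU
 * (net->ipv4.tcp_sk), created by tcp_sk_init() below.  These are used
 * earlier in this file by tcp_v4_send_reset() and tcp_v4_send_ack() to emit
 * RSTs and pure ACKs that are not tied to any full socket.
 */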
2443 static void __net_exit tcp_sk_exit(struct net *net)
2444 {
2445 int cpu;
2446
2447 for_each_possible_cpu(cpu)
2448 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2449 free_percpu(net->ipv4.tcp_sk);
2450 }
2451
2452 static int __net_init tcp_sk_init(struct net *net)
2453 {
2454 int res, cpu, cnt;
2455
2456 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2457 if (!net->ipv4.tcp_sk)
2458 return -ENOMEM;
2459
2460 for_each_possible_cpu(cpu) {
2461 struct sock *sk;
2462
2463 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2464 IPPROTO_TCP, net);
2465 if (res)
2466 goto fail;
2467 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2468
2469 		/* Enforce IP_DF and IP ID == 0 on the RSTs and ACKs
2470 		 * sent from SYN-RECV and TIME-WAIT state.
2471 		 */
2472 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2473
2474 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2475 }
2476
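	/* Per-namespace defaults for the TCP sysctls.  Each of these is
	 * exposed under /proc/sys/net/ipv4/ in the owning namespace and can
	 * be tuned there, e.g.
	 *
	 *	# sysctl net.ipv4.tcp_syncookies
	 *	net.ipv4.tcp_syncookies = 1
	 *
	 * (illustrative output, matching the default set below).
	 */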
2477 net->ipv4.sysctl_tcp_ecn = 2;
2478 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2479
2480 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2481 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2482 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2483 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2484
2485 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2486 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2487 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2488
2489 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2490 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2491 net->ipv4.sysctl_tcp_syncookies = 1;
2492 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2493 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2494 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2495 net->ipv4.sysctl_tcp_orphan_retries = 0;
2496 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2497 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2498 net->ipv4.sysctl_tcp_tw_reuse = 0;
2499
2500 cnt = tcp_hashinfo.ehash_mask + 1;
2501 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2502 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2503
2504 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2505 net->ipv4.sysctl_tcp_sack = 1;
2506 net->ipv4.sysctl_tcp_window_scaling = 1;
2507 net->ipv4.sysctl_tcp_timestamps = 1;
2508
2509 return 0;
2510 fail:
2511 tcp_sk_exit(net);
2512
2513 return res;
2514 }
2515
2516 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2517 {
2518 inet_twsk_purge(&tcp_hashinfo, AF_INET);
2519 }
2520
2521 static struct pernet_operations __net_initdata tcp_sk_ops = {
2522 .init = tcp_sk_init,
2523 .exit = tcp_sk_exit,
2524 .exit_batch = tcp_sk_exit_batch,
2525 };
2526
2527 void __init tcp_v4_init(void)
2528 {
2529 if (register_pernet_subsys(&tcp_sk_ops))
2530 panic("Failed to create the TCP control socket.\n");
2531 }
2532