1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * IPv4 specific functions 9 * 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 * 18 * This program is free software; you can redistribute it and/or 19 * modify it under the terms of the GNU General Public License 20 * as published by the Free Software Foundation; either version 21 * 2 of the License, or (at your option) any later version. 22 */ 23 24 /* 25 * Changes: 26 * David S. Miller : New socket lookup architecture. 27 * This code is dedicated to John Dyson. 28 * David S. Miller : Change semantics of established hash, 29 * half is devoted to TIME_WAIT sockets 30 * and the rest go in the other half. 31 * Andi Kleen : Add support for syncookies and fixed 32 * some bugs: ip options weren't passed to 33 * the TCP layer, missed a check for an 34 * ACK bit. 35 * Andi Kleen : Implemented fast path mtu discovery. 36 * Fixed many serious bugs in the 37 * request_sock handling and moved 38 * most of it into the af independent code. 39 * Added tail drop and some other bugfixes. 40 * Added new listen semantics. 41 * Mike McLagan : Routing by source 42 * Juan Jose Ciarlante: ip_dynaddr bits 43 * Andi Kleen: various fixes. 44 * Vitaly E. Lavrov : Transparent proxy revived after year 45 * coma. 46 * Andi Kleen : Fix new listen. 47 * Andi Kleen : Fix accept error reporting. 48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 49 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 50 * a single port at the same time. 
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>
#include <net/secure_seq.h>
#include <net/tcp_memcontrol.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

/* Allow reuse of TIME-WAIT sockets for new outgoing connections
 * (consulted in tcp_twsk_unique() below).
 */
int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);


#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

/* Pick a secure initial sequence number for a passive connection,
 * keyed on the 4-tuple of the incoming SYN.
 */
static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}

/* Decide whether a TIME-WAIT socket @sktw may be reused for a new
 * connection attempt by @sk on the same 4-tuple.  Returns 1 (and takes
 * a reference on @sktw) when reuse is allowed, 0 otherwise.
 */
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		/* Start well past the old connection's send space so the
		 * peer cannot confuse the two incarnations.
		 */
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

/* This will initiate an outgoing connection.
 *
 * Resolves the route (honouring IP source routing options), picks
 * source address/port, moves the socket to SYN-SENT, hashes it, and
 * sends the SYN via tcp_connect().  Returns 0 or a negative errno.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	if (inet_opt && inet_opt->opt.srr) {
		/* Strict source routing: route to the first hop instead of
		 * the final destination.
		 */
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk, true);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	/* Re-validate the route now that the (possibly autoselected)
	 * source port is known.
	 */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);

	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
static void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;	/* stashed by tcp_v4_err() */

	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}

/* Handle an ICMP redirect for @sk by updating its cached route, if any. */
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	/* icmp_skb->data points at the embedded (offending) IP header */
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *req;
	__u32 seq;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	/* Need at least the embedded IP header plus 8 bytes of TCP header */
	if (icmp_skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			 iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	req = tp->fastopen_rsk;
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt) &&
	    (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
		/* For a Fast Open socket, allow seq to be snt_isn. */
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				/* Defer to tcp_release_cb(); the extra hold
				 * keeps the socket alive until then.
				 */
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff)
			break;

		/* XXX (TFO) - revisit the following logic for TFO */

		if (sock_owned_by_user(sk))
			break;

		/* Undo one step of exponential backoff and rearm/fire the
		 * retransmit timer accordingly.
		 */
		icsk->icsk_backoff--;
		inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
		tcp_bound_rto(sk);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
				tcp_time_stamp - TCP_SKB_CB(skb)->when);

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	/* XXX (TFO) - if it's a TFO socket and has been accepted, rather
	 * than following the TCP_SYN_RECV case and closing the socket,
	 * we ignore the ICMP error and keep trying like a fully established
	 * socket. Is this the right thing to do?
	 */
	if (req && req->sk == NULL)
		goto out;

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can f.e. if SYNs crossed,
			       or Fast Open.
			     */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *				--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

/* Fill in the TCP checksum for an outgoing segment, either offloading
 * it (CHECKSUM_PARTIAL) or computing it in software.
 */
static void __tcp_v4_send_check(struct sk_buff *skb,
				__be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/* Prepare the TCP checksum of a GSO skb for segmentation offload. */
int tcp_v4_gso_send_check(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;

	if (!pskb_may_pull(skb, sizeof(*th)))
		return -EINVAL;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	th->check = 0;
	skb->ip_summed = CHECKSUM_PARTIAL;
	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
	return 0;
}

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *	So that we build reply only basing on parameters
 *	arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* Only reply for packets that were actually addressed to us. */
	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = sizeof(struct tcphdr) / 4;
	rep.th.rst = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		/* No ACK in the offending segment: acknowledge exactly the
		 * data/flags it carried (RFC 793 reset generation).
		 */
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (!sk && hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net,
					     &tcp_hashinfo, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			return;
		rcu_read_lock();
		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto release_sk1;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto release_sk1;
	} else {
		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					     &ip_hdr(skb)->saddr,
					     AF_INET) : NULL;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, ip_hdr(skb)->saddr,
			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
	if (sk1) {
		rcu_read_unlock();
		sock_put(sk1);
	}
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk, struct sk_buff *skb,
			    u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = sock_net(sk);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		/* Echo the peer's timestamp (RFC 1323 TSopt). */
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = arg.iov[0].iov_len / 4;
	rep.th.seq = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack = 1;
	rep.th.window = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		/* MD5 option goes after the timestamp option, if present. */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, ip_hdr(skb)->saddr,
			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

/* ACK a segment received for a TIME-WAIT socket, then drop the
 * timewait reference taken by the caller.
 */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

/* Send an ACK (typically a SYN-ACK retransmit trigger response) on
 * behalf of an embryonic connection described by @req.
 */
static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	tcp_v4_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ?
			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
			tcp_time_stamp,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct request_sock *req,
			      u16 queue_mapping,
			      bool nocache)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff * skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, NULL);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);

		skb_set_queue_mapping(skb, queue_mapping);
		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
		/* Record the time of the first (non-errored) SYN-ACK for RTT */
		if (!tcp_rsk(req)->snt_synack && !err)
			tcp_rsk(req)->snt_synack = tcp_time_stamp;
	}

	return err;
}

/* Retransmit a SYN-ACK for @req; bumps the retransmit counter on success. */
static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
{
	int res = tcp_v4_send_synack(sk, NULL, req, 0, false);

	if (!res)
		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
	return res;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	/* Release the IP options duplicated by tcp_v4_save_options(). */
	kfree(inet_rsk(req)->opt);
}

/*
 * Return true if a syncookie should be sent
 */
bool tcp_syn_flood_action(struct sock *sk,
			  const struct sk_buff *skb,
			  const char *proto)
{
	const char *msg = "Dropping request";
	bool want_cookie = false;
	struct listen_sock *lopt;



#ifdef CONFIG_SYN_COOKIES
	if (sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = true;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	/* Warn once per listener about the (possible) SYN flood. */
	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
	if (!lopt->synflood_warned) {
		lopt->synflood_warned = 1;
		pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
			proto, ntohs(tcp_hdr(skb)->dest), msg);
	}
	return want_cookie;
}
EXPORT_SYMBOL(tcp_syn_flood_action);

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
{
	const struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options_rcu *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = sizeof(*dopt) + opt->optlen;

		/* GFP_ATOMIC: called from softirq receive path. */
		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(&dopt->opt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.
 */
struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       sock_owned_by_user(sk) ||
				       lockdep_is_held(&sk->sk_lock.slock));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	/* Linear scan; key lists are expected to be short. */
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

/* Look up the MD5 key for the peer of an established socket @addr_sk. */
struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* Look up the MD5 key for the peer of a pending request @req. */
static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));
	if (!md5sig) {
		/* First key on this socket: allocate the info container. */
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		/* MD5 signing is incompatible with GSO on this socket. */
		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

/* Remove the MD5 key for @addr/@family; returns -ENOENT if not found. */
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	/* Give back the per-socket option memory charged in do_add(). */
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));
	if (hlist_empty(&md5sig->head))
		tcp_free_md5sig_pool();
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

/* Drop every MD5 key on @sk (socket teardown path). */
static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	if (!hlist_empty(&md5sig->head))
		tcp_free_md5sig_pool();
	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

/* setsockopt(TCP_MD5SIG) handler: add or (for an empty key) delete the
 * MD5 key for the given IPv4 peer address.
 */
static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_key || !cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}

/* Feed the RFC 2385 IPv4 pseudo-header into the per-cpu MD5 hash state. */
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

/* Compute the MD5 signature over pseudo-header + TCP header + key,
 * writing 16 bytes into @md5_hash.  Returns 0 on success, 1 on error
 * (in which case @md5_hash is zeroed).
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

tcp_v4_md5_hash_skb(char *
md5_hash,struct tcp_md5sig_key * key,const struct sock * sk,const struct request_sock * req,const struct sk_buff * skb)1170 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, 1171 const struct sock *sk, const struct request_sock *req, 1172 const struct sk_buff *skb) 1173 { 1174 struct tcp_md5sig_pool *hp; 1175 struct hash_desc *desc; 1176 const struct tcphdr *th = tcp_hdr(skb); 1177 __be32 saddr, daddr; 1178 1179 if (sk) { 1180 saddr = inet_sk(sk)->inet_saddr; 1181 daddr = inet_sk(sk)->inet_daddr; 1182 } else if (req) { 1183 saddr = inet_rsk(req)->loc_addr; 1184 daddr = inet_rsk(req)->rmt_addr; 1185 } else { 1186 const struct iphdr *iph = ip_hdr(skb); 1187 saddr = iph->saddr; 1188 daddr = iph->daddr; 1189 } 1190 1191 hp = tcp_get_md5sig_pool(); 1192 if (!hp) 1193 goto clear_hash_noput; 1194 desc = &hp->md5_desc; 1195 1196 if (crypto_hash_init(desc)) 1197 goto clear_hash; 1198 1199 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len)) 1200 goto clear_hash; 1201 if (tcp_md5_hash_header(hp, th)) 1202 goto clear_hash; 1203 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) 1204 goto clear_hash; 1205 if (tcp_md5_hash_key(hp, key)) 1206 goto clear_hash; 1207 if (crypto_hash_final(desc, md5_hash)) 1208 goto clear_hash; 1209 1210 tcp_put_md5sig_pool(); 1211 return 0; 1212 1213 clear_hash: 1214 tcp_put_md5sig_pool(); 1215 clear_hash_noput: 1216 memset(md5_hash, 0, 16); 1217 return 1; 1218 } 1219 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1220 tcp_v4_inbound_md5_hash(struct sock * sk,const struct sk_buff * skb)1221 static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) 1222 { 1223 /* 1224 * This gets called for each TCP segment that arrives 1225 * so we want to be efficient. 1226 * We have 3 drop cases: 1227 * o No MD5 hash and one expected. 1228 * o MD5 hash and we're not expecting one. 1229 * o MD5 hash and its wrong. 
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "");
		return true;
	}
	return false;
}

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
};
#endif

/* Decide how an incoming SYN's Fast Open option should be handled.
 * Returns true if the SYN's payload may be accepted immediately
 * (either the cookie validated or validation was configured off);
 * false means fall back to a normal 3WHS.  On a cookie request or an
 * invalid cookie, a freshly generated cookie is left in @valid_foc
 * for the SYN-ACK.
 */
static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
			       struct request_sock *req,
			       struct tcp_fastopen_cookie *foc,
			       struct tcp_fastopen_cookie *valid_foc)
{
	bool skip_cookie = false;
	struct fastopen_queue *fastopenq;

	if (likely(!fastopen_cookie_present(foc))) {
		/* See include/net/tcp.h for the meaning of these knobs */
		if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
		    ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
		    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
			skip_cookie = true; /* no cookie to validate */
		else
			return false;
	}
	fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
	/* A FO option is present; bump the counter. */
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);

	/* Make sure the listener has enabled fastopen, and we don't
	 * exceed the max # of pending TFO requests allowed before trying
	 * to validating the cookie in order to avoid burning CPU cycles
	 * unnecessarily.
	 *
	 * XXX (TFO) - The implication of checking the max_qlen before
	 * processing a cookie request is that clients can't differentiate
	 * between qlen overflow causing Fast Open to be disabled
	 * temporarily vs a server not supporting Fast Open at all.
	 */
	if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
	    fastopenq == NULL || fastopenq->max_qlen == 0)
		return false;

	if (fastopenq->qlen >= fastopenq->max_qlen) {
		struct request_sock *req1;
		spin_lock(&fastopenq->lock);
		req1 = fastopenq->rskq_rst_head;
		/* Try to reclaim one expired RST-ed request; if none has
		 * expired yet the TFO queue is genuinely full.
		 */
		if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
			spin_unlock(&fastopenq->lock);
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
			/* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
			foc->len = -1;
			return false;
		}
		fastopenq->rskq_rst_head = req1->dl_next;
		fastopenq->qlen--;
		spin_unlock(&fastopenq->lock);
		reqsk_free(req1);
	}
	if (skip_cookie) {
		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		return true;
	}
	if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
		if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
			tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
			if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
			    memcmp(&foc->val[0], &valid_foc->val[0],
			    TCP_FASTOPEN_COOKIE_SIZE) != 0)
				return false;
			valid_foc->len = -1;
		}
		/* Acknowledge the data received from the peer. */
		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		return true;
	} else if (foc->len == 0) { /* Client requesting a cookie */
		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
		NET_INC_STATS_BH(sock_net(sk),
				 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
	} else {
		/* Client sent a cookie with wrong size. Treat it
		 * the same as invalid and return a valid one.
		 */
		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
	}
	return false;
}

/* Create and enqueue the child socket for a validated Fast Open SYN,
 * send the SYN-ACK (consuming @skb_synack), queue any SYN payload, and
 * wake the listener.  Returns 0 on success, -1 if the child could not
 * be created.
 */
static int tcp_v4_conn_req_fastopen(struct sock *sk,
				    struct sk_buff *skb,
				    struct sk_buff *skb_synack,
				    struct request_sock *req)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct sock *child;
	int err;

	req->num_retrans = 0;
	req->num_timeout = 0;
	req->sk = NULL;

	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
	if (child == NULL) {
		NET_INC_STATS_BH(sock_net(sk),
				 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
		kfree_skb(skb_synack);
		return -1;
	}
	err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
				    ireq->rmt_addr, ireq->opt);
	err = net_xmit_eval(err);
	if (!err)
		tcp_rsk(req)->snt_synack = tcp_time_stamp;
	/* XXX (TFO) - is it ok to ignore error and continue? */

	spin_lock(&queue->fastopenq->lock);
	queue->fastopenq->qlen++;
	spin_unlock(&queue->fastopenq->lock);

	/* Initialize the child socket. Have to fix some values to take
	 * into account the child is a Fast Open socket and is created
	 * only out of the bits carried in the SYN packet.
	 */
	tp = tcp_sk(child);

	tp->fastopen_rsk = req;
	/* Do a hold on the listner sk so that if the listener is being
	 * closed, the child that has been accepted can live on and still
	 * access listen_lock.
	 */
	sock_hold(sk);
	tcp_rsk(req)->listener = sk;

	/* RFC1323: The window in SYN & SYN/ACK segments is never
	 * scaled. So correct it appropriately.
	 */
	tp->snd_wnd = ntohs(tcp_hdr(skb)->window);

	/* Activate the retrans timer so that SYNACK can be retransmitted.
	 * The request socket is not added to the SYN table of the parent
	 * because it's been added to the accept queue directly.
	 */
	inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
				  TCP_TIMEOUT_INIT, TCP_RTO_MAX);

	/* Add the child socket directly into the accept queue */
	inet_csk_reqsk_queue_add(sk, req, child);

	/* Now finish processing the fastopen child socket. */
	inet_csk(child)->icsk_af_ops->rebuild_header(child);
	tcp_init_congestion_control(child);
	tcp_mtup_init(child);
	tcp_init_buffer_space(child);
	tcp_init_metrics(child);

	/* Queue the data carried in the SYN packet. We need to first
	 * bump skb's refcnt because the caller will attempt to free it.
	 *
	 * XXX (TFO) - we honor a zero-payload TFO request for now.
	 * (Any reason not to?)
	 */
	if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
		/* Don't queue the skb if there is no payload in SYN.
		 * XXX (TFO) - How about SYN+FIN?
		 */
		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
	} else {
		skb = skb_get(skb);
		skb_dst_drop(skb);
		__skb_pull(skb, tcp_hdr(skb)->doff * 4);
		skb_set_owner_r(skb, child);
		__skb_queue_tail(&child->sk_receive_queue, skb);
		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		tp->syn_data_acked = 1;
	}
	sk->sk_data_ready(sk, 0);
	bh_unlock_sock(child);
	sock_put(child);
	WARN_ON(req->sk == NULL);
	return 0;
}

/* Handle an incoming SYN on a listening socket: allocate a request_sock,
 * apply syncookie / tw_recycle / Fast Open policy, and send the SYN-ACK.
 * Always returns 0 (the segment is consumed one way or another).
 */
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_options_received tmp_opt;
	struct request_sock *req;
	struct inet_request_sock *ireq;
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = NULL;
	__be32 saddr = ip_hdr(skb)->saddr;
	__be32 daddr = ip_hdr(skb)->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	bool want_cookie = false;
	struct flowi4 fl4;
	struct tcp_fastopen_cookie foc = { .len = -1 };
	struct tcp_fastopen_cookie valid_foc = { .len = -1 };
	struct sk_buff *skb_synack;
	int do_fastopen;

	/* Never answer to SYNs send to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
		if (!want_cookie)
			goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
		goto drop;
	}

	req = inet_reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

#ifdef CONFIG_TCP_MD5SIG
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
	tmp_opt.user_mss = tp->rx_opt.user_mss;
	/* Fast Open is incompatible with syncookies, so don't even parse
	 * the cookie option when a syncookie is being used.
	 */
	tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(skb);
	ireq->ir_mark = inet_request_mark(sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	if (!want_cookie || tmp_opt.tstamp_ok)
		TCP_ECN_create_request(req, skb, sock_net(sk));

	if (want_cookie) {
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
		req->cookie_ts = tmp_opt.tstamp_ok;
	} else if (!isn) {
		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 *
		 * If "isn" is not zero, this request hit alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
		    fl4.daddr == saddr) {
			if (!tcp_peer_is_proven(req, dst, true)) {
				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
				goto drop_and_release;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 !tcp_peer_is_proven(req, dst, false)) {
			/* Without syncookies last quarter of
			 * backlog is filled with destinations,
			 * proven to be alive.
			 * It means that we continue to communicate
			 * to destinations, already remembered
			 * to the moment of synflood.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
				       &saddr, ntohs(tcp_hdr(skb)->source));
			goto drop_and_release;
		}

		isn = tcp_v4_init_sequence(skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	if (dst == NULL) {
		dst = inet_csk_route_req(sk, &fl4, req);
		if (dst == NULL)
			goto drop_and_free;
	}
	do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);

	/* We don't call tcp_v4_send_synack() directly because we need
	 * to make sure a child socket can be created successfully before
	 * sending back synack!
	 *
	 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
	 * (or better yet, call tcp_send_synack() in the child context
	 * directly, but will have to fix bunch of other code first)
	 * after syn_recv_sock() except one will need to first fix the
	 * latter to remove its dependency on the current implementation
	 * of tcp_v4_send_synack()->tcp_select_initial_window().
	 */
	skb_synack = tcp_make_synack(sk, dst, req,
	    fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);

	if (skb_synack) {
		__tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
		skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
	} else
		goto drop_and_free;

	if (likely(!do_fastopen)) {
		int err;
		err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
		     ireq->rmt_addr, ireq->opt);
		err = net_xmit_eval(err);
		if (err || want_cookie)
			goto drop_and_free;

		tcp_rsk(req)->snt_synack = tcp_time_stamp;
		tcp_rsk(req)->listener = NULL;
		/* Add the request_sock to the SYN table */
		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
		if (fastopen_cookie_present(&foc) && foc.len != 0)
			NET_INC_STATS_BH(sock_net(sk),
			    LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
	} else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req))
		goto drop_and_free;

	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	reqsk_free(req);
drop:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp = tcp_sk(newsk);
	newinet = inet_sk(newsk);
	ireq = inet_rsk(req);
	newinet->inet_daddr = ireq->rmt_addr;
	newinet->inet_rcv_saddr = ireq->loc_addr;
	newinet->inet_saddr = ireq->loc_addr;
	/* Ownership of the saved IP options moves to the child. */
	inet_opt = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt = NULL;
	newinet->mc_index = inet_iif(skb);
	newinet->mc_ttl = ip_hdr(skb)->ttl;
	newinet->rcv_tos = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_mtup_init(newsk);
	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);
	tcp_synack_rtt_meas(newsk, req);
	newtp->total_retrans = req->num_retrans;

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	__inet_hash_nolisten(newsk, NULL);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

/* Map a segment arriving on a LISTEN socket to the socket that should
 * process it: a pending request, an established child, a syncookie
 * child, or the listener itself.  Returns NULL to drop.
 */
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev, false);

	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->syn)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

/* Verify (or set up for deferred verification) the TCP checksum of an
 * incoming segment.  Returns non-zero if the checksum is bad.
 */
static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		if (!tcp_v4_check(skb->len, iph->saddr,
				  iph->daddr, skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			return 0;
		}
	}

	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
				       skb->len, IPPROTO_TCP, 0);

	/* Short packets are checked right away; longer ones are deferred
	 * until copy time (see the "defer the checks" comment in rcv).
	 */
	if (skb->len <= 76) {
		return __skb_checksum_complete(skb);
	}
	return 0;
}


/* The socket must have it's spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * We really want to reject the packet as early as possible
	 * if:
	 *	o We're expecting an MD5'd packet and this is no MD5 tcp option
	 *	o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard;
#endif

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		if (dst) {
			/* Invalidate the cached rx route if the incoming
			 * interface changed or the route went stale.
			 */
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    dst->ops->check(dst, 0) == NULL) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
			rsk = sk;
			goto reset;
		}
		return 0;
	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

/* Early demux: look up the established socket for this skb before
 * routing, and reuse its cached rx dst when still valid.
 */
void tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk->sk_state != TCP_TIME_WAIT) {
			struct dst_entry *dst = sk->sk_rx_dst;

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}

/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see, why it failed. 8)8)				  --ANK
 *
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sysctl_tcp_low_latency || !tp->ucopy.task)
		return false;

	if (skb->len <= tcp_hdrlen(skb) &&
	    skb_queue_len(&tp->ucopy.prequeue) == 0)
		return false;

	skb_dst_force(skb);
	__skb_queue_tail(&tp->ucopy.prequeue, skb);
	tp->ucopy.memory += skb->truesize;
	if (tp->ucopy.memory > sk->sk_rcvbuf) {
		struct sk_buff *skb1;

		BUG_ON(sock_owned_by_user(sk));

		/* Prequeue overflowed: flush it through the normal
		 * backlog receive path.
		 */
		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
			sk_backlog_rcv(sk, skb1);
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPPREQUEUEDROPPED);
		}

		tp->ucopy.memory = 0;
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		wake_up_interruptible_sync_poll(sk_sleep(sk),
					   POLLIN | POLLRDNORM | POLLRDBAND);
		if (!inet_csk_ack_scheduled(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  (3 * tcp_rto_min(sk)) / 4,
						  TCP_RTO_MAX);
	}
	return true;
}
EXPORT_SYMBOL(tcp_prequeue);

/*
 *	From tcp_input.c
 */

/* Main IPv4 TCP receive entry point, called from the IP layer. */
int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;
	int ret;
	struct net *net = dev_net(skb->dev);

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
		goto csum_error;

	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked = 0;

	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;
	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
		struct tcp_sock *tp = tcp_sk(sk);
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
			tp->ucopy.dma_chan = net_dma_find_channel();
		if (tp->ucopy.dma_chan)
			ret = tcp_v4_do_rcv(sk, skb);
		else
#endif
		{
			if (!tcp_prequeue(sk, skb))
				ret = tcp_v4_do_rcv(sk, skb);
		}
	} else if (unlikely(sk_add_backlog(sk, skb,
					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
		/* Owner is busy and the backlog is full: drop. */
		bh_unlock_sock(sk);
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
csum_error:
		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
bad_packet:
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2)) {
		inet_twsk_put(inet_twsk(sk));
		goto bad_packet;
	}
	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		/* A new SYN for this tuple: if a listener exists, kill the
		 * timewait bucket and restart processing on the listener.
		 */
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

/* Cache the skb's dst (and arrival ifindex) as the socket's rx route. */
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	dst_hold(dst);
	sk->sk_rx_dst = dst;
	inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit
= ip_queue_xmit, 2136 .send_check = tcp_v4_send_check, 2137 .rebuild_header = inet_sk_rebuild_header, 2138 .sk_rx_dst_set = inet_sk_rx_dst_set, 2139 .conn_request = tcp_v4_conn_request, 2140 .syn_recv_sock = tcp_v4_syn_recv_sock, 2141 .net_header_len = sizeof(struct iphdr), 2142 .setsockopt = ip_setsockopt, 2143 .getsockopt = ip_getsockopt, 2144 .addr2sockaddr = inet_csk_addr2sockaddr, 2145 .sockaddr_len = sizeof(struct sockaddr_in), 2146 .bind_conflict = inet_csk_bind_conflict, 2147 #ifdef CONFIG_COMPAT 2148 .compat_setsockopt = compat_ip_setsockopt, 2149 .compat_getsockopt = compat_ip_getsockopt, 2150 #endif 2151 }; 2152 EXPORT_SYMBOL(ipv4_specific); 2153 2154 #ifdef CONFIG_TCP_MD5SIG 2155 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2156 .md5_lookup = tcp_v4_md5_lookup, 2157 .calc_md5_hash = tcp_v4_md5_hash_skb, 2158 .md5_parse = tcp_v4_parse_md5_keys, 2159 }; 2160 #endif 2161 2162 /* NOTE: A lot of things set to zero explicitly by call to 2163 * sk_alloc() so need not be done here. 2164 */ tcp_v4_init_sock(struct sock * sk)2165 static int tcp_v4_init_sock(struct sock *sk) 2166 { 2167 struct inet_connection_sock *icsk = inet_csk(sk); 2168 2169 tcp_init_sock(sk); 2170 2171 icsk->icsk_af_ops = &ipv4_specific; 2172 2173 #ifdef CONFIG_TCP_MD5SIG 2174 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2175 #endif 2176 2177 return 0; 2178 } 2179 tcp_v4_destroy_sock(struct sock * sk)2180 void tcp_v4_destroy_sock(struct sock *sk) 2181 { 2182 struct tcp_sock *tp = tcp_sk(sk); 2183 2184 tcp_clear_xmit_timers(sk); 2185 2186 tcp_cleanup_congestion_control(sk); 2187 2188 /* Cleanup up the write buffer. */ 2189 tcp_write_queue_purge(sk); 2190 2191 /* Cleans up our, hopefully empty, out_of_order_queue. 
*/ 2192 __skb_queue_purge(&tp->out_of_order_queue); 2193 2194 #ifdef CONFIG_TCP_MD5SIG 2195 /* Clean up the MD5 key list, if any */ 2196 if (tp->md5sig_info) { 2197 tcp_clear_md5_list(sk); 2198 kfree_rcu(tp->md5sig_info, rcu); 2199 tp->md5sig_info = NULL; 2200 } 2201 #endif 2202 2203 #ifdef CONFIG_NET_DMA 2204 /* Cleans up our sk_async_wait_queue */ 2205 __skb_queue_purge(&sk->sk_async_wait_queue); 2206 #endif 2207 2208 /* Clean prequeue, it must be empty really */ 2209 __skb_queue_purge(&tp->ucopy.prequeue); 2210 2211 /* Clean up a referenced TCP bind bucket. */ 2212 if (inet_csk(sk)->icsk_bind_hash) 2213 inet_put_port(sk); 2214 2215 BUG_ON(tp->fastopen_rsk != NULL); 2216 2217 /* If socket is aborted during connect operation */ 2218 tcp_free_fastopen_req(tp); 2219 2220 sk_sockets_allocated_dec(sk); 2221 sock_release_memcg(sk); 2222 } 2223 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2224 2225 #ifdef CONFIG_PROC_FS 2226 /* Proc filesystem TCP sock list dumping. */ 2227 tw_head(struct hlist_nulls_head * head)2228 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head) 2229 { 2230 return hlist_nulls_empty(head) ? NULL : 2231 list_entry(head->first, struct inet_timewait_sock, tw_node); 2232 } 2233 tw_next(struct inet_timewait_sock * tw)2234 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) 2235 { 2236 return !is_a_nulls(tw->tw_node.next) ? 2237 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; 2238 } 2239 2240 /* 2241 * Get next listener socket follow cur. If cur is NULL, get first socket 2242 * starting from bucket given in st->bucket; when st->bucket is zero the 2243 * very first socket in the hash table is returned. 
2244 */ listening_get_next(struct seq_file * seq,void * cur)2245 static void *listening_get_next(struct seq_file *seq, void *cur) 2246 { 2247 struct inet_connection_sock *icsk; 2248 struct hlist_nulls_node *node; 2249 struct sock *sk = cur; 2250 struct inet_listen_hashbucket *ilb; 2251 struct tcp_iter_state *st = seq->private; 2252 struct net *net = seq_file_net(seq); 2253 2254 if (!sk) { 2255 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2256 spin_lock_bh(&ilb->lock); 2257 sk = sk_nulls_head(&ilb->head); 2258 st->offset = 0; 2259 goto get_sk; 2260 } 2261 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2262 ++st->num; 2263 ++st->offset; 2264 2265 if (st->state == TCP_SEQ_STATE_OPENREQ) { 2266 struct request_sock *req = cur; 2267 2268 icsk = inet_csk(st->syn_wait_sk); 2269 req = req->dl_next; 2270 while (1) { 2271 while (req) { 2272 if (req->rsk_ops->family == st->family) { 2273 cur = req; 2274 goto out; 2275 } 2276 req = req->dl_next; 2277 } 2278 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries) 2279 break; 2280 get_req: 2281 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; 2282 } 2283 sk = sk_nulls_next(st->syn_wait_sk); 2284 st->state = TCP_SEQ_STATE_LISTENING; 2285 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 2286 } else { 2287 icsk = inet_csk(sk); 2288 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 2289 if (reqsk_queue_len(&icsk->icsk_accept_queue)) 2290 goto start_req; 2291 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 2292 sk = sk_nulls_next(sk); 2293 } 2294 get_sk: 2295 sk_nulls_for_each_from(sk, node) { 2296 if (!net_eq(sock_net(sk), net)) 2297 continue; 2298 if (sk->sk_family == st->family) { 2299 cur = sk; 2300 goto out; 2301 } 2302 icsk = inet_csk(sk); 2303 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 2304 if (reqsk_queue_len(&icsk->icsk_accept_queue)) { 2305 start_req: 2306 st->uid = sock_i_uid(sk); 2307 st->syn_wait_sk = sk; 2308 st->state = TCP_SEQ_STATE_OPENREQ; 2309 
st->sbucket = 0; 2310 goto get_req; 2311 } 2312 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 2313 } 2314 spin_unlock_bh(&ilb->lock); 2315 st->offset = 0; 2316 if (++st->bucket < INET_LHTABLE_SIZE) { 2317 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2318 spin_lock_bh(&ilb->lock); 2319 sk = sk_nulls_head(&ilb->head); 2320 goto get_sk; 2321 } 2322 cur = NULL; 2323 out: 2324 return cur; 2325 } 2326 listening_get_idx(struct seq_file * seq,loff_t * pos)2327 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2328 { 2329 struct tcp_iter_state *st = seq->private; 2330 void *rc; 2331 2332 st->bucket = 0; 2333 st->offset = 0; 2334 rc = listening_get_next(seq, NULL); 2335 2336 while (rc && *pos) { 2337 rc = listening_get_next(seq, rc); 2338 --*pos; 2339 } 2340 return rc; 2341 } 2342 empty_bucket(struct tcp_iter_state * st)2343 static inline bool empty_bucket(struct tcp_iter_state *st) 2344 { 2345 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) && 2346 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain); 2347 } 2348 2349 /* 2350 * Get first established socket starting from bucket given in st->bucket. 2351 * If st->bucket is zero, the very first socket in the hash is returned. 
2352 */ established_get_first(struct seq_file * seq)2353 static void *established_get_first(struct seq_file *seq) 2354 { 2355 struct tcp_iter_state *st = seq->private; 2356 struct net *net = seq_file_net(seq); 2357 void *rc = NULL; 2358 2359 st->offset = 0; 2360 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { 2361 struct sock *sk; 2362 struct hlist_nulls_node *node; 2363 struct inet_timewait_sock *tw; 2364 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 2365 2366 /* Lockless fast path for the common case of empty buckets */ 2367 if (empty_bucket(st)) 2368 continue; 2369 2370 spin_lock_bh(lock); 2371 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 2372 if (sk->sk_family != st->family || 2373 !net_eq(sock_net(sk), net)) { 2374 continue; 2375 } 2376 rc = sk; 2377 goto out; 2378 } 2379 st->state = TCP_SEQ_STATE_TIME_WAIT; 2380 inet_twsk_for_each(tw, node, 2381 &tcp_hashinfo.ehash[st->bucket].twchain) { 2382 if (tw->tw_family != st->family || 2383 !net_eq(twsk_net(tw), net)) { 2384 continue; 2385 } 2386 rc = tw; 2387 goto out; 2388 } 2389 spin_unlock_bh(lock); 2390 st->state = TCP_SEQ_STATE_ESTABLISHED; 2391 } 2392 out: 2393 return rc; 2394 } 2395 established_get_next(struct seq_file * seq,void * cur)2396 static void *established_get_next(struct seq_file *seq, void *cur) 2397 { 2398 struct sock *sk = cur; 2399 struct inet_timewait_sock *tw; 2400 struct hlist_nulls_node *node; 2401 struct tcp_iter_state *st = seq->private; 2402 struct net *net = seq_file_net(seq); 2403 2404 ++st->num; 2405 ++st->offset; 2406 2407 if (st->state == TCP_SEQ_STATE_TIME_WAIT) { 2408 tw = cur; 2409 tw = tw_next(tw); 2410 get_tw: 2411 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) { 2412 tw = tw_next(tw); 2413 } 2414 if (tw) { 2415 cur = tw; 2416 goto out; 2417 } 2418 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2419 st->state = TCP_SEQ_STATE_ESTABLISHED; 2420 2421 /* Look for next non empty bucket */ 
2422 st->offset = 0; 2423 while (++st->bucket <= tcp_hashinfo.ehash_mask && 2424 empty_bucket(st)) 2425 ; 2426 if (st->bucket > tcp_hashinfo.ehash_mask) 2427 return NULL; 2428 2429 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2430 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain); 2431 } else 2432 sk = sk_nulls_next(sk); 2433 2434 sk_nulls_for_each_from(sk, node) { 2435 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) 2436 goto found; 2437 } 2438 2439 st->state = TCP_SEQ_STATE_TIME_WAIT; 2440 tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain); 2441 goto get_tw; 2442 found: 2443 cur = sk; 2444 out: 2445 return cur; 2446 } 2447 established_get_idx(struct seq_file * seq,loff_t pos)2448 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2449 { 2450 struct tcp_iter_state *st = seq->private; 2451 void *rc; 2452 2453 st->bucket = 0; 2454 rc = established_get_first(seq); 2455 2456 while (rc && pos) { 2457 rc = established_get_next(seq, rc); 2458 --pos; 2459 } 2460 return rc; 2461 } 2462 tcp_get_idx(struct seq_file * seq,loff_t pos)2463 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2464 { 2465 void *rc; 2466 struct tcp_iter_state *st = seq->private; 2467 2468 st->state = TCP_SEQ_STATE_LISTENING; 2469 rc = listening_get_idx(seq, &pos); 2470 2471 if (!rc) { 2472 st->state = TCP_SEQ_STATE_ESTABLISHED; 2473 rc = established_get_idx(seq, pos); 2474 } 2475 2476 return rc; 2477 } 2478 tcp_seek_last_pos(struct seq_file * seq)2479 static void *tcp_seek_last_pos(struct seq_file *seq) 2480 { 2481 struct tcp_iter_state *st = seq->private; 2482 int offset = st->offset; 2483 int orig_num = st->num; 2484 void *rc = NULL; 2485 2486 switch (st->state) { 2487 case TCP_SEQ_STATE_OPENREQ: 2488 case TCP_SEQ_STATE_LISTENING: 2489 if (st->bucket >= INET_LHTABLE_SIZE) 2490 break; 2491 st->state = TCP_SEQ_STATE_LISTENING; 2492 rc = listening_get_next(seq, NULL); 2493 while (offset-- && rc) 2494 rc = listening_get_next(seq, rc); 2495 
if (rc) 2496 break; 2497 st->bucket = 0; 2498 /* Fallthrough */ 2499 case TCP_SEQ_STATE_ESTABLISHED: 2500 case TCP_SEQ_STATE_TIME_WAIT: 2501 st->state = TCP_SEQ_STATE_ESTABLISHED; 2502 if (st->bucket > tcp_hashinfo.ehash_mask) 2503 break; 2504 rc = established_get_first(seq); 2505 while (offset-- && rc) 2506 rc = established_get_next(seq, rc); 2507 } 2508 2509 st->num = orig_num; 2510 2511 return rc; 2512 } 2513 tcp_seq_start(struct seq_file * seq,loff_t * pos)2514 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2515 { 2516 struct tcp_iter_state *st = seq->private; 2517 void *rc; 2518 2519 if (*pos && *pos == st->last_pos) { 2520 rc = tcp_seek_last_pos(seq); 2521 if (rc) 2522 goto out; 2523 } 2524 2525 st->state = TCP_SEQ_STATE_LISTENING; 2526 st->num = 0; 2527 st->bucket = 0; 2528 st->offset = 0; 2529 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2530 2531 out: 2532 st->last_pos = *pos; 2533 return rc; 2534 } 2535 tcp_seq_next(struct seq_file * seq,void * v,loff_t * pos)2536 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2537 { 2538 struct tcp_iter_state *st = seq->private; 2539 void *rc = NULL; 2540 2541 if (v == SEQ_START_TOKEN) { 2542 rc = tcp_get_idx(seq, 0); 2543 goto out; 2544 } 2545 2546 switch (st->state) { 2547 case TCP_SEQ_STATE_OPENREQ: 2548 case TCP_SEQ_STATE_LISTENING: 2549 rc = listening_get_next(seq, v); 2550 if (!rc) { 2551 st->state = TCP_SEQ_STATE_ESTABLISHED; 2552 st->bucket = 0; 2553 st->offset = 0; 2554 rc = established_get_first(seq); 2555 } 2556 break; 2557 case TCP_SEQ_STATE_ESTABLISHED: 2558 case TCP_SEQ_STATE_TIME_WAIT: 2559 rc = established_get_next(seq, v); 2560 break; 2561 } 2562 out: 2563 ++*pos; 2564 st->last_pos = *pos; 2565 return rc; 2566 } 2567 tcp_seq_stop(struct seq_file * seq,void * v)2568 static void tcp_seq_stop(struct seq_file *seq, void *v) 2569 { 2570 struct tcp_iter_state *st = seq->private; 2571 2572 switch (st->state) { 2573 case TCP_SEQ_STATE_OPENREQ: 2574 if (v) { 2575 
struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk); 2576 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 2577 } 2578 case TCP_SEQ_STATE_LISTENING: 2579 if (v != SEQ_START_TOKEN) 2580 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock); 2581 break; 2582 case TCP_SEQ_STATE_TIME_WAIT: 2583 case TCP_SEQ_STATE_ESTABLISHED: 2584 if (v) 2585 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2586 break; 2587 } 2588 } 2589 tcp_seq_open(struct inode * inode,struct file * file)2590 int tcp_seq_open(struct inode *inode, struct file *file) 2591 { 2592 struct tcp_seq_afinfo *afinfo = PDE_DATA(inode); 2593 struct tcp_iter_state *s; 2594 int err; 2595 2596 err = seq_open_net(inode, file, &afinfo->seq_ops, 2597 sizeof(struct tcp_iter_state)); 2598 if (err < 0) 2599 return err; 2600 2601 s = ((struct seq_file *)file->private_data)->private; 2602 s->family = afinfo->family; 2603 s->last_pos = 0; 2604 return 0; 2605 } 2606 EXPORT_SYMBOL(tcp_seq_open); 2607 tcp_proc_register(struct net * net,struct tcp_seq_afinfo * afinfo)2608 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo) 2609 { 2610 int rc = 0; 2611 struct proc_dir_entry *p; 2612 2613 afinfo->seq_ops.start = tcp_seq_start; 2614 afinfo->seq_ops.next = tcp_seq_next; 2615 afinfo->seq_ops.stop = tcp_seq_stop; 2616 2617 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, 2618 afinfo->seq_fops, afinfo); 2619 if (!p) 2620 rc = -ENOMEM; 2621 return rc; 2622 } 2623 EXPORT_SYMBOL(tcp_proc_register); 2624 tcp_proc_unregister(struct net * net,struct tcp_seq_afinfo * afinfo)2625 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) 2626 { 2627 remove_proc_entry(afinfo->name, net->proc_net); 2628 } 2629 EXPORT_SYMBOL(tcp_proc_unregister); 2630 get_openreq4(const struct sock * sk,const struct request_sock * req,struct seq_file * f,int i,kuid_t uid,int * len)2631 static void get_openreq4(const struct sock *sk, const struct request_sock *req, 2632 struct seq_file 
*f, int i, kuid_t uid, int *len) 2633 { 2634 const struct inet_request_sock *ireq = inet_rsk(req); 2635 long delta = req->expires - jiffies; 2636 2637 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2638 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n", 2639 i, 2640 ireq->loc_addr, 2641 ntohs(inet_sk(sk)->inet_sport), 2642 ireq->rmt_addr, 2643 ntohs(ireq->rmt_port), 2644 TCP_SYN_RECV, 2645 0, 0, /* could print option size, but that is af dependent. */ 2646 1, /* timers active (only the expire timer) */ 2647 jiffies_delta_to_clock_t(delta), 2648 req->num_timeout, 2649 from_kuid_munged(seq_user_ns(f), uid), 2650 0, /* non standard timer */ 2651 0, /* open_requests have no inode */ 2652 atomic_read(&sk->sk_refcnt), 2653 req, 2654 len); 2655 } 2656 get_tcp4_sock(struct sock * sk,struct seq_file * f,int i,int * len)2657 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) 2658 { 2659 int timer_active; 2660 unsigned long timer_expires; 2661 const struct tcp_sock *tp = tcp_sk(sk); 2662 const struct inet_connection_sock *icsk = inet_csk(sk); 2663 const struct inet_sock *inet = inet_sk(sk); 2664 struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq; 2665 __be32 dest = inet->inet_daddr; 2666 __be32 src = inet->inet_rcv_saddr; 2667 __u16 destp = ntohs(inet->inet_dport); 2668 __u16 srcp = ntohs(inet->inet_sport); 2669 int rx_queue; 2670 2671 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2672 icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 2673 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2674 timer_active = 1; 2675 timer_expires = icsk->icsk_timeout; 2676 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2677 timer_active = 4; 2678 timer_expires = icsk->icsk_timeout; 2679 } else if (timer_pending(&sk->sk_timer)) { 2680 timer_active = 2; 2681 timer_expires = sk->sk_timer.expires; 2682 } else { 2683 timer_active = 0; 2684 timer_expires = jiffies; 2685 } 2686 2687 if (sk->sk_state == TCP_LISTEN) 2688 rx_queue = sk->sk_ack_backlog; 
2689 else 2690 /* 2691 * because we dont lock socket, we might find a transient negative value 2692 */ 2693 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); 2694 2695 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2696 "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n", 2697 i, src, srcp, dest, destp, sk->sk_state, 2698 tp->write_seq - tp->snd_una, 2699 rx_queue, 2700 timer_active, 2701 jiffies_delta_to_clock_t(timer_expires - jiffies), 2702 icsk->icsk_retransmits, 2703 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2704 icsk->icsk_probes_out, 2705 sock_i_ino(sk), 2706 atomic_read(&sk->sk_refcnt), sk, 2707 jiffies_to_clock_t(icsk->icsk_rto), 2708 jiffies_to_clock_t(icsk->icsk_ack.ato), 2709 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, 2710 tp->snd_cwnd, 2711 sk->sk_state == TCP_LISTEN ? 2712 (fastopenq ? fastopenq->max_qlen : 0) : 2713 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh), 2714 len); 2715 } 2716 get_timewait4_sock(const struct inet_timewait_sock * tw,struct seq_file * f,int i,int * len)2717 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2718 struct seq_file *f, int i, int *len) 2719 { 2720 __be32 dest, src; 2721 __u16 destp, srcp; 2722 long delta = tw->tw_ttd - jiffies; 2723 2724 dest = tw->tw_daddr; 2725 src = tw->tw_rcv_saddr; 2726 destp = ntohs(tw->tw_dport); 2727 srcp = ntohs(tw->tw_sport); 2728 2729 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2730 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n", 2731 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2732 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2733 atomic_read(&tw->tw_refcnt), tw, len); 2734 } 2735 2736 #define TMPSZ 150 2737 tcp4_seq_show(struct seq_file * seq,void * v)2738 static int tcp4_seq_show(struct seq_file *seq, void *v) 2739 { 2740 struct tcp_iter_state *st; 2741 int len; 2742 2743 if (v == SEQ_START_TOKEN) { 2744 seq_printf(seq, "%-*s\n", TMPSZ - 1, 2745 " sl local_address rem_address st tx_queue " 2746 "rx_queue tr 
tm->when retrnsmt uid timeout " 2747 "inode"); 2748 goto out; 2749 } 2750 st = seq->private; 2751 2752 switch (st->state) { 2753 case TCP_SEQ_STATE_LISTENING: 2754 case TCP_SEQ_STATE_ESTABLISHED: 2755 get_tcp4_sock(v, seq, st->num, &len); 2756 break; 2757 case TCP_SEQ_STATE_OPENREQ: 2758 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len); 2759 break; 2760 case TCP_SEQ_STATE_TIME_WAIT: 2761 get_timewait4_sock(v, seq, st->num, &len); 2762 break; 2763 } 2764 seq_printf(seq, "%*s\n", TMPSZ - 1 - len, ""); 2765 out: 2766 return 0; 2767 } 2768 2769 static const struct file_operations tcp_afinfo_seq_fops = { 2770 .owner = THIS_MODULE, 2771 .open = tcp_seq_open, 2772 .read = seq_read, 2773 .llseek = seq_lseek, 2774 .release = seq_release_net 2775 }; 2776 2777 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 2778 .name = "tcp", 2779 .family = AF_INET, 2780 .seq_fops = &tcp_afinfo_seq_fops, 2781 .seq_ops = { 2782 .show = tcp4_seq_show, 2783 }, 2784 }; 2785 tcp4_proc_init_net(struct net * net)2786 static int __net_init tcp4_proc_init_net(struct net *net) 2787 { 2788 return tcp_proc_register(net, &tcp4_seq_afinfo); 2789 } 2790 tcp4_proc_exit_net(struct net * net)2791 static void __net_exit tcp4_proc_exit_net(struct net *net) 2792 { 2793 tcp_proc_unregister(net, &tcp4_seq_afinfo); 2794 } 2795 2796 static struct pernet_operations tcp4_net_ops = { 2797 .init = tcp4_proc_init_net, 2798 .exit = tcp4_proc_exit_net, 2799 }; 2800 tcp4_proc_init(void)2801 int __init tcp4_proc_init(void) 2802 { 2803 return register_pernet_subsys(&tcp4_net_ops); 2804 } 2805 tcp4_proc_exit(void)2806 void tcp4_proc_exit(void) 2807 { 2808 unregister_pernet_subsys(&tcp4_net_ops); 2809 } 2810 #endif /* CONFIG_PROC_FS */ 2811 tcp4_gro_receive(struct sk_buff ** head,struct sk_buff * skb)2812 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) 2813 { 2814 const struct iphdr *iph = skb_gro_network_header(skb); 2815 __wsum wsum; 2816 __sum16 sum; 2817 2818 switch 
(skb->ip_summed) { 2819 case CHECKSUM_COMPLETE: 2820 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr, 2821 skb->csum)) { 2822 skb->ip_summed = CHECKSUM_UNNECESSARY; 2823 break; 2824 } 2825 flush: 2826 NAPI_GRO_CB(skb)->flush = 1; 2827 return NULL; 2828 2829 case CHECKSUM_NONE: 2830 wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr, 2831 skb_gro_len(skb), IPPROTO_TCP, 0); 2832 sum = csum_fold(skb_checksum(skb, 2833 skb_gro_offset(skb), 2834 skb_gro_len(skb), 2835 wsum)); 2836 if (sum) 2837 goto flush; 2838 2839 skb->ip_summed = CHECKSUM_UNNECESSARY; 2840 break; 2841 } 2842 2843 return tcp_gro_receive(head, skb); 2844 } 2845 tcp4_gro_complete(struct sk_buff * skb)2846 int tcp4_gro_complete(struct sk_buff *skb) 2847 { 2848 const struct iphdr *iph = ip_hdr(skb); 2849 struct tcphdr *th = tcp_hdr(skb); 2850 2851 th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb), 2852 iph->saddr, iph->daddr, 0); 2853 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; 2854 2855 return tcp_gro_complete(skb); 2856 } 2857 2858 struct proto tcp_prot = { 2859 .name = "TCP", 2860 .owner = THIS_MODULE, 2861 .close = tcp_close, 2862 .connect = tcp_v4_connect, 2863 .disconnect = tcp_disconnect, 2864 .accept = inet_csk_accept, 2865 .ioctl = tcp_ioctl, 2866 .init = tcp_v4_init_sock, 2867 .destroy = tcp_v4_destroy_sock, 2868 .shutdown = tcp_shutdown, 2869 .setsockopt = tcp_setsockopt, 2870 .getsockopt = tcp_getsockopt, 2871 .recvmsg = tcp_recvmsg, 2872 .sendmsg = tcp_sendmsg, 2873 .sendpage = tcp_sendpage, 2874 .backlog_rcv = tcp_v4_do_rcv, 2875 .release_cb = tcp_release_cb, 2876 .mtu_reduced = tcp_v4_mtu_reduced, 2877 .hash = inet_hash, 2878 .unhash = inet_unhash, 2879 .get_port = inet_csk_get_port, 2880 .enter_memory_pressure = tcp_enter_memory_pressure, 2881 .sockets_allocated = &tcp_sockets_allocated, 2882 .orphan_count = &tcp_orphan_count, 2883 .memory_allocated = &tcp_memory_allocated, 2884 .memory_pressure = &tcp_memory_pressure, 2885 .sysctl_wmem = sysctl_tcp_wmem, 2886 
.sysctl_rmem = sysctl_tcp_rmem, 2887 .max_header = MAX_TCP_HEADER, 2888 .obj_size = sizeof(struct tcp_sock), 2889 .slab_flags = SLAB_DESTROY_BY_RCU, 2890 .twsk_prot = &tcp_timewait_sock_ops, 2891 .rsk_prot = &tcp_request_sock_ops, 2892 .h.hashinfo = &tcp_hashinfo, 2893 .no_autobind = true, 2894 #ifdef CONFIG_COMPAT 2895 .compat_setsockopt = compat_tcp_setsockopt, 2896 .compat_getsockopt = compat_tcp_getsockopt, 2897 #endif 2898 #ifdef CONFIG_MEMCG_KMEM 2899 .init_cgroup = tcp_init_cgroup, 2900 .destroy_cgroup = tcp_destroy_cgroup, 2901 .proto_cgroup = tcp_proto_cgroup, 2902 #endif 2903 .diag_destroy = tcp_abort, 2904 }; 2905 EXPORT_SYMBOL(tcp_prot); 2906 tcp_sk_exit(struct net * net)2907 static void __net_exit tcp_sk_exit(struct net *net) 2908 { 2909 int cpu; 2910 2911 for_each_possible_cpu(cpu) 2912 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); 2913 free_percpu(net->ipv4.tcp_sk); 2914 } 2915 tcp_sk_init(struct net * net)2916 static int __net_init tcp_sk_init(struct net *net) 2917 { 2918 int res, cpu; 2919 2920 net->ipv4.tcp_sk = alloc_percpu(struct sock *); 2921 if (!net->ipv4.tcp_sk) 2922 return -ENOMEM; 2923 2924 for_each_possible_cpu(cpu) { 2925 struct sock *sk; 2926 2927 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 2928 IPPROTO_TCP, net); 2929 if (res) 2930 goto fail; 2931 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; 2932 } 2933 net->ipv4.sysctl_tcp_ecn = 2; 2934 return 0; 2935 2936 fail: 2937 tcp_sk_exit(net); 2938 2939 return res; 2940 } 2941 tcp_sk_exit_batch(struct list_head * net_exit_list)2942 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 2943 { 2944 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET); 2945 } 2946 2947 static struct pernet_operations __net_initdata tcp_sk_ops = { 2948 .init = tcp_sk_init, 2949 .exit = tcp_sk_exit, 2950 .exit_batch = tcp_sk_exit_batch, 2951 }; 2952 tcp_v4_init(void)2953 void __init tcp_v4_init(void) 2954 { 2955 inet_hashinfo_init(&tcp_hashinfo); 2956 if 
(register_pernet_subsys(&tcp_sk_ops)) 2957 panic("Failed to create the TCP control socket.\n"); 2958 } 2959