1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19 /*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79
80 #include <crypto/hash.h>
81 #include <linux/scatterlist.h>
82
83 #include <trace/events/tcp.h>
84
85 #ifdef CONFIG_TCP_MD5SIG
86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
87 __be32 daddr, __be32 saddr, const struct tcphdr *th);
88 #endif
89
90 struct inet_hashinfo tcp_hashinfo;
91 EXPORT_SYMBOL(tcp_hashinfo);
92
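/* Initial sequence number for an incoming connection, derived from the
 * addresses and ports of the received segment via secure_tcp_seq().
 */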
93 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
94 {
95 return secure_tcp_seq(ip_hdr(skb)->daddr,
96 ip_hdr(skb)->saddr,
97 tcp_hdr(skb)->dest,
98 tcp_hdr(skb)->source);
99 }
100
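/* Per-flow timestamp offset, derived from the address pair via
 * secure_tcp_ts_off().
 */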
101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
102 {
103 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
104 }
105
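/* Decide whether a TIME-WAIT socket occupying the same 4-tuple may be reused
 * for a new outgoing connection (controlled by sysctl tcp_tw_reuse).
 * Returns 1 and takes a reference on the timewait socket if reuse is allowed.
 */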
106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
107 {
108 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
109 const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 struct tcp_sock *tp = tcp_sk(sk);
112
113 if (reuse == 2) {
114 /* Still does not detect *everything* that goes through
115 * lo, since we require a loopback src or dst address
116 * or direct binding to 'lo' interface.
117 */
118 bool loopback = false;
119 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
120 loopback = true;
121 #if IS_ENABLED(CONFIG_IPV6)
122 if (tw->tw_family == AF_INET6) {
123 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
124 (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
125 (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
126 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
128 (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
129 loopback = true;
130 } else
131 #endif
132 {
133 if (ipv4_is_loopback(tw->tw_daddr) ||
134 ipv4_is_loopback(tw->tw_rcv_saddr))
135 loopback = true;
136 }
137 if (!loopback)
138 reuse = 0;
139 }
140
141 /* With PAWS, it is safe from the viewpoint
142 of data integrity. Even without PAWS it is safe provided sequence
143 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
144
145 Actually, the idea is close to VJ's one, only timestamp cache is
146 held not per host, but per port pair and TW bucket is used as state
147 holder.
148
149 If TW bucket has already been destroyed we fall back to VJ's scheme
150 and use initial timestamp retrieved from peer table.
151 */
152 if (tcptw->tw_ts_recent_stamp &&
153 (!twp || (reuse && time_after32(ktime_get_seconds(),
154 tcptw->tw_ts_recent_stamp)))) {
155 /* In case of repair and re-using TIME-WAIT sockets we still
156 * want to be sure that it is safe as above but honor the
157 * sequence numbers and time stamps set as part of the repair
158 * process.
159 *
160 * Without this check re-using a TIME-WAIT socket with TCP
161 * repair would accumulate a -1 on the repair assigned
162 * sequence number. The first time it is reused the sequence
163 * is -1, the second time -2, etc. This fixes that issue
164 * without appearing to create any others.
165 */
166 if (likely(!tp->repair)) {
167 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
168
169 if (!seq)
170 seq = 1;
171 WRITE_ONCE(tp->write_seq, seq);
172 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
173 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
174 }
175 sock_hold(sktw);
176 return 1;
177 }
178
179 return 0;
180 }
181 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
182
183 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
184 int addr_len)
185 {
186 /* This check is replicated from tcp_v4_connect() and intended to
187 * prevent BPF program called below from accessing bytes that are out
188 * of the bound specified by user in addr_len.
189 */
190 if (addr_len < sizeof(struct sockaddr_in))
191 return -EINVAL;
192
193 sock_owned_by_me(sk);
194
195 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
196 }
197
198 /* This will initiate an outgoing connection. */
199 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
200 {
201 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
202 struct inet_sock *inet = inet_sk(sk);
203 struct tcp_sock *tp = tcp_sk(sk);
204 __be16 orig_sport, orig_dport;
205 __be32 daddr, nexthop;
206 struct flowi4 *fl4;
207 struct rtable *rt;
208 int err;
209 struct ip_options_rcu *inet_opt;
210 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
211
212 if (addr_len < sizeof(struct sockaddr_in))
213 return -EINVAL;
214
215 if (usin->sin_family != AF_INET)
216 return -EAFNOSUPPORT;
217
218 nexthop = daddr = usin->sin_addr.s_addr;
219 inet_opt = rcu_dereference_protected(inet->inet_opt,
220 lockdep_sock_is_held(sk));
221 if (inet_opt && inet_opt->opt.srr) {
222 if (!daddr)
223 return -EINVAL;
224 nexthop = inet_opt->opt.faddr;
225 }
226
227 orig_sport = inet->inet_sport;
228 orig_dport = usin->sin_port;
229 fl4 = &inet->cork.fl.u.ip4;
230 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
231 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
232 IPPROTO_TCP,
233 orig_sport, orig_dport, sk);
234 if (IS_ERR(rt)) {
235 err = PTR_ERR(rt);
236 if (err == -ENETUNREACH)
237 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
238 return err;
239 }
240
241 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
242 ip_rt_put(rt);
243 return -ENETUNREACH;
244 }
245
246 if (!inet_opt || !inet_opt->opt.srr)
247 daddr = fl4->daddr;
248
249 if (!inet->inet_saddr)
250 inet->inet_saddr = fl4->saddr;
251 sk_rcv_saddr_set(sk, inet->inet_saddr);
252
253 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
254 /* Reset inherited state */
255 tp->rx_opt.ts_recent = 0;
256 tp->rx_opt.ts_recent_stamp = 0;
257 if (likely(!tp->repair))
258 WRITE_ONCE(tp->write_seq, 0);
259 }
260
261 inet->inet_dport = usin->sin_port;
262 sk_daddr_set(sk, daddr);
263
264 inet_csk(sk)->icsk_ext_hdr_len = 0;
265 if (inet_opt)
266 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
267
268 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
269
270 /* Socket identity is still unknown (sport may be zero).
271 * However we set state to SYN-SENT and, without releasing the socket
272 * lock, select a source port, enter ourselves into the hash tables and
273 * complete initialization after this.
274 */
275 tcp_set_state(sk, TCP_SYN_SENT);
276 err = inet_hash_connect(tcp_death_row, sk);
277 if (err)
278 goto failure;
279
280 sk_set_txhash(sk);
281
282 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
283 inet->inet_sport, inet->inet_dport, sk);
284 if (IS_ERR(rt)) {
285 err = PTR_ERR(rt);
286 rt = NULL;
287 goto failure;
288 }
289 /* OK, now commit destination to socket. */
290 sk->sk_gso_type = SKB_GSO_TCPV4;
291 sk_setup_caps(sk, &rt->dst);
292 rt = NULL;
293
294 if (likely(!tp->repair)) {
295 if (!tp->write_seq)
296 WRITE_ONCE(tp->write_seq,
297 secure_tcp_seq(inet->inet_saddr,
298 inet->inet_daddr,
299 inet->inet_sport,
300 usin->sin_port));
301 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
302 inet->inet_saddr,
303 inet->inet_daddr);
304 }
305
306 inet->inet_id = prandom_u32();
307
308 if (tcp_fastopen_defer_connect(sk, &err))
309 return err;
310 if (err)
311 goto failure;
312
313 err = tcp_connect(sk);
314
315 if (err)
316 goto failure;
317
318 return 0;
319
320 failure:
321 /*
322 * This unhashes the socket and releases the local port,
323 * if necessary.
324 */
325 tcp_set_state(sk, TCP_CLOSE);
326 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
327 inet_reset_saddr(sk);
328 ip_rt_put(rt);
329 sk->sk_route_caps = 0;
330 inet->inet_dport = 0;
331 return err;
332 }
333 EXPORT_SYMBOL(tcp_v4_connect);
334
335 /*
336 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
337 * It can be called through tcp_release_cb() if socket was owned by user
338 * at the time tcp_v4_err() was called to handle ICMP message.
339 */
340 void tcp_v4_mtu_reduced(struct sock *sk)
341 {
342 struct inet_sock *inet = inet_sk(sk);
343 struct dst_entry *dst;
344 u32 mtu;
345
346 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
347 return;
348 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
349 dst = inet_csk_update_pmtu(sk, mtu);
350 if (!dst)
351 return;
352
353 /* Something is about to go wrong... Remember the soft error
354 * in case this connection is not able to recover.
355 */
356 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
357 sk->sk_err_soft = EMSGSIZE;
358
359 mtu = dst_mtu(dst);
360
361 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
362 ip_sk_accept_pmtu(sk) &&
363 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
364 tcp_sync_mss(sk, mtu);
365
366 /* Resend the TCP packet because it's
367 * clear that the old packet has been
368 * dropped. This is the new "fast" path mtu
369 * discovery.
370 */
371 tcp_simple_retransmit(sk);
372 } /* else let the usual retransmit timer handle it */
373 }
374 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
375
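/* Propagate an ICMP redirect to the socket's cached route, if it is still valid. */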
376 static void do_redirect(struct sk_buff *skb, struct sock *sk)
377 {
378 struct dst_entry *dst = __sk_dst_check(sk, 0);
379
380 if (dst)
381 dst->ops->redirect(dst, sk, skb);
382 }
383
384
385 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
386 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
387 {
388 struct request_sock *req = inet_reqsk(sk);
389 struct net *net = sock_net(sk);
390
391 /* ICMPs are not backlogged, hence we cannot get
392 * an established socket here.
393 */
394 if (seq != tcp_rsk(req)->snt_isn) {
395 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
396 } else if (abort) {
397 /*
398 * Still in SYN_RECV, just remove it silently.
399 * There is no good way to pass the error to the newly
400 * created socket, and POSIX does not want network
401 * errors returned from accept().
402 */
403 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
404 tcp_listendrop(req->rsk_listener);
405 }
406 reqsk_put(req);
407 }
408 EXPORT_SYMBOL(tcp_req_err);
409
410 /*
411 * This routine is called by the ICMP module when it gets some
412 * sort of error condition. If err < 0 then the socket should
413 * be closed and the error returned to the user. If err > 0
414 * it's just the icmp type << 8 | icmp code. After adjustment
415 * header points to the first 8 bytes of the tcp header. We need
416 * to find the appropriate port.
417 *
418 * The locking strategy used here is very "optimistic". When
419 * someone else accesses the socket the ICMP is just dropped
420 * and for some paths there is no check at all.
421 * A more general error queue to queue errors for later handling
422 * is probably better.
423 *
424 */
425
426 int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
427 {
428 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
429 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
430 struct inet_connection_sock *icsk;
431 struct tcp_sock *tp;
432 struct inet_sock *inet;
433 const int type = icmp_hdr(icmp_skb)->type;
434 const int code = icmp_hdr(icmp_skb)->code;
435 struct sock *sk;
436 struct sk_buff *skb;
437 struct request_sock *fastopen;
438 u32 seq, snd_una;
439 s32 remaining;
440 u32 delta_us;
441 int err;
442 struct net *net = dev_net(icmp_skb->dev);
443
444 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
445 th->dest, iph->saddr, ntohs(th->source),
446 inet_iif(icmp_skb), 0);
447 if (!sk) {
448 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
449 return -ENOENT;
450 }
451 if (sk->sk_state == TCP_TIME_WAIT) {
452 inet_twsk_put(inet_twsk(sk));
453 return 0;
454 }
455 seq = ntohl(th->seq);
456 if (sk->sk_state == TCP_NEW_SYN_RECV) {
457 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
458 type == ICMP_TIME_EXCEEDED ||
459 (type == ICMP_DEST_UNREACH &&
460 (code == ICMP_NET_UNREACH ||
461 code == ICMP_HOST_UNREACH)));
462 return 0;
463 }
464
465 bh_lock_sock(sk);
466 /* If too many ICMPs get dropped on busy
467 * servers this needs to be solved differently.
468 * We do take care of the PMTU discovery (RFC1191) special case:
469 * we can receive locally generated ICMP messages while the socket is held.
470 */
471 if (sock_owned_by_user(sk)) {
472 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
473 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
474 }
475 if (sk->sk_state == TCP_CLOSE)
476 goto out;
477
478 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
479 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
480 goto out;
481 }
482
483 icsk = inet_csk(sk);
484 tp = tcp_sk(sk);
485 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
486 fastopen = rcu_dereference(tp->fastopen_rsk);
487 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
488 if (sk->sk_state != TCP_LISTEN &&
489 !between(seq, snd_una, tp->snd_nxt)) {
490 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
491 goto out;
492 }
493
494 switch (type) {
495 case ICMP_REDIRECT:
496 if (!sock_owned_by_user(sk))
497 do_redirect(icmp_skb, sk);
498 goto out;
499 case ICMP_SOURCE_QUENCH:
500 /* Just silently ignore these. */
501 goto out;
502 case ICMP_PARAMETERPROB:
503 err = EPROTO;
504 break;
505 case ICMP_DEST_UNREACH:
506 if (code > NR_ICMP_UNREACH)
507 goto out;
508
509 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
510 /* We are not interested in TCP_LISTEN and open_requests
511 * (SYN-ACKs sent out by Linux are always <576 bytes so
512 * they should go through unfragmented).
513 */
514 if (sk->sk_state == TCP_LISTEN)
515 goto out;
516
517 WRITE_ONCE(tp->mtu_info, info);
518 if (!sock_owned_by_user(sk)) {
519 tcp_v4_mtu_reduced(sk);
520 } else {
521 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
522 sock_hold(sk);
523 }
524 goto out;
525 }
526
527 err = icmp_err_convert[code].errno;
528 /* check if icmp_skb allows revert of backoff
529 * (see draft-zimmermann-tcp-lcd) */
530 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
531 break;
532 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
533 !icsk->icsk_backoff || fastopen)
534 break;
535
536 if (sock_owned_by_user(sk))
537 break;
538
539 skb = tcp_rtx_queue_head(sk);
540 if (WARN_ON_ONCE(!skb))
541 break;
542
543 icsk->icsk_backoff--;
544 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
545 TCP_TIMEOUT_INIT;
546 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
547
548
549 tcp_mstamp_refresh(tp);
550 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
551 remaining = icsk->icsk_rto -
552 usecs_to_jiffies(delta_us);
553
554 if (remaining > 0) {
555 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
556 remaining, TCP_RTO_MAX);
557 } else {
558 /* RTO revert clocked out retransmission.
559 * Will retransmit now */
560 tcp_retransmit_timer(sk);
561 }
562
563 break;
564 case ICMP_TIME_EXCEEDED:
565 err = EHOSTUNREACH;
566 break;
567 default:
568 goto out;
569 }
570
571 switch (sk->sk_state) {
572 case TCP_SYN_SENT:
573 case TCP_SYN_RECV:
574 /* Only in fast or simultaneous open. If a fast open socket is
575 * already accepted it is treated as a connected one below.
576 */
577 if (fastopen && !fastopen->sk)
578 break;
579
580 if (!sock_owned_by_user(sk)) {
581 sk->sk_err = err;
582
583 sk->sk_error_report(sk);
584
585 tcp_done(sk);
586 } else {
587 sk->sk_err_soft = err;
588 }
589 goto out;
590 }
591
592 /* If we've already connected we will keep trying
593 * until we time out, or the user gives up.
594 *
595 * rfc1122 4.2.3.9 allows considering as hard errors
596 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
597 * but it is obsoleted by pmtu discovery).
598 *
599 * Note that in the modern internet, where routing is unreliable
600 * and broken firewalls sit in every dark corner, sending random
601 * errors ordered by their masters, even these two messages finally lose
602 * their original sense (even Linux sends invalid PORT_UNREACHs)
603 *
604 * Now we are in compliance with RFCs.
605 * --ANK (980905)
606 */
607
608 inet = inet_sk(sk);
609 if (!sock_owned_by_user(sk) && inet->recverr) {
610 sk->sk_err = err;
611 sk->sk_error_report(sk);
612 } else { /* Only an error on timeout */
613 sk->sk_err_soft = err;
614 }
615
616 out:
617 bh_unlock_sock(sk);
618 sock_put(sk);
619 return 0;
620 }
621
622 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
623 {
624 struct tcphdr *th = tcp_hdr(skb);
625
626 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
627 skb->csum_start = skb_transport_header(skb) - skb->head;
628 skb->csum_offset = offsetof(struct tcphdr, check);
629 }
630
631 /* This routine computes an IPv4 TCP checksum. */
632 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
633 {
634 const struct inet_sock *inet = inet_sk(sk);
635
636 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
637 }
638 EXPORT_SYMBOL(tcp_v4_send_check);
639
640 /*
641 * This routine will send an RST to the other tcp.
642 *
643 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
644 * for the reset?
645 * Answer: if a packet caused an RST, it is not for a socket
646 * existing in our system; if it is matched to a socket,
647 * it is just a duplicate segment or a bug in the other side's TCP.
648 * So we build the reply based only on the parameters
649 * that arrived with the segment.
650 * Exception: precedence violation. We do not implement it in any case.
651 */
652
653 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
654 {
655 const struct tcphdr *th = tcp_hdr(skb);
656 struct {
657 struct tcphdr th;
658 #ifdef CONFIG_TCP_MD5SIG
659 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
660 #endif
661 } rep;
662 struct ip_reply_arg arg;
663 #ifdef CONFIG_TCP_MD5SIG
664 struct tcp_md5sig_key *key = NULL;
665 const __u8 *hash_location = NULL;
666 unsigned char newhash[16];
667 int genhash;
668 struct sock *sk1 = NULL;
669 #endif
670 u64 transmit_time = 0;
671 struct sock *ctl_sk;
672 struct net *net;
673
674 /* Never send a reset in response to a reset. */
675 if (th->rst)
676 return;
677
678 /* If sk is not NULL, it means we did a successful lookup and the incoming
679 * route had to be correct. prequeue might have dropped our dst.
680 */
681 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
682 return;
683
684 /* Swap the send and the receive. */
685 memset(&rep, 0, sizeof(rep));
686 rep.th.dest = th->source;
687 rep.th.source = th->dest;
688 rep.th.doff = sizeof(struct tcphdr) / 4;
689 rep.th.rst = 1;
690
691 if (th->ack) {
692 rep.th.seq = th->ack_seq;
693 } else {
694 rep.th.ack = 1;
695 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
696 skb->len - (th->doff << 2));
697 }
698
699 memset(&arg, 0, sizeof(arg));
700 arg.iov[0].iov_base = (unsigned char *)&rep;
701 arg.iov[0].iov_len = sizeof(rep.th);
702
703 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
704 #ifdef CONFIG_TCP_MD5SIG
705 rcu_read_lock();
706 hash_location = tcp_parse_md5sig_option(th);
707 if (sk && sk_fullsock(sk)) {
708 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
709 &ip_hdr(skb)->saddr, AF_INET);
710 } else if (hash_location) {
711 /*
712 * The active side is lost. Try to find the listening socket through
713 * the source port, and then find the md5 key through the listening socket.
714 * We do not loosen security here:
715 * the incoming packet is checked against the md5 hash of the found key;
716 * no RST is generated if the md5 hash doesn't match.
717 */
718 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
719 ip_hdr(skb)->saddr,
720 th->source, ip_hdr(skb)->daddr,
721 ntohs(th->source), inet_iif(skb),
722 tcp_v4_sdif(skb));
723 /* don't send rst if it can't find key */
724 if (!sk1)
725 goto out;
726
727 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
728 &ip_hdr(skb)->saddr, AF_INET);
729 if (!key)
730 goto out;
731
732
733 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
734 if (genhash || memcmp(hash_location, newhash, 16) != 0)
735 goto out;
736
737 }
738
739 if (key) {
740 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
741 (TCPOPT_NOP << 16) |
742 (TCPOPT_MD5SIG << 8) |
743 TCPOLEN_MD5SIG);
744 /* Update length and the length the header thinks exists */
745 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
746 rep.th.doff = arg.iov[0].iov_len / 4;
747
748 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
749 key, ip_hdr(skb)->saddr,
750 ip_hdr(skb)->daddr, &rep.th);
751 }
752 #endif
753 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
754 ip_hdr(skb)->saddr, /* XXX */
755 arg.iov[0].iov_len, IPPROTO_TCP, 0);
756 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
757 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
758
759 /* When the socket is gone, all binding information is lost.
760 * Routing might fail in this case. No choice here: if we choose to force
761 * the input interface, we will misroute in case of an asymmetric route.
762 */
763 if (sk) {
764 arg.bound_dev_if = sk->sk_bound_dev_if;
765 if (sk_fullsock(sk))
766 trace_tcp_send_reset(sk, skb);
767 }
768
769 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
770 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
771
772 arg.tos = ip_hdr(skb)->tos;
773 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
774 local_bh_disable();
775 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
776 if (sk) {
777 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
778 inet_twsk(sk)->tw_mark : sk->sk_mark;
779 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
780 inet_twsk(sk)->tw_priority : sk->sk_priority;
781 transmit_time = tcp_transmit_time(sk);
782 }
783 ip_send_unicast_reply(ctl_sk,
784 skb, &TCP_SKB_CB(skb)->header.h4.opt,
785 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
786 &arg, arg.iov[0].iov_len,
787 transmit_time);
788
789 ctl_sk->sk_mark = 0;
790 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
791 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
792 local_bh_enable();
793
794 #ifdef CONFIG_TCP_MD5SIG
795 out:
796 rcu_read_unlock();
797 #endif
798 }
799
800 /* The code below, sending ACKs in SYN-RECV and TIME-WAIT states
801 outside socket context, is ugly, certainly. What can I do?
802 */
803
804 static void tcp_v4_send_ack(const struct sock *sk,
805 struct sk_buff *skb, u32 seq, u32 ack,
806 u32 win, u32 tsval, u32 tsecr, int oif,
807 struct tcp_md5sig_key *key,
808 int reply_flags, u8 tos)
809 {
810 const struct tcphdr *th = tcp_hdr(skb);
811 struct {
812 struct tcphdr th;
813 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
814 #ifdef CONFIG_TCP_MD5SIG
815 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
816 #endif
817 ];
818 } rep;
819 struct net *net = sock_net(sk);
820 struct ip_reply_arg arg;
821 struct sock *ctl_sk;
822 u64 transmit_time;
823
824 memset(&rep.th, 0, sizeof(struct tcphdr));
825 memset(&arg, 0, sizeof(arg));
826
827 arg.iov[0].iov_base = (unsigned char *)&rep;
828 arg.iov[0].iov_len = sizeof(rep.th);
829 if (tsecr) {
830 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
831 (TCPOPT_TIMESTAMP << 8) |
832 TCPOLEN_TIMESTAMP);
833 rep.opt[1] = htonl(tsval);
834 rep.opt[2] = htonl(tsecr);
835 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
836 }
837
838 /* Swap the send and the receive. */
839 rep.th.dest = th->source;
840 rep.th.source = th->dest;
841 rep.th.doff = arg.iov[0].iov_len / 4;
842 rep.th.seq = htonl(seq);
843 rep.th.ack_seq = htonl(ack);
844 rep.th.ack = 1;
845 rep.th.window = htons(win);
846
847 #ifdef CONFIG_TCP_MD5SIG
848 if (key) {
849 int offset = (tsecr) ? 3 : 0;
850
851 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
852 (TCPOPT_NOP << 16) |
853 (TCPOPT_MD5SIG << 8) |
854 TCPOLEN_MD5SIG);
855 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
856 rep.th.doff = arg.iov[0].iov_len/4;
857
858 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
859 key, ip_hdr(skb)->saddr,
860 ip_hdr(skb)->daddr, &rep.th);
861 }
862 #endif
863 arg.flags = reply_flags;
864 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
865 ip_hdr(skb)->saddr, /* XXX */
866 arg.iov[0].iov_len, IPPROTO_TCP, 0);
867 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
868 if (oif)
869 arg.bound_dev_if = oif;
870 arg.tos = tos;
871 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
872 local_bh_disable();
873 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
874 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
875 inet_twsk(sk)->tw_mark : sk->sk_mark;
876 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
877 inet_twsk(sk)->tw_priority : sk->sk_priority;
878 transmit_time = tcp_transmit_time(sk);
879 ip_send_unicast_reply(ctl_sk,
880 skb, &TCP_SKB_CB(skb)->header.h4.opt,
881 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
882 &arg, arg.iov[0].iov_len,
883 transmit_time);
884
885 ctl_sk->sk_mark = 0;
886 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
887 local_bh_enable();
888 }
889
890 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
891 {
892 struct inet_timewait_sock *tw = inet_twsk(sk);
893 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
894
895 tcp_v4_send_ack(sk, skb,
896 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
897 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
898 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
899 tcptw->tw_ts_recent,
900 tw->tw_bound_dev_if,
901 tcp_twsk_md5_key(tcptw),
902 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
903 tw->tw_tos
904 );
905
906 inet_twsk_put(tw);
907 }
908
909 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
910 struct request_sock *req)
911 {
912 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
913 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
914 */
915 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
916 tcp_sk(sk)->snd_nxt;
917
918 /* RFC 7323 2.3
919 * The window field (SEG.WND) of every outgoing segment, with the
920 * exception of <SYN> segments, MUST be right-shifted by
921 * Rcv.Wind.Shift bits:
922 */
923 tcp_v4_send_ack(sk, skb, seq,
924 tcp_rsk(req)->rcv_nxt,
925 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
926 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
927 req->ts_recent,
928 0,
929 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
930 AF_INET),
931 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
932 ip_hdr(skb)->tos);
933 }
934
935 /*
936 * Send a SYN-ACK after having received a SYN.
937 * This still operates on a request_sock only, not on a big
938 * socket.
939 */
940 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
941 struct flowi *fl,
942 struct request_sock *req,
943 struct tcp_fastopen_cookie *foc,
944 enum tcp_synack_type synack_type)
945 {
946 const struct inet_request_sock *ireq = inet_rsk(req);
947 struct flowi4 fl4;
948 int err = -1;
949 struct sk_buff *skb;
950
951 /* First, grab a route. */
952 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
953 return -1;
954
955 skb = tcp_make_synack(sk, dst, req, foc, synack_type);
956
957 if (skb) {
958 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
959
960 rcu_read_lock();
961 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
962 ireq->ir_rmt_addr,
963 rcu_dereference(ireq->ireq_opt));
964 rcu_read_unlock();
965 err = net_xmit_eval(err);
966 }
967
968 return err;
969 }
970
971 /*
972 * IPv4 request_sock destructor.
973 */
974 static void tcp_v4_reqsk_destructor(struct request_sock *req)
975 {
976 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
977 }
978
979 #ifdef CONFIG_TCP_MD5SIG
980 /*
981 * RFC2385 MD5 checksumming requires a mapping of
982 * IP address->MD5 Key.
983 * We need to maintain these in the sk structure.
984 */
985
986 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
987 EXPORT_SYMBOL(tcp_md5_needed);
988
989 /* Find the Key structure for an address. */
990 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
991 const union tcp_md5_addr *addr,
992 int family)
993 {
994 const struct tcp_sock *tp = tcp_sk(sk);
995 struct tcp_md5sig_key *key;
996 const struct tcp_md5sig_info *md5sig;
997 __be32 mask;
998 struct tcp_md5sig_key *best_match = NULL;
999 bool match;
1000
1001 /* caller either holds rcu_read_lock() or socket lock */
1002 md5sig = rcu_dereference_check(tp->md5sig_info,
1003 lockdep_sock_is_held(sk));
1004 if (!md5sig)
1005 return NULL;
1006
1007 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1008 if (key->family != family)
1009 continue;
1010
1011 if (family == AF_INET) {
1012 mask = inet_make_mask(key->prefixlen);
1013 match = (key->addr.a4.s_addr & mask) ==
1014 (addr->a4.s_addr & mask);
1015 #if IS_ENABLED(CONFIG_IPV6)
1016 } else if (family == AF_INET6) {
1017 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1018 key->prefixlen);
1019 #endif
1020 } else {
1021 match = false;
1022 }
1023
1024 if (match && (!best_match ||
1025 key->prefixlen > best_match->prefixlen))
1026 best_match = key;
1027 }
1028 return best_match;
1029 }
1030 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1031
1032 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1033 const union tcp_md5_addr *addr,
1034 int family, u8 prefixlen)
1035 {
1036 const struct tcp_sock *tp = tcp_sk(sk);
1037 struct tcp_md5sig_key *key;
1038 unsigned int size = sizeof(struct in_addr);
1039 const struct tcp_md5sig_info *md5sig;
1040
1041 /* caller either holds rcu_read_lock() or socket lock */
1042 md5sig = rcu_dereference_check(tp->md5sig_info,
1043 lockdep_sock_is_held(sk));
1044 if (!md5sig)
1045 return NULL;
1046 #if IS_ENABLED(CONFIG_IPV6)
1047 if (family == AF_INET6)
1048 size = sizeof(struct in6_addr);
1049 #endif
1050 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1051 if (key->family != family)
1052 continue;
1053 if (!memcmp(&key->addr, addr, size) &&
1054 key->prefixlen == prefixlen)
1055 return key;
1056 }
1057 return NULL;
1058 }
1059
1060 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1061 const struct sock *addr_sk)
1062 {
1063 const union tcp_md5_addr *addr;
1064
1065 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1066 return tcp_md5_do_lookup(sk, addr, AF_INET);
1067 }
1068 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1069
1070 /* This can be called on a newly created socket, from other files */
1071 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1072 int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1073 gfp_t gfp)
1074 {
1075 /* Add Key to the list */
1076 struct tcp_md5sig_key *key;
1077 struct tcp_sock *tp = tcp_sk(sk);
1078 struct tcp_md5sig_info *md5sig;
1079
1080 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1081 if (key) {
1082 /* Pre-existing entry - just update that one.
1083 * Note that the key might be used concurrently.
1084 */
1085 memcpy(key->key, newkey, newkeylen);
1086
1087 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1088 * Also note that a reader could catch new key->keylen value
1089 * but old key->key[], this is the reason we use __GFP_ZERO
1090 * at sock_kmalloc() time below these lines.
1091 */
1092 WRITE_ONCE(key->keylen, newkeylen);
1093
1094 return 0;
1095 }
1096
1097 md5sig = rcu_dereference_protected(tp->md5sig_info,
1098 lockdep_sock_is_held(sk));
1099 if (!md5sig) {
1100 md5sig = kmalloc(sizeof(*md5sig), gfp);
1101 if (!md5sig)
1102 return -ENOMEM;
1103
1104 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1105 INIT_HLIST_HEAD(&md5sig->head);
1106 rcu_assign_pointer(tp->md5sig_info, md5sig);
1107 }
1108
1109 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1110 if (!key)
1111 return -ENOMEM;
1112 if (!tcp_alloc_md5sig_pool()) {
1113 sock_kfree_s(sk, key, sizeof(*key));
1114 return -ENOMEM;
1115 }
1116
1117 memcpy(key->key, newkey, newkeylen);
1118 key->keylen = newkeylen;
1119 key->family = family;
1120 key->prefixlen = prefixlen;
1121 memcpy(&key->addr, addr,
1122 (family == AF_INET6) ? sizeof(struct in6_addr) :
1123 sizeof(struct in_addr));
1124 hlist_add_head_rcu(&key->node, &md5sig->head);
1125 return 0;
1126 }
1127 EXPORT_SYMBOL(tcp_md5_do_add);
1128
1129 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1130 u8 prefixlen)
1131 {
1132 struct tcp_md5sig_key *key;
1133
1134 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1135 if (!key)
1136 return -ENOENT;
1137 hlist_del_rcu(&key->node);
1138 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1139 kfree_rcu(key, rcu);
1140 return 0;
1141 }
1142 EXPORT_SYMBOL(tcp_md5_do_del);
1143
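/* Remove and free every MD5 key configured on this socket. */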
1144 static void tcp_clear_md5_list(struct sock *sk)
1145 {
1146 struct tcp_sock *tp = tcp_sk(sk);
1147 struct tcp_md5sig_key *key;
1148 struct hlist_node *n;
1149 struct tcp_md5sig_info *md5sig;
1150
1151 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1152
1153 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1154 hlist_del_rcu(&key->node);
1155 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1156 kfree_rcu(key, rcu);
1157 }
1158 }
1159
1160 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1161 char __user *optval, int optlen)
1162 {
1163 struct tcp_md5sig cmd;
1164 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1165 u8 prefixlen = 32;
1166
1167 if (optlen < sizeof(cmd))
1168 return -EINVAL;
1169
1170 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1171 return -EFAULT;
1172
1173 if (sin->sin_family != AF_INET)
1174 return -EINVAL;
1175
1176 if (optname == TCP_MD5SIG_EXT &&
1177 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1178 prefixlen = cmd.tcpm_prefixlen;
1179 if (prefixlen > 32)
1180 return -EINVAL;
1181 }
1182
1183 if (!cmd.tcpm_keylen)
1184 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1185 AF_INET, prefixlen);
1186
1187 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1188 return -EINVAL;
1189
1190 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1191 AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1192 GFP_KERNEL);
1193 }
1194
1195 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1196 __be32 daddr, __be32 saddr,
1197 const struct tcphdr *th, int nbytes)
1198 {
1199 struct tcp4_pseudohdr *bp;
1200 struct scatterlist sg;
1201 struct tcphdr *_th;
1202
1203 bp = hp->scratch;
1204 bp->saddr = saddr;
1205 bp->daddr = daddr;
1206 bp->pad = 0;
1207 bp->protocol = IPPROTO_TCP;
1208 bp->len = cpu_to_be16(nbytes);
1209
1210 _th = (struct tcphdr *)(bp + 1);
1211 memcpy(_th, th, sizeof(*th));
1212 _th->check = 0;
1213
1214 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1215 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1216 sizeof(*bp) + sizeof(*th));
1217 return crypto_ahash_update(hp->md5_req);
1218 }
1219
1220 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1221 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1222 {
1223 struct tcp_md5sig_pool *hp;
1224 struct ahash_request *req;
1225
1226 hp = tcp_get_md5sig_pool();
1227 if (!hp)
1228 goto clear_hash_noput;
1229 req = hp->md5_req;
1230
1231 if (crypto_ahash_init(req))
1232 goto clear_hash;
1233 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1234 goto clear_hash;
1235 if (tcp_md5_hash_key(hp, key))
1236 goto clear_hash;
1237 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1238 if (crypto_ahash_final(req))
1239 goto clear_hash;
1240
1241 tcp_put_md5sig_pool();
1242 return 0;
1243
1244 clear_hash:
1245 tcp_put_md5sig_pool();
1246 clear_hash_noput:
1247 memset(md5_hash, 0, 16);
1248 return 1;
1249 }
1250
1251 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1252 const struct sock *sk,
1253 const struct sk_buff *skb)
1254 {
1255 struct tcp_md5sig_pool *hp;
1256 struct ahash_request *req;
1257 const struct tcphdr *th = tcp_hdr(skb);
1258 __be32 saddr, daddr;
1259
1260 if (sk) { /* valid for establish/request sockets */
1261 saddr = sk->sk_rcv_saddr;
1262 daddr = sk->sk_daddr;
1263 } else {
1264 const struct iphdr *iph = ip_hdr(skb);
1265 saddr = iph->saddr;
1266 daddr = iph->daddr;
1267 }
1268
1269 hp = tcp_get_md5sig_pool();
1270 if (!hp)
1271 goto clear_hash_noput;
1272 req = hp->md5_req;
1273
1274 if (crypto_ahash_init(req))
1275 goto clear_hash;
1276
1277 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1278 goto clear_hash;
1279 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1280 goto clear_hash;
1281 if (tcp_md5_hash_key(hp, key))
1282 goto clear_hash;
1283 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1284 if (crypto_ahash_final(req))
1285 goto clear_hash;
1286
1287 tcp_put_md5sig_pool();
1288 return 0;
1289
1290 clear_hash:
1291 tcp_put_md5sig_pool();
1292 clear_hash_noput:
1293 memset(md5_hash, 0, 16);
1294 return 1;
1295 }
1296 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1297
1298 #endif
1299
1300 /* Called with rcu_read_lock() */
1301 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1302 const struct sk_buff *skb)
1303 {
1304 #ifdef CONFIG_TCP_MD5SIG
1305 /*
1306 * This gets called for each TCP segment that arrives
1307 * so we want to be efficient.
1308 * We have 3 drop cases:
1309 * o No MD5 hash and one expected.
1310 * o MD5 hash and we're not expecting one.
1311 * o MD5 hash and it's wrong.
1312 */
1313 const __u8 *hash_location = NULL;
1314 struct tcp_md5sig_key *hash_expected;
1315 const struct iphdr *iph = ip_hdr(skb);
1316 const struct tcphdr *th = tcp_hdr(skb);
1317 int genhash;
1318 unsigned char newhash[16];
1319
1320 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1321 AF_INET);
1322 hash_location = tcp_parse_md5sig_option(th);
1323
1324 /* We've parsed the options - do we have a hash? */
1325 if (!hash_expected && !hash_location)
1326 return false;
1327
1328 if (hash_expected && !hash_location) {
1329 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1330 return true;
1331 }
1332
1333 if (!hash_expected && hash_location) {
1334 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1335 return true;
1336 }
1337
1338 /* Okay, so this is hash_expected and hash_location -
1339 * so we need to calculate the checksum.
1340 */
1341 genhash = tcp_v4_md5_hash_skb(newhash,
1342 hash_expected,
1343 NULL, skb);
1344
1345 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1346 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1347 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1348 &iph->saddr, ntohs(th->source),
1349 &iph->daddr, ntohs(th->dest),
1350 genhash ? " tcp_v4_calc_md5_hash failed"
1351 : "");
1352 return true;
1353 }
1354 return false;
1355 #endif
1356 return false;
1357 }
1358
1359 static void tcp_v4_init_req(struct request_sock *req,
1360 const struct sock *sk_listener,
1361 struct sk_buff *skb)
1362 {
1363 struct inet_request_sock *ireq = inet_rsk(req);
1364 struct net *net = sock_net(sk_listener);
1365
1366 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1367 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1368 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1369 }
1370
1371 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1372 struct flowi *fl,
1373 const struct request_sock *req)
1374 {
1375 return inet_csk_route_req(sk, &fl->u.ip4, req);
1376 }
1377
1378 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1379 .family = PF_INET,
1380 .obj_size = sizeof(struct tcp_request_sock),
1381 .rtx_syn_ack = tcp_rtx_synack,
1382 .send_ack = tcp_v4_reqsk_send_ack,
1383 .destructor = tcp_v4_reqsk_destructor,
1384 .send_reset = tcp_v4_send_reset,
1385 .syn_ack_timeout = tcp_syn_ack_timeout,
1386 };
1387
1388 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1389 .mss_clamp = TCP_MSS_DEFAULT,
1390 #ifdef CONFIG_TCP_MD5SIG
1391 .req_md5_lookup = tcp_v4_md5_lookup,
1392 .calc_md5_hash = tcp_v4_md5_hash_skb,
1393 #endif
1394 .init_req = tcp_v4_init_req,
1395 #ifdef CONFIG_SYN_COOKIES
1396 .cookie_init_seq = cookie_v4_init_sequence,
1397 #endif
1398 .route_req = tcp_v4_route_req,
1399 .init_seq = tcp_v4_init_seq,
1400 .init_ts_off = tcp_v4_init_ts_off,
1401 .send_synack = tcp_v4_send_synack,
1402 };
1403
1404 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1405 {
1406 /* Never answer SYNs sent to broadcast or multicast */
1407 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1408 goto drop;
1409
1410 return tcp_conn_request(&tcp_request_sock_ops,
1411 &tcp_request_sock_ipv4_ops, sk, skb);
1412
1413 drop:
1414 tcp_listendrop(sk);
1415 return 0;
1416 }
1417 EXPORT_SYMBOL(tcp_v4_conn_request);
1418
1419
1420 /*
1421 * The three way handshake has completed - we got a valid synack -
1422 * now create the new socket.
1423 */
1424 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1425 struct request_sock *req,
1426 struct dst_entry *dst,
1427 struct request_sock *req_unhash,
1428 bool *own_req)
1429 {
1430 struct inet_request_sock *ireq;
1431 bool found_dup_sk = false;
1432 struct inet_sock *newinet;
1433 struct tcp_sock *newtp;
1434 struct sock *newsk;
1435 #ifdef CONFIG_TCP_MD5SIG
1436 struct tcp_md5sig_key *key;
1437 #endif
1438 struct ip_options_rcu *inet_opt;
1439
1440 if (sk_acceptq_is_full(sk))
1441 goto exit_overflow;
1442
1443 newsk = tcp_create_openreq_child(sk, req, skb);
1444 if (!newsk)
1445 goto exit_nonewsk;
1446
1447 newsk->sk_gso_type = SKB_GSO_TCPV4;
1448 inet_sk_rx_dst_set(newsk, skb);
1449
1450 newtp = tcp_sk(newsk);
1451 newinet = inet_sk(newsk);
1452 ireq = inet_rsk(req);
1453 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1454 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1455 newsk->sk_bound_dev_if = ireq->ir_iif;
1456 newinet->inet_saddr = ireq->ir_loc_addr;
1457 inet_opt = rcu_dereference(ireq->ireq_opt);
1458 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1459 newinet->mc_index = inet_iif(skb);
1460 newinet->mc_ttl = ip_hdr(skb)->ttl;
1461 newinet->rcv_tos = ip_hdr(skb)->tos;
1462 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1463 if (inet_opt)
1464 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1465 newinet->inet_id = prandom_u32();
1466
1467 if (!dst) {
1468 dst = inet_csk_route_child_sock(sk, newsk, req);
1469 if (!dst)
1470 goto put_and_exit;
1471 } else {
1472 /* syncookie case : see end of cookie_v4_check() */
1473 }
1474 sk_setup_caps(newsk, dst);
1475
1476 tcp_ca_openreq_child(newsk, dst);
1477
1478 tcp_sync_mss(newsk, dst_mtu(dst));
1479 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1480
1481 tcp_initialize_rcv_mss(newsk);
1482
1483 #ifdef CONFIG_TCP_MD5SIG
1484 /* Copy over the MD5 key from the original socket */
1485 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1486 AF_INET);
1487 if (key) {
1488 /*
1489 * We're using one, so create a matching key
1490 * on the newsk structure. If we fail to get
1491 * memory, then we end up not copying the key
1492 * across. Shucks.
1493 */
1494 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1495 AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1496 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1497 }
1498 #endif
1499
1500 if (__inet_inherit_port(sk, newsk) < 0)
1501 goto put_and_exit;
1502 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1503 &found_dup_sk);
1504 if (likely(*own_req)) {
1505 tcp_move_syn(newtp, req);
1506 ireq->ireq_opt = NULL;
1507 } else {
1508 newinet->inet_opt = NULL;
1509
1510 if (!req_unhash && found_dup_sk) {
1511 /* This code path should only be executed in the
1512 * syncookie case
1513 */
1514 bh_unlock_sock(newsk);
1515 sock_put(newsk);
1516 newsk = NULL;
1517 }
1518 }
1519 return newsk;
1520
1521 exit_overflow:
1522 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1523 exit_nonewsk:
1524 dst_release(dst);
1525 exit:
1526 tcp_listendrop(sk);
1527 return NULL;
1528 put_and_exit:
1529 newinet->inet_opt = NULL;
1530 inet_csk_prepare_forced_close(newsk);
1531 tcp_done(newsk);
1532 goto exit;
1533 }
1534 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1535
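/* On a listener, a non-SYN segment may carry a SYN cookie in its ACK;
 * cookie_v4_check() validates it and, if valid, creates the child socket.
 */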
1536 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1537 {
1538 #ifdef CONFIG_SYN_COOKIES
1539 const struct tcphdr *th = tcp_hdr(skb);
1540
1541 if (!th->syn)
1542 sk = cookie_v4_check(sk, skb);
1543 #endif
1544 return sk;
1545 }
1546
1547 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1548 struct tcphdr *th, u32 *cookie)
1549 {
1550 u16 mss = 0;
1551 #ifdef CONFIG_SYN_COOKIES
1552 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1553 &tcp_request_sock_ipv4_ops, sk, th);
1554 if (mss) {
1555 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1556 tcp_synq_overflow(sk);
1557 }
1558 #endif
1559 return mss;
1560 }
1561
1562 /* The socket must have its spinlock held when we get
1563 * here, unless it is a TCP_LISTEN socket.
1564 *
1565 * We have a potential double-lock case here, so even when
1566 * doing backlog processing we use the BH locking scheme.
1567 * This is because we cannot sleep with the original spinlock
1568 * held.
1569 */
1570 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1571 {
1572 struct sock *rsk;
1573
1574 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1575 struct dst_entry *dst;
1576
1577 dst = rcu_dereference_protected(sk->sk_rx_dst,
1578 lockdep_sock_is_held(sk));
1579
1580 sock_rps_save_rxhash(sk, skb);
1581 sk_mark_napi_id(sk, skb);
1582 if (dst) {
1583 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1584 !dst->ops->check(dst, 0)) {
1585 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1586 dst_release(dst);
1587 }
1588 }
1589 tcp_rcv_established(sk, skb);
1590 return 0;
1591 }
1592
1593 if (tcp_checksum_complete(skb))
1594 goto csum_err;
1595
1596 if (sk->sk_state == TCP_LISTEN) {
1597 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1598
1599 if (!nsk)
1600 goto discard;
1601 if (nsk != sk) {
1602 if (tcp_child_process(sk, nsk, skb)) {
1603 rsk = nsk;
1604 goto reset;
1605 }
1606 return 0;
1607 }
1608 } else
1609 sock_rps_save_rxhash(sk, skb);
1610
1611 if (tcp_rcv_state_process(sk, skb)) {
1612 rsk = sk;
1613 goto reset;
1614 }
1615 return 0;
1616
1617 reset:
1618 tcp_v4_send_reset(rsk, skb);
1619 discard:
1620 kfree_skb(skb);
1621 /* Be careful here. If this function gets more complicated and
1622 * gcc suffers from register pressure on the x86, sk (in %ebx)
1623 * might be destroyed here. This current version compiles correctly,
1624 * but you have been warned.
1625 */
1626 return 0;
1627
1628 csum_err:
1629 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1630 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1631 goto discard;
1632 }
1633 EXPORT_SYMBOL(tcp_v4_do_rcv);
1634
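/* Early demux: look up an established socket while still in IP receive
 * processing and attach its cached input route to the skb so the normal
 * routing lookup can be skipped.
 */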
1635 int tcp_v4_early_demux(struct sk_buff *skb)
1636 {
1637 const struct iphdr *iph;
1638 const struct tcphdr *th;
1639 struct sock *sk;
1640
1641 if (skb->pkt_type != PACKET_HOST)
1642 return 0;
1643
1644 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1645 return 0;
1646
1647 iph = ip_hdr(skb);
1648 th = tcp_hdr(skb);
1649
1650 if (th->doff < sizeof(struct tcphdr) / 4)
1651 return 0;
1652
1653 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1654 iph->saddr, th->source,
1655 iph->daddr, ntohs(th->dest),
1656 skb->skb_iif, inet_sdif(skb));
1657 if (sk) {
1658 skb->sk = sk;
1659 skb->destructor = sock_edemux;
1660 if (sk_fullsock(sk)) {
1661 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1662
1663 if (dst)
1664 dst = dst_check(dst, 0);
1665 if (dst &&
1666 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1667 skb_dst_set_noref(skb, dst);
1668 }
1669 }
1670 return 0;
1671 }
1672
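/* Queue a segment on the backlog of a user-owned socket, coalescing it with
 * the backlog tail when possible to bound memory usage. Returns true if the
 * segment was rejected and should be dropped by the caller.
 */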
1673 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1674 {
1675 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1676 u32 tail_gso_size, tail_gso_segs;
1677 struct skb_shared_info *shinfo;
1678 const struct tcphdr *th;
1679 struct tcphdr *thtail;
1680 struct sk_buff *tail;
1681 unsigned int hdrlen;
1682 bool fragstolen;
1683 u32 gso_segs;
1684 u32 gso_size;
1685 int delta;
1686
1687 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1688 * we can fix skb->truesize to its real value to avoid future drops.
1689 * This is valid because skb is not yet charged to the socket.
1690 * It has been noticed that pure SACK packets were sometimes dropped
1691 * (if cooked by drivers without copybreak feature).
1692 */
1693 skb_condense(skb);
1694
1695 skb_dst_drop(skb);
1696
1697 if (unlikely(tcp_checksum_complete(skb))) {
1698 bh_unlock_sock(sk);
1699 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1700 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1701 return true;
1702 }
1703
1704 /* Attempt coalescing to last skb in backlog, even if we are
1705 * above the limits.
1706 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1707 */
1708 th = (const struct tcphdr *)skb->data;
1709 hdrlen = th->doff * 4;
1710
1711 tail = sk->sk_backlog.tail;
1712 if (!tail)
1713 goto no_coalesce;
1714 thtail = (struct tcphdr *)tail->data;
1715
1716 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1717 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1718 ((TCP_SKB_CB(tail)->tcp_flags |
1719 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1720 !((TCP_SKB_CB(tail)->tcp_flags &
1721 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1722 ((TCP_SKB_CB(tail)->tcp_flags ^
1723 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1724 #ifdef CONFIG_TLS_DEVICE
1725 tail->decrypted != skb->decrypted ||
1726 #endif
1727 thtail->doff != th->doff ||
1728 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1729 goto no_coalesce;
1730
1731 __skb_pull(skb, hdrlen);
1732
1733 shinfo = skb_shinfo(skb);
1734 gso_size = shinfo->gso_size ?: skb->len;
1735 gso_segs = shinfo->gso_segs ?: 1;
1736
1737 shinfo = skb_shinfo(tail);
1738 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1739 tail_gso_segs = shinfo->gso_segs ?: 1;
1740
1741 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1742 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1743
1744 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1745 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1746 thtail->window = th->window;
1747 }
1748
1749 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1750 * thtail->fin, so that the fast path in tcp_rcv_established()
1751 * is not entered if we append a packet with a FIN.
1752 * SYN, RST, URG are not present.
1753 * ACK is set on both packets.
1754 * PSH : we do not really care in TCP stack,
1755 * at least for 'GRO' packets.
1756 */
1757 thtail->fin |= th->fin;
1758 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1759
1760 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1761 TCP_SKB_CB(tail)->has_rxtstamp = true;
1762 tail->tstamp = skb->tstamp;
1763 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1764 }
1765
1766 /* Not as strict as GRO. We only need to carry mss max value */
1767 shinfo->gso_size = max(gso_size, tail_gso_size);
1768 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1769
1770 sk->sk_backlog.len += delta;
1771 __NET_INC_STATS(sock_net(sk),
1772 LINUX_MIB_TCPBACKLOGCOALESCE);
1773 kfree_skb_partial(skb, fragstolen);
1774 return false;
1775 }
1776 __skb_push(skb, hdrlen);
1777
1778 no_coalesce:
1779 /* Only socket owner can try to collapse/prune rx queues
1780 * to reduce memory overhead, so add a little headroom here.
1781 * Only a few socket backlogs are likely to be non-empty concurrently.
1782 */
1783 limit += 64*1024;
1784
1785 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1786 bh_unlock_sock(sk);
1787 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1788 return true;
1789 }
1790 return false;
1791 }
1792 EXPORT_SYMBOL(tcp_add_backlog);
1793
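/* Run the socket's attached filter on the segment, never trimming it below
 * the TCP header length.
 */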
1794 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1795 {
1796 struct tcphdr *th = (struct tcphdr *)skb->data;
1797
1798 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1799 }
1800 EXPORT_SYMBOL(tcp_filter);
1801
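/* Undo tcp_v4_fill_cb(): move the saved IP control block back into place
 * before the skb is handed back for another socket lookup.
 */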
1802 static void tcp_v4_restore_cb(struct sk_buff *skb)
1803 {
1804 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1805 sizeof(struct inet_skb_parm));
1806 }
1807
1808 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1809 const struct tcphdr *th)
1810 {
1811 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1812 	 * barrier() makes sure the compiler won't play fool^W aliasing games.
1813 */
1814 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1815 sizeof(struct inet_skb_parm));
1816 barrier();
1817
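	/* end_seq covers the payload plus one sequence number each for SYN
	 * and FIN, which consume sequence space per RFC 793.
	 */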
1818 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1819 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1820 skb->len - th->doff * 4);
1821 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1822 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1823 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1824 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1825 TCP_SKB_CB(skb)->sacked = 0;
1826 TCP_SKB_CB(skb)->has_rxtstamp =
1827 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1828 }
1829
1830 /*
1831 * From tcp_input.c
1832 */
1833
1834 int tcp_v4_rcv(struct sk_buff *skb)
1835 {
1836 struct net *net = dev_net(skb->dev);
1837 struct sk_buff *skb_to_free;
1838 int sdif = inet_sdif(skb);
1839 const struct iphdr *iph;
1840 const struct tcphdr *th;
1841 bool refcounted;
1842 struct sock *sk;
1843 int ret;
1844
1845 if (skb->pkt_type != PACKET_HOST)
1846 goto discard_it;
1847
1848 /* Count it even if it's bad */
1849 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1850
1851 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1852 goto discard_it;
1853
1854 th = (const struct tcphdr *)skb->data;
1855
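	/* A data offset below 5 32-bit words cannot cover the minimal
	 * 20-byte TCP header; count such packets as in-errors and drop
	 * them without sending a reset.
	 */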
1856 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1857 goto bad_packet;
1858 if (!pskb_may_pull(skb, th->doff * 4))
1859 goto discard_it;
1860
1861 /* An explanation is required here, I think.
1862 * Packet length and doff are validated by header prediction,
1863 	 * provided the case of th->doff == 0 is eliminated.
1864 * So, we defer the checks. */
1865
1866 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1867 goto csum_error;
1868
1869 th = (const struct tcphdr *)skb->data;
1870 iph = ip_hdr(skb);
1871 lookup:
1872 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1873 th->dest, sdif, &refcounted);
1874 if (!sk)
1875 goto no_tcp_socket;
1876
1877 process:
1878 if (sk->sk_state == TCP_TIME_WAIT)
1879 goto do_time_wait;
1880
1881 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1882 struct request_sock *req = inet_reqsk(sk);
1883 bool req_stolen = false;
1884 struct sock *nsk;
1885
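		/* Run the MD5 and checksum sanity checks against the listener
		 * before tcp_check_req() may promote this request into a
		 * full socket.
		 */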
1886 sk = req->rsk_listener;
1887 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1888 sk_drops_add(sk, skb);
1889 reqsk_put(req);
1890 goto discard_it;
1891 }
1892 if (tcp_checksum_complete(skb)) {
1893 reqsk_put(req);
1894 goto csum_error;
1895 }
1896 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1897 inet_csk_reqsk_queue_drop_and_put(sk, req);
1898 goto lookup;
1899 }
1900 /* We own a reference on the listener, increase it again
1901 * as we might lose it too soon.
1902 */
1903 sock_hold(sk);
1904 refcounted = true;
1905 nsk = NULL;
1906 if (!tcp_filter(sk, skb)) {
1907 th = (const struct tcphdr *)skb->data;
1908 iph = ip_hdr(skb);
1909 tcp_v4_fill_cb(skb, iph, th);
1910 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1911 }
1912 if (!nsk) {
1913 reqsk_put(req);
1914 if (req_stolen) {
1915 /* Another cpu got exclusive access to req
1916 * and created a full blown socket.
1917 * Try to feed this packet to this socket
1918 * instead of discarding it.
1919 */
1920 tcp_v4_restore_cb(skb);
1921 sock_put(sk);
1922 goto lookup;
1923 }
1924 goto discard_and_relse;
1925 }
1926 if (nsk == sk) {
1927 reqsk_put(req);
1928 tcp_v4_restore_cb(skb);
1929 } else if (tcp_child_process(sk, nsk, skb)) {
1930 tcp_v4_send_reset(nsk, skb);
1931 goto discard_and_relse;
1932 } else {
1933 sock_put(sk);
1934 return 0;
1935 }
1936 }
1937 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1938 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1939 goto discard_and_relse;
1940 }
1941
1942 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1943 goto discard_and_relse;
1944
1945 if (tcp_v4_inbound_md5_hash(sk, skb))
1946 goto discard_and_relse;
1947
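	/* Release any netfilter conntrack reference before the segment is
	 * handed to the TCP layer.
	 */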
1948 nf_reset_ct(skb);
1949
1950 if (tcp_filter(sk, skb))
1951 goto discard_and_relse;
1952 th = (const struct tcphdr *)skb->data;
1953 iph = ip_hdr(skb);
1954 tcp_v4_fill_cb(skb, iph, th);
1955
1956 skb->dev = NULL;
1957
1958 if (sk->sk_state == TCP_LISTEN) {
1959 ret = tcp_v4_do_rcv(sk, skb);
1960 goto put_and_return;
1961 }
1962
1963 sk_incoming_cpu_update(sk);
1964
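	/* Process the segment directly when the socket is not owned by a
	 * user context; otherwise queue it to the backlog so the owner
	 * handles it at release_sock() time.
	 */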
1965 bh_lock_sock_nested(sk);
1966 tcp_segs_in(tcp_sk(sk), skb);
1967 ret = 0;
1968 if (!sock_owned_by_user(sk)) {
1969 skb_to_free = sk->sk_rx_skb_cache;
1970 sk->sk_rx_skb_cache = NULL;
1971 ret = tcp_v4_do_rcv(sk, skb);
1972 } else {
1973 if (tcp_add_backlog(sk, skb))
1974 goto discard_and_relse;
1975 skb_to_free = NULL;
1976 }
1977 bh_unlock_sock(sk);
1978 if (skb_to_free)
1979 __kfree_skb(skb_to_free);
1980
1981 put_and_return:
1982 if (refcounted)
1983 sock_put(sk);
1984
1985 return ret;
1986
1987 no_tcp_socket:
1988 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1989 goto discard_it;
1990
1991 tcp_v4_fill_cb(skb, iph, th);
1992
1993 if (tcp_checksum_complete(skb)) {
1994 csum_error:
1995 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1996 bad_packet:
1997 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1998 } else {
1999 tcp_v4_send_reset(NULL, skb);
2000 }
2001
2002 discard_it:
2003 /* Discard frame. */
2004 kfree_skb(skb);
2005 return 0;
2006
2007 discard_and_relse:
2008 sk_drops_add(sk, skb);
2009 if (refcounted)
2010 sock_put(sk);
2011 goto discard_it;
2012
2013 do_time_wait:
2014 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2015 inet_twsk_put(inet_twsk(sk));
2016 goto discard_it;
2017 }
2018
2019 tcp_v4_fill_cb(skb, iph, th);
2020
2021 if (tcp_checksum_complete(skb)) {
2022 inet_twsk_put(inet_twsk(sk));
2023 goto csum_error;
2024 }
2025 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2026 case TCP_TW_SYN: {
2027 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2028 &tcp_hashinfo, skb,
2029 __tcp_hdrlen(th),
2030 iph->saddr, th->source,
2031 iph->daddr, th->dest,
2032 inet_iif(skb),
2033 sdif);
2034 if (sk2) {
2035 inet_twsk_deschedule_put(inet_twsk(sk));
2036 sk = sk2;
2037 tcp_v4_restore_cb(skb);
2038 refcounted = false;
2039 goto process;
2040 }
2041 }
2042 /* to ACK */
2043 /* fall through */
2044 case TCP_TW_ACK:
2045 tcp_v4_timewait_ack(sk, skb);
2046 break;
2047 case TCP_TW_RST:
2048 tcp_v4_send_reset(sk, skb);
2049 inet_twsk_deschedule_put(inet_twsk(sk));
2050 goto discard_it;
2051 case TCP_TW_SUCCESS:;
2052 }
2053 goto discard_it;
2054 }
2055
2056 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2057 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2058 .twsk_unique = tcp_twsk_unique,
2059 .twsk_destructor= tcp_twsk_destructor,
2060 };
2061
2062 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2063 {
2064 struct dst_entry *dst = skb_dst(skb);
2065
2066 if (dst && dst_hold_safe(dst)) {
2067 rcu_assign_pointer(sk->sk_rx_dst, dst);
2068 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2069 }
2070 }
2071 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2072
2073 const struct inet_connection_sock_af_ops ipv4_specific = {
2074 .queue_xmit = ip_queue_xmit,
2075 .send_check = tcp_v4_send_check,
2076 .rebuild_header = inet_sk_rebuild_header,
2077 .sk_rx_dst_set = inet_sk_rx_dst_set,
2078 .conn_request = tcp_v4_conn_request,
2079 .syn_recv_sock = tcp_v4_syn_recv_sock,
2080 .net_header_len = sizeof(struct iphdr),
2081 .setsockopt = ip_setsockopt,
2082 .getsockopt = ip_getsockopt,
2083 .addr2sockaddr = inet_csk_addr2sockaddr,
2084 .sockaddr_len = sizeof(struct sockaddr_in),
2085 #ifdef CONFIG_COMPAT
2086 .compat_setsockopt = compat_ip_setsockopt,
2087 .compat_getsockopt = compat_ip_getsockopt,
2088 #endif
2089 .mtu_reduced = tcp_v4_mtu_reduced,
2090 };
2091 EXPORT_SYMBOL(ipv4_specific);
2092
2093 #ifdef CONFIG_TCP_MD5SIG
2094 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2095 .md5_lookup = tcp_v4_md5_lookup,
2096 .calc_md5_hash = tcp_v4_md5_hash_skb,
2097 .md5_parse = tcp_v4_parse_md5_keys,
2098 };
2099 #endif
2100
2101 /* NOTE: A lot of things are set to zero explicitly by the call to
2102  * sk_alloc(), so they need not be done here.
2103 */
2104 static int tcp_v4_init_sock(struct sock *sk)
2105 {
2106 struct inet_connection_sock *icsk = inet_csk(sk);
2107
2108 tcp_init_sock(sk);
2109
2110 icsk->icsk_af_ops = &ipv4_specific;
2111
2112 #ifdef CONFIG_TCP_MD5SIG
2113 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2114 #endif
2115
2116 return 0;
2117 }
2118
2119 void tcp_v4_destroy_sock(struct sock *sk)
2120 {
2121 struct tcp_sock *tp = tcp_sk(sk);
2122
2123 trace_tcp_destroy_sock(sk);
2124
2125 tcp_clear_xmit_timers(sk);
2126
2127 tcp_cleanup_congestion_control(sk);
2128
2129 tcp_cleanup_ulp(sk);
2130
2131 	/* Clean up the write buffer. */
2132 tcp_write_queue_purge(sk);
2133
2134 /* Check if we want to disable active TFO */
2135 tcp_fastopen_active_disable_ofo_check(sk);
2136
2137 /* Cleans up our, hopefully empty, out_of_order_queue. */
2138 skb_rbtree_purge(&tp->out_of_order_queue);
2139
2140 #ifdef CONFIG_TCP_MD5SIG
2141 /* Clean up the MD5 key list, if any */
2142 if (tp->md5sig_info) {
2143 tcp_clear_md5_list(sk);
2144 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2145 tp->md5sig_info = NULL;
2146 }
2147 #endif
2148
2149 /* Clean up a referenced TCP bind bucket. */
2150 if (inet_csk(sk)->icsk_bind_hash)
2151 inet_put_port(sk);
2152
2153 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2154
2155 	/* If the socket was aborted during a connect operation */
2156 tcp_free_fastopen_req(tp);
2157 tcp_fastopen_destroy_cipher(sk);
2158 tcp_saved_syn_free(tp);
2159
2160 sk_sockets_allocated_dec(sk);
2161 }
2162 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2163
2164 #ifdef CONFIG_PROC_FS
2165 /* Proc filesystem TCP sock list dumping. */
2166
2167 /*
2168  * Get the next listener socket following cur. If cur is NULL, get the first socket
2169  * starting from the bucket given in st->bucket; when st->bucket is zero, the
2170 * very first socket in the hash table is returned.
2171 */
2172 static void *listening_get_next(struct seq_file *seq, void *cur)
2173 {
2174 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2175 struct tcp_iter_state *st = seq->private;
2176 struct net *net = seq_file_net(seq);
2177 struct inet_listen_hashbucket *ilb;
2178 struct hlist_nulls_node *node;
2179 struct sock *sk = cur;
2180
2181 if (!sk) {
2182 get_head:
2183 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2184 spin_lock(&ilb->lock);
2185 sk = sk_nulls_head(&ilb->nulls_head);
2186 st->offset = 0;
2187 goto get_sk;
2188 }
2189 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2190 ++st->num;
2191 ++st->offset;
2192
2193 sk = sk_nulls_next(sk);
2194 get_sk:
2195 sk_nulls_for_each_from(sk, node) {
2196 if (!net_eq(sock_net(sk), net))
2197 continue;
2198 if (sk->sk_family == afinfo->family)
2199 return sk;
2200 }
2201 spin_unlock(&ilb->lock);
2202 st->offset = 0;
2203 if (++st->bucket < INET_LHTABLE_SIZE)
2204 goto get_head;
2205 return NULL;
2206 }
2207
2208 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2209 {
2210 struct tcp_iter_state *st = seq->private;
2211 void *rc;
2212
2213 st->bucket = 0;
2214 st->offset = 0;
2215 rc = listening_get_next(seq, NULL);
2216
2217 while (rc && *pos) {
2218 rc = listening_get_next(seq, rc);
2219 --*pos;
2220 }
2221 return rc;
2222 }
2223
2224 static inline bool empty_bucket(const struct tcp_iter_state *st)
2225 {
2226 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2227 }
2228
2229 /*
2230  * Get the first established socket, starting from the bucket given in st->bucket.
2231 * If st->bucket is zero, the very first socket in the hash is returned.
2232 */
2233 static void *established_get_first(struct seq_file *seq)
2234 {
2235 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2236 struct tcp_iter_state *st = seq->private;
2237 struct net *net = seq_file_net(seq);
2238 void *rc = NULL;
2239
2240 st->offset = 0;
2241 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2242 struct sock *sk;
2243 struct hlist_nulls_node *node;
2244 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2245
2246 /* Lockless fast path for the common case of empty buckets */
2247 if (empty_bucket(st))
2248 continue;
2249
2250 spin_lock_bh(lock);
2251 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2252 if (sk->sk_family != afinfo->family ||
2253 !net_eq(sock_net(sk), net)) {
2254 continue;
2255 }
2256 rc = sk;
2257 goto out;
2258 }
2259 spin_unlock_bh(lock);
2260 }
2261 out:
2262 return rc;
2263 }
2264
2265 static void *established_get_next(struct seq_file *seq, void *cur)
2266 {
2267 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2268 struct sock *sk = cur;
2269 struct hlist_nulls_node *node;
2270 struct tcp_iter_state *st = seq->private;
2271 struct net *net = seq_file_net(seq);
2272
2273 ++st->num;
2274 ++st->offset;
2275
2276 sk = sk_nulls_next(sk);
2277
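	/* Continue along the current chain; once it is exhausted, drop this
	 * bucket's lock and restart from the next bucket via
	 * established_get_first().
	 */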
2278 sk_nulls_for_each_from(sk, node) {
2279 if (sk->sk_family == afinfo->family &&
2280 net_eq(sock_net(sk), net))
2281 return sk;
2282 }
2283
2284 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2285 ++st->bucket;
2286 return established_get_first(seq);
2287 }
2288
2289 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2290 {
2291 struct tcp_iter_state *st = seq->private;
2292 void *rc;
2293
2294 st->bucket = 0;
2295 rc = established_get_first(seq);
2296
2297 while (rc && pos) {
2298 rc = established_get_next(seq, rc);
2299 --pos;
2300 }
2301 return rc;
2302 }
2303
2304 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2305 {
2306 void *rc;
2307 struct tcp_iter_state *st = seq->private;
2308
2309 st->state = TCP_SEQ_STATE_LISTENING;
2310 rc = listening_get_idx(seq, &pos);
2311
2312 if (!rc) {
2313 st->state = TCP_SEQ_STATE_ESTABLISHED;
2314 rc = established_get_idx(seq, pos);
2315 }
2316
2317 return rc;
2318 }
2319
2320 static void *tcp_seek_last_pos(struct seq_file *seq)
2321 {
2322 struct tcp_iter_state *st = seq->private;
2323 int bucket = st->bucket;
2324 int offset = st->offset;
2325 int orig_num = st->num;
2326 void *rc = NULL;
2327
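	/* Re-walk the previously visited bucket up to the saved offset so a
	 * seq_file restart does not rescan the whole hash table; st->num is
	 * restored once the position is found.
	 */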
2328 switch (st->state) {
2329 case TCP_SEQ_STATE_LISTENING:
2330 if (st->bucket >= INET_LHTABLE_SIZE)
2331 break;
2332 st->state = TCP_SEQ_STATE_LISTENING;
2333 rc = listening_get_next(seq, NULL);
2334 while (offset-- && rc && bucket == st->bucket)
2335 rc = listening_get_next(seq, rc);
2336 if (rc)
2337 break;
2338 st->bucket = 0;
2339 st->state = TCP_SEQ_STATE_ESTABLISHED;
2340 /* Fallthrough */
2341 case TCP_SEQ_STATE_ESTABLISHED:
2342 if (st->bucket > tcp_hashinfo.ehash_mask)
2343 break;
2344 rc = established_get_first(seq);
2345 while (offset-- && rc && bucket == st->bucket)
2346 rc = established_get_next(seq, rc);
2347 }
2348
2349 st->num = orig_num;
2350
2351 return rc;
2352 }
2353
2354 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2355 {
2356 struct tcp_iter_state *st = seq->private;
2357 void *rc;
2358
2359 if (*pos && *pos == st->last_pos) {
2360 rc = tcp_seek_last_pos(seq);
2361 if (rc)
2362 goto out;
2363 }
2364
2365 st->state = TCP_SEQ_STATE_LISTENING;
2366 st->num = 0;
2367 st->bucket = 0;
2368 st->offset = 0;
2369 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2370
2371 out:
2372 st->last_pos = *pos;
2373 return rc;
2374 }
2375 EXPORT_SYMBOL(tcp_seq_start);
2376
2377 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2378 {
2379 struct tcp_iter_state *st = seq->private;
2380 void *rc = NULL;
2381
2382 if (v == SEQ_START_TOKEN) {
2383 rc = tcp_get_idx(seq, 0);
2384 goto out;
2385 }
2386
2387 switch (st->state) {
2388 case TCP_SEQ_STATE_LISTENING:
2389 rc = listening_get_next(seq, v);
2390 if (!rc) {
2391 st->state = TCP_SEQ_STATE_ESTABLISHED;
2392 st->bucket = 0;
2393 st->offset = 0;
2394 rc = established_get_first(seq);
2395 }
2396 break;
2397 case TCP_SEQ_STATE_ESTABLISHED:
2398 rc = established_get_next(seq, v);
2399 break;
2400 }
2401 out:
2402 ++*pos;
2403 st->last_pos = *pos;
2404 return rc;
2405 }
2406 EXPORT_SYMBOL(tcp_seq_next);
2407
2408 void tcp_seq_stop(struct seq_file *seq, void *v)
2409 {
2410 struct tcp_iter_state *st = seq->private;
2411
2412 switch (st->state) {
2413 case TCP_SEQ_STATE_LISTENING:
2414 if (v != SEQ_START_TOKEN)
2415 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2416 break;
2417 case TCP_SEQ_STATE_ESTABLISHED:
2418 if (v)
2419 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2420 break;
2421 }
2422 }
2423 EXPORT_SYMBOL(tcp_seq_stop);
2424
2425 static void get_openreq4(const struct request_sock *req,
2426 struct seq_file *f, int i)
2427 {
2428 const struct inet_request_sock *ireq = inet_rsk(req);
2429 long delta = req->rsk_timer.expires - jiffies;
2430
2431 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2432 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2433 i,
2434 ireq->ir_loc_addr,
2435 ireq->ir_num,
2436 ireq->ir_rmt_addr,
2437 ntohs(ireq->ir_rmt_port),
2438 TCP_SYN_RECV,
2439 0, 0, /* could print option size, but that is af dependent. */
2440 1, /* timers active (only the expire timer) */
2441 jiffies_delta_to_clock_t(delta),
2442 req->num_timeout,
2443 from_kuid_munged(seq_user_ns(f),
2444 sock_i_uid(req->rsk_listener)),
2445 0, /* non standard timer */
2446 0, /* open_requests have no inode */
2447 0,
2448 req);
2449 }
2450
2451 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2452 {
2453 int timer_active;
2454 unsigned long timer_expires;
2455 const struct tcp_sock *tp = tcp_sk(sk);
2456 const struct inet_connection_sock *icsk = inet_csk(sk);
2457 const struct inet_sock *inet = inet_sk(sk);
2458 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2459 __be32 dest = inet->inet_daddr;
2460 __be32 src = inet->inet_rcv_saddr;
2461 __u16 destp = ntohs(inet->inet_dport);
2462 __u16 srcp = ntohs(inet->inet_sport);
2463 int rx_queue;
2464 int state;
2465
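	/* timer_active encodes the pending timer for this row: 1 for
	 * retransmit/loss probe, 4 for zero-window probe, 2 for keepalive,
	 * 0 for none (timewait rows use 3 in get_timewait4_sock()).
	 */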
2466 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2467 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2468 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2469 timer_active = 1;
2470 timer_expires = icsk->icsk_timeout;
2471 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2472 timer_active = 4;
2473 timer_expires = icsk->icsk_timeout;
2474 } else if (timer_pending(&sk->sk_timer)) {
2475 timer_active = 2;
2476 timer_expires = sk->sk_timer.expires;
2477 } else {
2478 timer_active = 0;
2479 timer_expires = jiffies;
2480 }
2481
2482 state = inet_sk_state_load(sk);
2483 if (state == TCP_LISTEN)
2484 rx_queue = sk->sk_ack_backlog;
2485 else
2486 /* Because we don't lock the socket,
2487 * we might find a transient negative value.
2488 */
2489 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2490 READ_ONCE(tp->copied_seq), 0);
2491
2492 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2493 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2494 i, src, srcp, dest, destp, state,
2495 READ_ONCE(tp->write_seq) - tp->snd_una,
2496 rx_queue,
2497 timer_active,
2498 jiffies_delta_to_clock_t(timer_expires - jiffies),
2499 icsk->icsk_retransmits,
2500 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2501 icsk->icsk_probes_out,
2502 sock_i_ino(sk),
2503 refcount_read(&sk->sk_refcnt), sk,
2504 jiffies_to_clock_t(icsk->icsk_rto),
2505 jiffies_to_clock_t(icsk->icsk_ack.ato),
2506 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2507 tp->snd_cwnd,
2508 state == TCP_LISTEN ?
2509 fastopenq->max_qlen :
2510 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2511 }
2512
2513 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2514 struct seq_file *f, int i)
2515 {
2516 long delta = tw->tw_timer.expires - jiffies;
2517 __be32 dest, src;
2518 __u16 destp, srcp;
2519
2520 dest = tw->tw_daddr;
2521 src = tw->tw_rcv_saddr;
2522 destp = ntohs(tw->tw_dport);
2523 srcp = ntohs(tw->tw_sport);
2524
2525 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2526 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2527 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2528 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2529 refcount_read(&tw->tw_refcnt), tw);
2530 }
2531
2532 #define TMPSZ 150
2533
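/* Each record emitted below is space-padded to TMPSZ - 1 characters via
 * seq_setwidth()/seq_pad(), so /proc/net/tcp presents fixed-width lines
 * to userspace.
 */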
2534 static int tcp4_seq_show(struct seq_file *seq, void *v)
2535 {
2536 struct tcp_iter_state *st;
2537 struct sock *sk = v;
2538
2539 seq_setwidth(seq, TMPSZ - 1);
2540 if (v == SEQ_START_TOKEN) {
2541 seq_puts(seq, " sl local_address rem_address st tx_queue "
2542 "rx_queue tr tm->when retrnsmt uid timeout "
2543 "inode");
2544 goto out;
2545 }
2546 st = seq->private;
2547
2548 if (sk->sk_state == TCP_TIME_WAIT)
2549 get_timewait4_sock(v, seq, st->num);
2550 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2551 get_openreq4(v, seq, st->num);
2552 else
2553 get_tcp4_sock(v, seq, st->num);
2554 out:
2555 seq_pad(seq, '\n');
2556 return 0;
2557 }
2558
2559 static const struct seq_operations tcp4_seq_ops = {
2560 .show = tcp4_seq_show,
2561 .start = tcp_seq_start,
2562 .next = tcp_seq_next,
2563 .stop = tcp_seq_stop,
2564 };
2565
2566 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2567 .family = AF_INET,
2568 };
2569
2570 static int __net_init tcp4_proc_init_net(struct net *net)
2571 {
2572 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2573 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2574 return -ENOMEM;
2575 return 0;
2576 }
2577
2578 static void __net_exit tcp4_proc_exit_net(struct net *net)
2579 {
2580 remove_proc_entry("tcp", net->proc_net);
2581 }
2582
2583 static struct pernet_operations tcp4_net_ops = {
2584 .init = tcp4_proc_init_net,
2585 .exit = tcp4_proc_exit_net,
2586 };
2587
2588 int __init tcp4_proc_init(void)
2589 {
2590 return register_pernet_subsys(&tcp4_net_ops);
2591 }
2592
2593 void tcp4_proc_exit(void)
2594 {
2595 unregister_pernet_subsys(&tcp4_net_ops);
2596 }
2597 #endif /* CONFIG_PROC_FS */
2598
2599 struct proto tcp_prot = {
2600 .name = "TCP",
2601 .owner = THIS_MODULE,
2602 .close = tcp_close,
2603 .pre_connect = tcp_v4_pre_connect,
2604 .connect = tcp_v4_connect,
2605 .disconnect = tcp_disconnect,
2606 .accept = inet_csk_accept,
2607 .ioctl = tcp_ioctl,
2608 .init = tcp_v4_init_sock,
2609 .destroy = tcp_v4_destroy_sock,
2610 .shutdown = tcp_shutdown,
2611 .setsockopt = tcp_setsockopt,
2612 .getsockopt = tcp_getsockopt,
2613 .keepalive = tcp_set_keepalive,
2614 .recvmsg = tcp_recvmsg,
2615 .sendmsg = tcp_sendmsg,
2616 .sendpage = tcp_sendpage,
2617 .backlog_rcv = tcp_v4_do_rcv,
2618 .release_cb = tcp_release_cb,
2619 .hash = inet_hash,
2620 .unhash = inet_unhash,
2621 .get_port = inet_csk_get_port,
2622 .enter_memory_pressure = tcp_enter_memory_pressure,
2623 .leave_memory_pressure = tcp_leave_memory_pressure,
2624 .stream_memory_free = tcp_stream_memory_free,
2625 .sockets_allocated = &tcp_sockets_allocated,
2626 .orphan_count = &tcp_orphan_count,
2627 .memory_allocated = &tcp_memory_allocated,
2628 .memory_pressure = &tcp_memory_pressure,
2629 .sysctl_mem = sysctl_tcp_mem,
2630 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2631 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2632 .max_header = MAX_TCP_HEADER,
2633 .obj_size = sizeof(struct tcp_sock),
2634 .slab_flags = SLAB_TYPESAFE_BY_RCU,
2635 .twsk_prot = &tcp_timewait_sock_ops,
2636 .rsk_prot = &tcp_request_sock_ops,
2637 .h.hashinfo = &tcp_hashinfo,
2638 .no_autobind = true,
2639 #ifdef CONFIG_COMPAT
2640 .compat_setsockopt = compat_tcp_setsockopt,
2641 .compat_getsockopt = compat_tcp_getsockopt,
2642 #endif
2643 .diag_destroy = tcp_abort,
2644 };
2645 EXPORT_SYMBOL(tcp_prot);
2646
2647 static void __net_exit tcp_sk_exit(struct net *net)
2648 {
2649 int cpu;
2650
2651 if (net->ipv4.tcp_congestion_control)
2652 module_put(net->ipv4.tcp_congestion_control->owner);
2653
2654 for_each_possible_cpu(cpu)
2655 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2656 free_percpu(net->ipv4.tcp_sk);
2657 }
2658
2659 static int __net_init tcp_sk_init(struct net *net)
2660 {
2661 int res, cpu, cnt;
2662
2663 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2664 if (!net->ipv4.tcp_sk)
2665 return -ENOMEM;
2666
2667 for_each_possible_cpu(cpu) {
2668 struct sock *sk;
2669
2670 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2671 IPPROTO_TCP, net);
2672 if (res)
2673 goto fail;
2674 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2675
2676 /* Please enforce IP_DF and IPID==0 for RST and
2677 * ACK sent in SYN-RECV and TIME-WAIT state.
2678 */
2679 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2680
2681 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2682 }
2683
2684 net->ipv4.sysctl_tcp_ecn = 2;
2685 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2686
2687 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2688 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2689 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2690 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2691 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2692
2693 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2694 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2695 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2696
2697 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2698 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2699 net->ipv4.sysctl_tcp_syncookies = 1;
2700 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2701 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2702 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2703 net->ipv4.sysctl_tcp_orphan_retries = 0;
2704 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2705 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
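	/* tcp_tw_reuse == 2 restricts TIME-WAIT reuse to loopback
	 * connections unless the administrator changes it.
	 */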
2706 net->ipv4.sysctl_tcp_tw_reuse = 2;
2707
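	/* Scale the TIME-WAIT bucket and SYN backlog limits with the size
	 * of the established hash table computed at boot.
	 */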
2708 cnt = tcp_hashinfo.ehash_mask + 1;
2709 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2710 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2711
2712 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2713 net->ipv4.sysctl_tcp_sack = 1;
2714 net->ipv4.sysctl_tcp_window_scaling = 1;
2715 net->ipv4.sysctl_tcp_timestamps = 1;
2716 net->ipv4.sysctl_tcp_early_retrans = 3;
2717 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2718 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
2719 net->ipv4.sysctl_tcp_retrans_collapse = 1;
2720 net->ipv4.sysctl_tcp_max_reordering = 300;
2721 net->ipv4.sysctl_tcp_dsack = 1;
2722 net->ipv4.sysctl_tcp_app_win = 31;
2723 net->ipv4.sysctl_tcp_adv_win_scale = 1;
2724 net->ipv4.sysctl_tcp_frto = 2;
2725 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2726 /* This limits the percentage of the congestion window which we
2727 * will allow a single TSO frame to consume. Building TSO frames
2728 * which are too large can cause TCP streams to be bursty.
2729 */
2730 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2731 /* Default TSQ limit of 16 TSO segments */
2732 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2733 	/* RFC 5961 challenge ACK rate limiting */
2734 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2735 net->ipv4.sysctl_tcp_min_tso_segs = 2;
2736 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2737 net->ipv4.sysctl_tcp_autocorking = 1;
2738 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2739 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2740 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2741 if (net != &init_net) {
2742 memcpy(net->ipv4.sysctl_tcp_rmem,
2743 init_net.ipv4.sysctl_tcp_rmem,
2744 sizeof(init_net.ipv4.sysctl_tcp_rmem));
2745 memcpy(net->ipv4.sysctl_tcp_wmem,
2746 init_net.ipv4.sysctl_tcp_wmem,
2747 sizeof(init_net.ipv4.sysctl_tcp_wmem));
2748 }
2749 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2750 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2751 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2752 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2753 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2754 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2755
2756 /* Reno is always built in */
2757 if (!net_eq(net, &init_net) &&
2758 try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2759 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2760 else
2761 net->ipv4.tcp_congestion_control = &tcp_reno;
2762
2763 return 0;
2764 fail:
2765 tcp_sk_exit(net);
2766
2767 return res;
2768 }
2769
2770 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2771 {
2772 struct net *net;
2773
2774 inet_twsk_purge(&tcp_hashinfo, AF_INET);
2775
2776 list_for_each_entry(net, net_exit_list, exit_list)
2777 tcp_fastopen_ctx_destroy(net);
2778 }
2779
2780 static struct pernet_operations __net_initdata tcp_sk_ops = {
2781 .init = tcp_sk_init,
2782 .exit = tcp_sk_exit,
2783 .exit_batch = tcp_sk_exit_batch,
2784 };
2785
2786 void __init tcp_v4_init(void)
2787 {
2788 if (register_pernet_subsys(&tcp_sk_ops))
2789 panic("Failed to create the TCP control socket.\n");
2790 }
2791