1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Based on net/ipv4/tcp.c
4  * Authors:	Ross Biro
5  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
6  *		Mark Evans, <evansmp@uhura.aston.ac.uk>
7  *		Corey Minyard <wf-rch!minyard@relay.EU.net>
8  *		Florian La Roche, <flla@stud.uni-sb.de>
9  *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
10  *		Linus Torvalds, <torvalds@cs.helsinki.fi>
11  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
12  *		Matthew Dillon, <dillon@apollo.west.oic.com>
13  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
14  *		Jorge Cwik, <jorge@laser.satlink.net>
15  *
16  * Fixes:
17  *		Alan Cox	:	Numerous verify_area() calls
18  *		Alan Cox	:	Set the ACK bit on a reset
19  *		Alan Cox	:	Stopped it crashing if it closed while
20  *					sk->inuse=1 and was trying to connect
21  *					(tcp_err()).
22  *		Alan Cox	:	All icmp error handling was broken
23  *					pointers passed where wrong and the
24  *					socket was looked up backwards. Nobody
25  *					tested any icmp error code obviously.
26  *		Alan Cox	:	tcp_err() now handled properly. It
27  *					wakes people on errors. poll
28  *					behaves and the icmp error race
29  *					has gone by moving it into sock.c
30  *		Alan Cox	:	tcp_send_reset() fixed to work for
31  *					everything not just packets for
32  *					unknown sockets.
33  *		Alan Cox	:	tcp option processing.
34  *		Alan Cox	:	Reset tweaked (still not 100%) [Had
35  *					syn rule wrong]
36  *		Herp Rosmanith  :	More reset fixes
37  *		Alan Cox	:	No longer acks invalid rst frames.
38  *					Acking any kind of RST is right out.
39  *		Alan Cox	:	Sets an ignore me flag on an rst
40  *					receive otherwise odd bits of prattle
41  *					escape still
42  *		Alan Cox	:	Fixed another acking RST frame bug.
43  *					Should stop LAN workplace lockups.
44  *		Alan Cox	:	Some tidyups using the new skb list
45  *					facilities
46  *		Alan Cox	:	sk->keepopen now seems to work
47  *		Alan Cox	:	Pulls options out correctly on accepts
48  *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
49  *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
50  *					bit to skb ops.
51  *		Alan Cox	:	Tidied tcp_data to avoid a potential
52  *					nasty.
53  *		Alan Cox	:	Added some better commenting, as the
54  *					tcp is hard to follow
55  *		Alan Cox	:	Removed incorrect check for 20 * psh
56  *	Michael O'Reilly	:	ack < copied bug fix.
57  *	Johannes Stille		:	Misc tcp fixes (not all in yet).
58  *		Alan Cox	:	FIN with no memory -> CRASH
59  *		Alan Cox	:	Added socket option proto entries.
60  *					Also added awareness of them to accept.
61  *		Alan Cox	:	Added TCP options (SOL_TCP)
62  *		Alan Cox	:	Switched wakeup calls to callbacks,
63  *					so the kernel can layer network
64  *					sockets.
65  *		Alan Cox	:	Use ip_tos/ip_ttl settings.
66  *		Alan Cox	:	Handle FIN (more) properly (we hope).
67  *		Alan Cox	:	RST frames sent on unsynchronised
68  *					state ack error.
69  *		Alan Cox	:	Put in missing check for SYN bit.
70  *		Alan Cox	:	Added tcp_select_window() aka NET2E
71  *					window non shrink trick.
72  *		Alan Cox	:	Added a couple of small NET2E timer
73  *					fixes
74  *		Charles Hedrick :	TCP fixes
75  *		Toomas Tamm	:	TCP window fixes
76  *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
77  *		Charles Hedrick	:	Rewrote most of it to actually work
78  *		Linus		:	Rewrote tcp_read() and URG handling
79  *					completely
80  *		Gerhard Koerting:	Fixed some missing timer handling
81  *		Matthew Dillon  :	Reworked TCP machine states as per RFC
82  *		Gerhard Koerting:	PC/TCP workarounds
83  *		Adam Caldwell	:	Assorted timer/timing errors
84  *		Matthew Dillon	:	Fixed another RST bug
85  *		Alan Cox	:	Move to kernel side addressing changes.
86  *		Alan Cox	:	Beginning work on TCP fastpathing
87  *					(not yet usable)
88  *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
89  *		Alan Cox	:	TCP fast path debugging
90  *		Alan Cox	:	Window clamping
91  *		Michael Riepe	:	Bug in tcp_check()
92  *		Matt Dillon	:	More TCP improvements and RST bug fixes
93  *		Matt Dillon	:	Yet more small nasties remove from the
94  *					TCP code (Be very nice to this man if
95  *					tcp finally works 100%) 8)
96  *		Alan Cox	:	BSD accept semantics.
97  *		Alan Cox	:	Reset on closedown bug.
98  *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
99  *		Michael Pall	:	Handle poll() after URG properly in
100  *					all cases.
101  *		Michael Pall	:	Undo the last fix in tcp_read_urg()
102  *					(multi URG PUSH broke rlogin).
103  *		Michael Pall	:	Fix the multi URG PUSH problem in
104  *					tcp_readable(), poll() after URG
105  *					works now.
106  *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
107  *					BSD api.
108  *		Alan Cox	:	Changed the semantics of sk->socket to
109  *					fix a race and a signal problem with
110  *					accept() and async I/O.
111  *		Alan Cox	:	Relaxed the rules on tcp_sendto().
112  *		Yury Shevchuk	:	Really fixed accept() blocking problem.
113  *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
114  *					clients/servers which listen in on
115  *					fixed ports.
116  *		Alan Cox	:	Cleaned the above up and shrank it to
117  *					a sensible code size.
118  *		Alan Cox	:	Self connect lockup fix.
119  *		Alan Cox	:	No connect to multicast.
120  *		Ross Biro	:	Close unaccepted children on master
121  *					socket close.
122  *		Alan Cox	:	Reset tracing code.
123  *		Alan Cox	:	Spurious resets on shutdown.
124  *		Alan Cox	:	Giant 15 minute/60 second timer error
125  *		Alan Cox	:	Small whoops in polling before an
126  *					accept.
127  *		Alan Cox	:	Kept the state trace facility since
128  *					it's handy for debugging.
129  *		Alan Cox	:	More reset handler fixes.
130  *		Alan Cox	:	Started rewriting the code based on
131  *					the RFC's for other useful protocol
132  *					references see: Comer, KA9Q NOS, and
133  *					for a reference on the difference
134  *					between specifications and how BSD
135  *					works see the 4.4lite source.
136  *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
137  *					close.
138  *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
139  *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
140  *		Alan Cox	:	Reimplemented timers as per the RFC
141  *					and using multiple timers for sanity.
142  *		Alan Cox	:	Small bug fixes, and a lot of new
143  *					comments.
144  *		Alan Cox	:	Fixed dual reader crash by locking
145  *					the buffers (much like datagram.c)
146  *		Alan Cox	:	Fixed stuck sockets in probe. A probe
147  *					now gets fed up of retrying without
148  *					(even a no space) answer.
149  *		Alan Cox	:	Extracted closing code better
150  *		Alan Cox	:	Fixed the closing state machine to
151  *					resemble the RFC.
152  *		Alan Cox	:	More 'per spec' fixes.
153  *		Jorge Cwik	:	Even faster checksumming.
154  *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
155  *					only frames. At least one pc tcp stack
156  *					generates them.
157  *		Alan Cox	:	Cache last socket.
158  *		Alan Cox	:	Per route irtt.
159  *		Matt Day	:	poll()->select() match BSD precisely on error
160  *		Alan Cox	:	New buffers
161  *		Marc Tamsky	:	Various sk->prot->retransmits and
162  *					sk->retransmits misupdating fixed.
163  *					Fixed tcp_write_timeout: stuck close,
164  *					and TCP syn retries gets used now.
165  *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
166  *					ack if state is TCP_CLOSED.
167  *		Alan Cox	:	Look up device on a retransmit - routes may
168  *					change. Doesn't yet cope with MSS shrink right
169  *					but it's a start!
170  *		Marc Tamsky	:	Closing in closing fixes.
171  *		Mike Shaver	:	RFC1122 verifications.
172  *		Alan Cox	:	rcv_saddr errors.
173  *		Alan Cox	:	Block double connect().
174  *		Alan Cox	:	Small hooks for enSKIP.
175  *		Alexey Kuznetsov:	Path MTU discovery.
176  *		Alan Cox	:	Support soft errors.
177  *		Alan Cox	:	Fix MTU discovery pathological case
178  *					when the remote claims no mtu!
179  *		Marc Tamsky	:	TCP_CLOSE fix.
180  *		Colin (G3TNE)	:	Send a reset on syn ack replies in
181  *					window but wrong (fixes NT lpd problems)
182  *		Pedro Roque	:	Better TCP window handling, delayed ack.
183  *		Joerg Reuter	:	No modification of locked buffers in
184  *					tcp_do_retransmit()
185  *		Eric Schenk	:	Changed receiver side silly window
186  *					avoidance algorithm to BSD style
187  *					algorithm. This doubles throughput
188  *					against machines running Solaris,
189  *					and seems to result in general
190  *					improvement.
191  *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
192  *	Willy Konynenberg	:	Transparent proxying support.
193  *	Mike McLagan		:	Routing by source
194  *		Keith Owens	:	Do proper merging with partial SKB's in
195  *					tcp_do_sendmsg to avoid burstiness.
196  *		Eric Schenk	:	Fix fast close down bug with
197  *					shutdown() followed by close().
198  *		Andi Kleen	:	Make poll agree with SIGIO
199  *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
200  *					lingertime == 0 (RFC 793 ABORT Call)
201  *	Hirokazu Takahashi	:	Use copy_from_user() instead of
202  *					csum_and_copy_from_user() if possible.
203  *
204  * Based on net/ipv4/tcp_ipv4.c
205  *		See tcp.c for author information
206  *
207  * Changes:
208  *		David S. Miller	:	New socket lookup architecture.
209  *					This code is dedicated to John Dyson.
210  *		David S. Miller :	Change semantics of established hash,
211  *					half is devoted to TIME_WAIT sockets
212  *					and the rest go in the other half.
213  *		Andi Kleen :		Add support for syncookies and fixed
214  *					some bugs: ip options weren't passed to
215  *					the TCP layer, missed a check for an
216  *					ACK bit.
217  *		Andi Kleen :		Implemented fast path mtu discovery.
218  *						Fixed many serious bugs in the
219  *					request_sock handling and moved
220  *					most of it into the af independent code.
221  *					Added tail drop and some other bugfixes.
222  *					Added new listen semantics.
223  *		Mike McLagan	:	Routing by source
224  *	Juan Jose Ciarlante:		ip_dynaddr bits
225  *		Andi Kleen:		various fixes.
226  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
227  *					coma.
228  *	Andi Kleen		:	Fix new listen.
229  *	Andi Kleen		:	Fix accept error reporting.
230  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
231  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
232  *					a single port at the same time.
233  *
234  * Based on net/ipv6/tcp_ipv6.c
235  *	Authors:
236  *	Pedro Roque		<roque@di.fc.ul.pt>
237  *
238  *	Fixes:
239  *	Hideaki YOSHIFUJI	:	sin6_scope_id support
240  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
241  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
242  *					a single port at the same time.
243  *	YOSHIFUJI Hideaki @USAGI:	convert /proc/net/tcp6 to seq_file.
244  *
245  * Based on net/core/stream.c
246  *     Authors:        Arnaldo Carvalho de Melo <acme@conectiva.com.br>
247  *                     (from old tcp.c code)
248  *                     Alan Cox <alan@lxorguk.ukuu.org.uk> (Borrowed comments 8-))
249  *
250  * Based on net/ipv4/tcp_output.c
251  * Authors:	Ross Biro
252  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
253  *		Mark Evans, <evansmp@uhura.aston.ac.uk>
254  *		Corey Minyard <wf-rch!minyard@relay.EU.net>
255  *		Florian La Roche, <flla@stud.uni-sb.de>
256  *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
257  *		Linus Torvalds, <torvalds@cs.helsinki.fi>
258  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
259  *		Matthew Dillon, <dillon@apollo.west.oic.com>
260  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
261  *		Jorge Cwik, <jorge@laser.satlink.net>
262  *
263  * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
264  *				:	Fragmentation on mtu decrease
265  *				:	Segment collapse on retransmit
266  *				:	AF independence
267  *
268  *		Linus Torvalds	:	send_delayed_ack
269  *		David S. Miller	:	Charge memory using the right skb
270  *					during syn/ack processing.
271  *		David S. Miller :	Output engine completely rewritten.
272  *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
273  *		Cacophonix Gaul :	draft-minshall-nagle-01
274  *		J Hadi Salim	:	ECN support
275  *
276  * Based on net/ipv4/tcp_input.c
277  * Authors:	Ross Biro
278  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
279  *		Mark Evans, <evansmp@uhura.aston.ac.uk>
280  *		Corey Minyard <wf-rch!minyard@relay.EU.net>
281  *		Florian La Roche, <flla@stud.uni-sb.de>
282  *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
283  *		Linus Torvalds, <torvalds@cs.helsinki.fi>
284  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
285  *		Matthew Dillon, <dillon@apollo.west.oic.com>
286  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
287  *		Jorge Cwik, <jorge@laser.satlink.net>
288  *
289  * Changes:
290  *		Pedro Roque	:	Fast Retransmit/Recovery.
291  *					Two receive queues.
292  *					Retransmit queue handled by TCP.
293  *					Better retransmit timer handling.
294  *					New congestion avoidance.
295  *					Header prediction.
296  *					Variable renaming.
297  *
298  *		Eric		:	Fast Retransmit.
299  *		Randy Scott	:	MSS option defines.
300  *		Eric Schenk	:	Fixes to slow start algorithm.
301  *		Eric Schenk	:	Yet another double ACK bug.
302  *		Eric Schenk	:	Delayed ACK bug fixes.
303  *		Eric Schenk	:	Floyd style fast retrans war avoidance.
304  *		David S. Miller	:	Don't allow zero congestion window.
305  *		Eric Schenk	:	Fix retransmitter so that it sends
306  *					next packet on ack of previous packet.
307  *		Andi Kleen	:	Moved open_request checking here
308  *					and process RSTs for open_requests.
309  *		Andi Kleen	:	Better prune_queue, and other fixes.
310  *		Andrey Savochkin:	Fix RTT measurements in the presence of
311  *					timestamps.
312  *		Andrey Savochkin:	Check sequence numbers correctly when
313  *					removing SACKs due to in sequence incoming
314  *					data segments.
315  *		Andi Kleen:		Make sure we never ack data there is not
316  *					enough room for. Also make this condition
317  *					a fatal error if it might still happen.
318  *		Andi Kleen:		Add tcp_measure_rcv_mss to make
319  *					connections with MSS<min(MTU,ann. MSS)
320  *					work without delayed acks.
321  *		Andi Kleen:		Process packets with PSH set in the
322  *					fast path.
323  *		J Hadi Salim:		ECN support
324  *		Andrei Gurtov,
325  *		Pasi Sarolahti,
326  *		Panu Kuhlberg:		Experimental audit of TCP (re)transmission
327  *					engine. Lots of bugs are found.
328  *		Pasi Sarolahti:		F-RTO for dealing with spurious RTOs
329  *
330  * NewIP INET
331  * An implementation of the TCP/IP protocol suite for the LINUX
332  * operating system. NewIP INET is implemented using the  BSD Socket
333  * interface as the means of communication with the user level.
334  *
335  * Implementation of the Transmission Control Protocol(TCP).
336  *
337  * TCP over NewIP
338  *
339  * Description of States:
340  *
341  *    TCP_SYN_SENT      sent a connection request, waiting for ack
342  *
343  *    TCP_SYN_RECV      received a connection request, sent ack,
344  *                      waiting for final ack in three-way handshake.
345  *
346  *    TCP_ESTABLISHED   connection established
347  *
348  *    TCP_FIN_WAIT1     our side has shutdown, waiting to complete
349  *                      transmission of remaining buffered data
350  *
351  *    TCP_FIN_WAIT2     all buffered data sent, waiting for remote
352  *                      to shutdown
353  *
354  *    TCP_CLOSING       both sides have shutdown but we still have
355  *                      data we have to finish sending
356  *
357  *    TCP_TIME_WAIT     timeout to catch resent junk before entering
358  *                      closed, can only be entered from FIN_WAIT2
359  *                      or CLOSING.  Required because the other end
360  *                      may not have gotten our last ACK causing it
361  *                      to retransmit the data packet (which we ignore)
362  *
363  *    TCP_CLOSE_WAIT    remote side has shutdown and is waiting for
364  *                      us to finish writing our data and to shutdown
365  *                      (we have to close() to move on to LAST_ACK)
366  *
 367  *    TCP_LAST_ACK      our side has shutdown after remote has
368  *                      shutdown.  There may still be data in our
369  *                      buffer that we have to finish sending
370  *
371  *    TCP_CLOSE         socket is finished
372  */
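/*
 * Illustrative example (editorial note, derived from the state descriptions
 * above): a side performing an active close typically walks
 *
 *	TCP_ESTABLISHED -> TCP_FIN_WAIT1 -> TCP_FIN_WAIT2 -> TCP_TIME_WAIT -> TCP_CLOSE
 *
 * while the peer that receives the FIN walks
 *
 *	TCP_ESTABLISHED -> TCP_CLOSE_WAIT -> TCP_LAST_ACK -> TCP_CLOSE
 */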
373 #define pr_fmt(fmt) KBUILD_MODNAME ": [%s:%d] " fmt, __func__, __LINE__
374 
375 #include <linux/module.h>
376 #include <linux/errno.h>
377 #include <linux/types.h>
378 #include <linux/socket.h>
379 #include <linux/sockios.h>
380 #include <linux/net.h>
381 #include <linux/jiffies.h>
382 #include <linux/netdevice.h>
383 #include <linux/init.h>
384 #include <linux/jhash.h>
385 #include <linux/times.h>
386 #include <linux/random.h>
387 #include <linux/seq_file.h>
388 
389 #include <net/tcp.h>
390 #include <net/ninet_hashtables.h>
391 #include <net/ninet_connection_sock.h>
392 #include <net/protocol.h>
393 #include <net/dsfield.h>
394 #include <net/timewait_sock.h>
395 #include <net/inet_common.h>
396 #include <net/secure_seq.h>
397 #include <net/nip.h>
398 #include <net/tcp_nip.h>
399 #include <net/nip_addrconf.h>
400 #include <net/nip_route.h>
401 #include <linux/nip.h>
402 #include "nip_checksum.h"
403 #include "nip_hdr.h"
404 #include "tcp_nip_parameter.h"
405 
406 #define tcp_header_length(th) ((th)->doff << 2)
407 #define TCP_ACK_NUM_MULTIPLIER      20
408 #define TCP_WINDOW_RAISE_THRESHOLD  2
409 #define TCP_BACKLOG_HEADROOM        (64 * 1024)
410 #define BYTES_PER_TCP_HEADER        4
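/*
 * Illustrative example (editorial note): the TCP doff field counts 32-bit words,
 * so tcp_header_length(th) == (th)->doff << 2 yields the header size in bytes,
 * e.g. doff = 5 (no options) -> 20 bytes, doff = 15 (maximum) -> 60 bytes.
 */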
411 
412 static const struct inet_connection_sock_af_ops newip_specific;
413 
414 static void tcp_nip_push(struct sock *sk, int flags, int mss_now,
415 			 int nonagle, int size_goal)
416 {
417 	__tcp_nip_push_pending_frames(sk, mss_now, nonagle);
418 }
419 
420 static const unsigned char new_state[16] = {
421   /* current state:        new state:      action: */
422 [0]	= TCP_CLOSE,
423 [TCP_ESTABLISHED]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
424 [TCP_SYN_SENT]	= TCP_CLOSE,
425 [TCP_SYN_RECV]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
426 [TCP_FIN_WAIT1]	= TCP_FIN_WAIT1,
427 [TCP_FIN_WAIT2]	= TCP_FIN_WAIT2,
428 [TCP_TIME_WAIT]	= TCP_CLOSE,
429 [TCP_CLOSE]		= TCP_CLOSE,
430 [TCP_CLOSE_WAIT]	= TCP_LAST_ACK  | TCP_ACTION_FIN,
431 [TCP_LAST_ACK]	= TCP_LAST_ACK,
432 [TCP_LISTEN]		= TCP_CLOSE,
433 [TCP_CLOSING]		= TCP_CLOSING,
434 [TCP_NEW_SYN_RECV]	= TCP_CLOSE, /* should not happen ! */
435 };
436 
437 bool nip_get_tcp_input_checksum(struct sk_buff *skb)
438 {
439 	struct nip_pseudo_header nph = {0};
440 
441 	nph.nexthdr = nipcb(skb)->nexthdr;
442 	nph.saddr = nipcb(skb)->srcaddr;
443 	nph.daddr = nipcb(skb)->dstaddr;
444 
445 	nph.check_len = htons(skb->len);
446 	return nip_check_sum_parse(skb_transport_header(skb),
447 				   skb->len, &nph)
448 				   == 0xffff ? true : false;
449 }
450 
451 static int tcp_nip_close_state(struct sock *sk)
452 {
453 	int next;
454 	int ns;
455 
456 	if (sk->sk_state >= TCP_MAX_STATES)
457 		return TCP_ACTION_FIN;
458 
459 	next = (int)new_state[sk->sk_state];
460 	ns = next & TCP_STATE_MASK;
461 	tcp_set_state(sk, ns);
462 
463 	return next & TCP_ACTION_FIN;
464 }
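/*
 * Illustrative example (editorial note): for a socket in TCP_ESTABLISHED,
 * new_state[TCP_ESTABLISHED] == (TCP_FIN_WAIT1 | TCP_ACTION_FIN), so
 * tcp_nip_close_state() moves the socket to TCP_FIN_WAIT1 and returns non-zero,
 * telling callers such as tcp_nip_close() and tcp_nip_shutdown() to send a FIN.
 */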
465 
466 void sk_nip_stream_kill_queues(struct sock *sk)
467 {
468 	/* First the read buffer. */
469 	__skb_queue_purge(&sk->sk_receive_queue);
470 
471 	/* Next, the error queue. */
472 	__skb_queue_purge(&sk->sk_error_queue);
473 
474 	/* Next, the write queue. */
475 	WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
476 
477 	WARN_ON(sk->sk_wmem_queued);
478 }
479 
480 void tcp_nip_shutdown(struct sock *sk, int how)
481 {
482 	if (!(how & SEND_SHUTDOWN))
483 		return;
484 
485 	/* If we've already sent a FIN, or it's a closed state, skip this. */
486 	if ((1 << sk->sk_state) &
487 	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
488 	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
489 		/* Clear out any half completed packets.  FIN if needed. */
490 		if (tcp_nip_close_state(sk))
491 			tcp_nip_send_fin(sk);
492 	}
493 }
494 
495 void tcp_nip_close(struct sock *sk, long timeout)
496 {
497 	struct sk_buff *skb;
498 	int data_was_unread = 0;
499 	int state;
500 	u32 sk_ack_backlog;
501 
502 	lock_sock(sk);
503 	sk->sk_shutdown = SHUTDOWN_MASK;
504 
505 	nip_dbg("sk_state:%d", sk->sk_state);
506 
507 	if (sk->sk_state == TCP_LISTEN) {
508 		tcp_set_state(sk, TCP_CLOSE);
509 
510 		sk_ack_backlog = READ_ONCE(sk->sk_ack_backlog);
511 		inet_csk_listen_stop(sk);
512 		nip_dbg("sk_state CLOSE, sk_ack_backlog=%u to %u, sk_max_ack_backlog=%u",
513 			sk_ack_backlog, READ_ONCE(sk->sk_ack_backlog),
514 			READ_ONCE(sk->sk_max_ack_backlog));
515 		goto adjudge_to_death;
516 	}
517 
518 	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
519 		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
520 
521 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
522 			len--;
523 		data_was_unread += len;
524 		__kfree_skb(skb);
525 	}
526 
527 	if (sk->sk_state == TCP_CLOSE)
528 		goto adjudge_to_death;
529 
530 	if (data_was_unread) {
531 		tcp_set_state(sk, TCP_CLOSE);
532 		nip_sock_debug(sk, __func__);
533 		tcp_nip_send_active_reset(sk, sk->sk_allocation);
534 	} else if (tcp_nip_close_state(sk)) {
535 		/* RED-PEN. Formally speaking, we have broken TCP state
536 		 * machine. State transitions:
537 		 *
538 		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
539 		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
540 		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
541 		 */
542 		nip_dbg("ready to send fin, sk_state=%d", sk->sk_state);
543 		nip_sock_debug(sk, __func__);
544 		tcp_nip_send_fin(sk);
545 	}
546 
547 adjudge_to_death:
548 	state = sk->sk_state;
549 	sock_hold(sk);
550 	sock_orphan(sk);
551 
552 	/* It is the last release_sock in its life. It will remove backlog. */
553 	release_sock(sk);
554 
555 	local_bh_disable();
556 	bh_lock_sock(sk);
557 	WARN_ON(sock_owned_by_user(sk));
558 
559 	this_cpu_dec(*sk->sk_prot->orphan_count);
560 
561 	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
562 		goto out;
563 
564 	if (sk->sk_state == TCP_CLOSE)
565 		inet_csk_destroy_sock(sk);
566 
567 out:
568 	bh_unlock_sock(sk);
569 	local_bh_enable();
570 	sock_put(sk);
571 }
572 
573 /* These states need RST on ABORT according to RFC793 */
574 static inline bool tcp_nip_need_reset(int state)
575 {
576 	return (1 << state) &
577 	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
578 			TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
579 }
580 
581 /* Function
582  *    Initialize some of the parameters in request_sock
583  * Parameter
584  *    req: Request connection control block
585  *    sk_listener: Transmission control block
 586  *    skb: Socket buffer
587  */
588 static void tcp_nip_init_req(struct request_sock *req,
589 			     const struct sock *sk_listener,
590 			     struct sk_buff *skb)
591 {
592 	struct inet_request_sock *ireq = inet_rsk(req);
593 
594 	ireq->IR_NIP_RMT_ADDR = nipcb(skb)->srcaddr;
595 	ireq->IR_NIP_LOC_ADDR = nipcb(skb)->dstaddr;
596 }
597 
598 /* Function
 599  *    Initialize the initial sequence number (ISN). The server's initial
 600  *    sequence number is derived from the source address, source port,
 601  *    destination address, and destination port.
 602  * Parameter
 603  *    skb: Socket buffer
604  */
605 static __u32 tcp_nip_init_sequence(const struct sk_buff *skb)
606 {
607 	return secure_tcp_nip_sequence_number(nipcb(skb)->dstaddr.NIP_ADDR_FIELD32,
608 					    nipcb(skb)->srcaddr.NIP_ADDR_FIELD32,
609 					    tcp_hdr(skb)->dest,
610 					    tcp_hdr(skb)->source);
611 }
612 
613 static struct dst_entry *tcp_nip_route_req(const struct sock *sk,
614 					   struct flowi *fl,
615 					   const struct request_sock *req)
616 {
617 	struct dst_entry *dst;
618 	struct inet_request_sock *ireq = inet_rsk(req);
619 	struct flow_nip fln;
620 
621 	fln.daddr = ireq->IR_NIP_RMT_ADDR;
622 	dst = nip_route_output(sock_net(sk), sk, &fln);
623 	return dst;
624 }
625 
626 /* Function
 627  *    Used by the client transport layer to initiate a connection request.
 628  *    It sets the source address, destination address and outgoing interface.
 629  * Parameter
 630  *    sk: Transmission control block
 631  *    uaddr: The destination address
 632  *    addr_len: Destination address length
633  */
634 static int tcp_nip_connect(struct sock *sk, struct sockaddr *uaddr,
635 			   int addr_len)
636 {
637 	struct sockaddr_nin *usin = (struct sockaddr_nin *)uaddr;
638 	struct inet_sock *inet = inet_sk(sk);
639 	struct tcp_sock *tp = tcp_sk(sk);
640 	__be16 orig_dport;
641 	struct nip_addr *daddr;
642 	struct dst_entry *dst;
643 	int err;
644 	struct ip_options_rcu *inet_opt;
645 	struct inet_timewait_death_row *tcp_death_row;
646 	struct flow_nip fln;
647 
648 	fln.daddr = usin->sin_addr;
649 
650 	if (addr_len < sizeof(struct sockaddr_nin))
651 		return -EINVAL;
652 
653 	if (usin->sin_family != AF_NINET)
654 		return -EAFNOSUPPORT;
655 
656 	inet_opt = rcu_dereference_protected(inet->inet_opt,
657 					     lockdep_sock_is_held(sk));
658 	/* Destination ADDRESS and port */
659 	daddr = &usin->sin_addr;
660 	orig_dport = usin->sin_port;
661 
662 	/* Find the route and obtain the source address */
663 	nip_dbg("sk->sk_bound_dev_if is %d", sk->sk_bound_dev_if);
664 	fln.FLOWIN_OIF = sk->sk_bound_dev_if;
665 	dst = nip_dst_lookup_flow(sock_net(sk), sk, &fln, NULL);
666 	if (IS_ERR(dst)) {
667 		nip_dbg("cannot find dst");
668 		err = PTR_ERR(dst);
669 		goto failure;
670 	}
671 
672 	/* find the actual source addr for sk->SK_NIP_RCV_SADDR */
673 	if (nip_addr_eq(&sk->SK_NIP_RCV_SADDR, &nip_any_addr))
674 		sk->SK_NIP_RCV_SADDR = fln.saddr;
675 	fln.saddr = sk->SK_NIP_RCV_SADDR;
676 
677 	if (nip_addr_invalid(&fln.daddr)) {
678 		nip_dbg("nip daddr invalid, bitlen=%u", fln.daddr.bitlen);
679 		err = -EFAULT;
680 		goto failure;
681 	}
682 
683 	if (nip_addr_invalid(&fln.saddr)) {
684 		nip_dbg("nip saddr invalid, bitlen=%u", fln.saddr.bitlen);
685 		err = -EFAULT;
686 		goto failure;
687 	}
688 
689 	/* The destination address and port are set to the transport control block */
690 	inet->inet_dport = usin->sin_port;
691 	sk->SK_NIP_DADDR = usin->sin_addr;
692 
693 	inet_csk(sk)->icsk_ext_hdr_len = 0;
694 	if (inet_opt)
695 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
696 
697 	tcp_set_state(sk, TCP_SYN_SENT);
698 	sk_set_txhash(sk);
699 	sk_dst_set(sk, dst);
700 
701 	/* Dynamically bind local ports */
702 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
703 	err = ninet_hash_connect(tcp_death_row, sk);
704 	if (err)
705 		goto late_failure;
706 
707 	/* Check whether the transport control block has been connected before */
708 	if (tp->rx_opt.ts_recent_stamp) {
709 		/* Reset inherited state */
710 		tp->rx_opt.ts_recent	   = 0;
711 		tp->rx_opt.ts_recent_stamp = 0;
712 		if (likely(!tp->repair))
713 			tp->write_seq	   = 0;
714 	}
715 
716 	if (!tp->write_seq)
717 		tp->write_seq =
718 		secure_tcp_nip_sequence_number(sk->SK_NIP_RCV_SADDR.NIP_ADDR_FIELD32,
719 					       sk->SK_NIP_DADDR.NIP_ADDR_FIELD32,
720 					       inet->inet_sport,
721 					       usin->sin_port);
722 
723 	inet->inet_id = prandom_u32();
724 
725 	/* Call __tcp_nip_connect to send the SYN segment */
726 	err = __tcp_nip_connect(sk);
727 	if (err)
728 		goto late_failure;
729 	nip_sock_debug(sk, __func__);
730 	return 0;
731 
732 /* failure after tcp_set_state(sk, TCP_SYN_SENT) */
733 late_failure:
734 	tcp_set_state(sk, TCP_CLOSE);
735 failure:
736 	nip_sock_debug_output(&usin->sin_addr, &sk->SK_NIP_RCV_SADDR,
737 			      usin->sin_port, inet->inet_sport, __func__);
738 	sk->sk_route_caps = 0;
739 	inet->inet_dport = 0;
740 	return err;
741 }
742 
743 static void tcp_nip_send_reset(struct sock *sk, struct sk_buff *skb)
744 {
745 	const struct tcphdr *th = tcp_hdr(skb);
746 	u32 seq = 0;
747 	u32 ack_seq = 0;
748 	u32 priority = gfp_any();
749 
750 	/* Never send a reset in response to a reset. */
751 	if (th->rst)
752 		return;
753 
754 	nip_dbg("send rst");
755 	if (th->ack)
756 		seq = ntohl(th->ack_seq);
757 	else
758 		ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len -
759 			  tcp_header_length(th);
760 
761 	tcp_nip_actual_send_reset(sk, skb, seq, ack_seq, 0, 1, priority);
762 }
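/*
 * For reference (editorial note, RFC 793 RST generation rule as reflected in the
 * seq/ack_seq computation above): if the offending segment carried an ACK, the
 * RST uses SEQ = SEG.ACK; otherwise SEQ = 0 and the acknowledgment is set to
 * SEG.SEQ + SEG.LEN, where SYN and FIN each count as one octet of sequence space.
 */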
763 
764 /* Function
 765  *    Used by the server to send SYN+ACK segments
766  * Parameter
767  *    sk: Transmission control block
 768  *    dst: Route entry
769  *    flowi: Flow control block
770  *    req: Request connection control block
771  *    foc: Fast open options
772  *    synack_type: Type of the SYN+ACK segment
773  */
774 static int tcp_nip_send_synack(const struct sock *sk, struct dst_entry *dst,
775 			       struct flowi *fl,
776 			       struct request_sock *req,
777 			       struct tcp_fastopen_cookie *foc,
778 			       enum tcp_synack_type synack_type,
779 			       struct sk_buff *syn_skb)
780 {
781 	struct sk_buff *skb;
782 	int err = -ENOMEM;
783 
784 	skb = tcp_nip_make_synack(sk, dst, req, foc, synack_type);
785 	if (skb) {
786 		nip_dbg("TCP server create SYN+ACK skb successfully");
787 		rcu_read_lock();
788 		err = nip_send_synack(req, skb);
789 		rcu_read_unlock();
790 	}
791 
792 	return err;
793 }
794 
795 static void tcp_nip_reqsk_destructor(struct request_sock *req)
796 {
797 	;
798 }
799 
800 static void tcp_nip_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
801 				   struct request_sock *req)
802 {
803 }
804 
805 static void tcp_nip_reqsk_send_reset(const struct sock *sk, struct sk_buff *skb)
806 {
807 }
808 
809 static void tcp_nip_reqsk_syn_ack_timeout(const struct request_sock *req)
810 {
811 }
812 
813 struct request_sock_ops tcp_nip_request_sock_ops __read_mostly = {
814 	.family		 =	AF_NINET,
815 	.obj_size	 =	sizeof(struct tcp_nip_request_sock),
816 	.rtx_syn_ack	 =	tcp_nip_rtx_synack,
817 	.send_ack	 =	tcp_nip_reqsk_send_ack,
818 	.destructor	 =	tcp_nip_reqsk_destructor,
819 	.send_reset	 =	tcp_nip_reqsk_send_reset,
820 	.syn_ack_timeout =	tcp_nip_reqsk_syn_ack_timeout,
821 };
822 
823 #ifdef CONFIG_TCP_MD5SIG
824 static int nip_calc_md5_hash(char *location, const struct tcp_md5sig_key *md5,
825 			     const struct sock *sk, const struct sk_buff *skb)
826 {
827 	return -EINVAL;
828 }
829 
830 static struct tcp_md5sig_key *nip_req_md5_lookup(const struct sock *sk,
831 						 const struct sock *addr_sk)
832 {
833 	return NULL;
834 }
835 #endif
836 
837 #ifdef CONFIG_SYN_COOKIES
838 static __u32 nip_cookie_init_seq(const struct sk_buff *skb, __u16 *mss)
839 {
840 	return 0;
841 }
842 #endif
843 
844 static u32 tcp_nip_init_ts_off(const struct net *net, const struct sk_buff *skb)
845 {
846 	return 0;
847 }
848 
849 static const struct tcp_request_sock_ops tcp_request_sock_newip_ops = {
850 	.mss_clamp	=	TCP_BASE_MSS,
851 #ifdef CONFIG_TCP_MD5SIG
852 	.req_md5_lookup	=	nip_req_md5_lookup,
853 	.calc_md5_hash	=	nip_calc_md5_hash,
854 #endif
855 	.init_req	=	tcp_nip_init_req,
856 #ifdef CONFIG_SYN_COOKIES
857 	.cookie_init_seq =	nip_cookie_init_seq,
858 #endif
859 	.route_req	=	tcp_nip_route_req,
860 	.init_seq	=	tcp_nip_init_sequence,
861 	.send_synack	=	tcp_nip_send_synack,
862 	.init_ts_off	=	tcp_nip_init_ts_off,
863 };
864 
865 /* Function
 866  *    Save the route cache from the SKB into the transport control block
 867  * Parameter
 868  *    sk: Transmission control block
 869  *    skb: Socket buffer
873  */
874 void ninet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
875 {
876 	struct dst_entry *dst = skb_dst(skb);
877 
878 	if (dst && dst_hold_safe(dst)) {
879 		rcu_assign_pointer(sk->sk_rx_dst, dst);
880 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
881 	}
882 }
883 
884 /* Function
885  *    A function used by the server to process client connection requests
886  * Parameter
 887  *    skb: Socket buffer
888  *    skb: Transfer control block buffer
889  */
890 static int tcp_nip_conn_request(struct sock *sk, struct sk_buff *skb)
891 {
892 	return _tcp_nip_conn_request(&tcp_nip_request_sock_ops,
893 				     &tcp_request_sock_newip_ops, sk, skb);
894 }
895 
896 /* Function
 897  *    Create the child transmission control block
 898  * Parameter
 899  *    sk: Transmission control block
 900  *    skb: Socket buffer
 901  *    req: Request connection control block
 902  *    dst: Route entry
 903  *    req_unhash: Request connection control block to unhash
904  */
905 static struct sock *tcp_nip_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
906 					  struct request_sock *req,
907 					  struct dst_entry *dst,
908 					  struct request_sock *req_unhash,
909 					  bool *own_req)
910 {
911 	struct tcp_nip_request_sock *niptreq = tcp_nip_rsk(req);
912 	struct inet_request_sock *ireq = inet_rsk(req);
913 	bool found_dup_sk = false;
914 	struct tcp_nip_sock *newtcpnipsk;
915 	struct inet_sock *newinet;
916 	struct tcp_sock *newtp;
917 	struct sock *newsk;
918 	struct flow_nip fln;
919 
920 	if (sk_acceptq_is_full(sk))
921 		goto out_overflow;
922 
923 	fln.daddr = ireq->IR_NIP_RMT_ADDR;
924 	if (!dst) {
925 		dst = nip_route_output(sock_net(sk), sk, &fln);
926 		if (!dst)
927 			goto out;
928 	}
929 
930 	newsk = tcp_nip_create_openreq_child(sk, req, skb);
931 	if (!newsk)
932 		goto out_nonewsk;
933 
934 	/* Save the received route cache */
935 	ninet_sk_rx_dst_set(newsk, skb);
936 
937 	newtcpnipsk = (struct tcp_nip_sock *)newsk;
938 	newtcpnipsk->common = niptreq->common;
939 
940 	newtp = tcp_sk(newsk);
941 	newinet = inet_sk(newsk);
942 
943 	newsk->SK_NIP_DADDR = ireq->IR_NIP_RMT_ADDR;
944 	newsk->SK_NIP_RCV_SADDR = ireq->IR_NIP_LOC_ADDR;
945 
946 	newinet->inet_opt = NULL;
947 
948 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
949 
950 	newtp->retrans_stamp = jiffies;
951 
952 	/* Negotiate MSS */
953 	newtp->mss_cache = TCP_BASE_MSS;
954 	newtp->out_of_order_queue = RB_ROOT;
955 	newtp->advmss = dst_metric_advmss(dst);
956 	if (tcp_sk(sk)->rx_opt.user_mss &&
957 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
958 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
959 
960 	tcp_nip_initialize_rcv_mss(newsk);
961 	if (__inet_inherit_port(sk, newsk) < 0)
962 		goto put_and_exit;
963 	/* If deleting the old sock from the ehash table and adding the new sock
964 	 * succeed, *own_req is set to true.
965 	 */
966 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
967 				       &found_dup_sk);
968 
969 	/* newip newsk doesn't save this dst. release it. */
970 	dst_release(dst);
971 	return newsk;
972 
973 out_overflow:
974 	__NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
975 out_nonewsk:
976 out:
977 	/* newip newsk doesn't save this dst. release it. */
978 	dst_release(dst);
979 	tcp_listendrop(sk);
980 	return NULL;
981 put_and_exit:
982 	newinet->inet_opt = NULL;
983 	inet_csk_prepare_forced_close(newsk);
984 	tcp_nip_done(newsk);
985 	goto out;
986 }
987 
988 static void tcp_nip_send_check(struct sock *sk, struct sk_buff *skb)
989 {
990 }
991 
992 static int tcp_nip_rebuild_header(struct sock *sk)
993 {
994 	return -EINVAL;
995 }
996 
997 static void nip_addr2sockaddr(struct sock *sk, struct sockaddr *addr)
998 {
999 }
1000 
1001 static void nip_mtu_reduced(struct sock *sk)
1002 {
1003 }
1004 
1005 static const struct inet_connection_sock_af_ops newip_specific = {
1006 	.queue_xmit		= tcp_nip_queue_xmit,
1007 	.send_check		= tcp_nip_send_check,
1008 	.rebuild_header		= tcp_nip_rebuild_header,
1009 	.sk_rx_dst_set		= ninet_sk_rx_dst_set,
1010 	.conn_request		= tcp_nip_conn_request,
1011 	.syn_recv_sock		= tcp_nip_syn_recv_sock,
1012 	.net_header_len		= 0,
1013 	.net_frag_header_len	= 0,
1014 	.setsockopt		= nip_setsockopt,
1015 	.getsockopt		= nip_getsockopt,
1016 	.addr2sockaddr		= nip_addr2sockaddr,
1017 	.sockaddr_len		= sizeof(struct sockaddr_nin),
1018 	.mtu_reduced		= nip_mtu_reduced,
1019 };
1020 
1021 #if IS_ENABLED(CONFIG_NEWIP_FAST_KEEPALIVE)
1022 #define MAX_NIP_TCP_KEEPIDLE	32767
1023 #define MAX_NIP_TCP_KEEPINTVL	32767
1024 #define MAX_NIP_TCP_KEEPCNT	255
1025 static int tcp_nip_keepalive_para_update(struct sock *sk,
1026 					 u32 keepalive_time,
1027 					 u32 keepalive_intvl,
1028 					 u8 keepalive_probes)
1029 {
1030 	int val;
1031 	struct tcp_sock *tp = tcp_sk(sk);
1032 
1033 	/* set keep idle (TCP_KEEPIDLE) */
1034 	val = keepalive_time;
1035 	if (val < 1 || val > MAX_NIP_TCP_KEEPIDLE) {
1036 		nip_dbg("keepalive_time(%u) invalid", val);
1037 		return -EINVAL;
1038 	}
1039 
1040 	tp->keepalive_time = val;
1041 	if (sock_flag(sk, SOCK_KEEPOPEN) &&
1042 	    !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
1043 		u32 elapsed = keepalive_time_elapsed(tp);
1044 
1045 		if (tp->keepalive_time > elapsed)
1046 			elapsed = tp->keepalive_time - elapsed;
1047 		else
1048 			elapsed = 0;
1049 		inet_csk_reset_keepalive_timer(sk, elapsed);
1050 	}
1051 
1052 	/* set keep intvl (TCP_KEEPINTVL) */
1053 	val = keepalive_intvl;
1054 	if (val < 1 || val > MAX_NIP_TCP_KEEPINTVL) {
1055 		nip_dbg("keepalive_intvl(%u) invalid", val);
1056 		return -EINVAL;
1057 	}
1058 	tp->keepalive_intvl = val;
1059 
1060 	/* set keep cnt (TCP_KEEPCNT) */
1061 	val = keepalive_probes;
1062 	if (val < 1 || val > MAX_NIP_TCP_KEEPCNT) {
1063 		nip_dbg("keepalive_probes(%u) invalid", val);
1064 		return -EINVAL;
1065 	}
1066 	tp->keepalive_probes = val;
1067 
1068 	/* enable keepalive (SO_KEEPALIVE) */
1069 	if (sk->sk_prot->keepalive) {
1070 		sk->sk_prot->keepalive(sk, 1);
1071 		sock_valbool_flag(sk, SOCK_KEEPOPEN, 1);
1072 	} else {
1073 		nip_dbg("keepalive func is null");
1074 	}
1075 
1076 	return 0;
1077 }
1078 #endif
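/*
 * For reference only (editorial note, not part of the original driver): the
 * per-socket fields updated above are the kernel-side counterparts of the
 * standard userspace keepalive knobs. A minimal illustrative sketch, with
 * made-up values:
 *
 *	int on = 1, idle = 30, intvl = 5, cnt = 3;
 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
 *
 * The socket options take seconds, while the in-kernel fields are normally
 * kept in jiffies.
 */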
1079 
1080 #define NIP_PKT_TOTAL_LEN_BOUNDARY 100000  // 100K
1081 #define NIP_KEEPALIVE_PROBES 255
1082 void tcp_nip_keepalive_enable(struct sock *sk)
1083 {
1084 #if IS_ENABLED(CONFIG_NEWIP_FAST_KEEPALIVE)
1085 	int ret;
1086 	struct tcp_sock *tp = tcp_sk(sk);
1087 	struct tcp_nip_common *ntp = &tcp_nip_sk(sk)->common;
1088 	struct sk_buff *skb = tcp_nip_send_head(sk);
1089 
1090 	if (!skb)
1091 		return;
1092 
1093 	if (ntp->nip_keepalive_enable) {
1094 		/* If keepalive was set by setsockopt, back up and switch to the NewIP parameters */
1095 		if (tp->keepalive_time > HZ) {
1096 			ntp->keepalive_time_bak = tp->keepalive_time;
1097 			ntp->keepalive_probes_bak = tp->keepalive_probes;
1098 			ntp->keepalive_intvl_bak = tp->keepalive_intvl;
1099 
1100 			nip_dbg("HZ=%u, change time/probes/intvl [%u, %u, %u] to [%u, %u, %u]",
1101 				HZ, tp->keepalive_time, tp->keepalive_probes,
1102 				tp->keepalive_intvl, get_nip_keepalive_time(),
1103 				NIP_KEEPALIVE_PROBES, get_nip_keepalive_intvl());
1104 
1105 			tp->keepalive_time = get_nip_keepalive_time();
1106 			tp->keepalive_probes = NIP_KEEPALIVE_PROBES;
1107 			tp->keepalive_intvl = get_nip_keepalive_intvl();
1108 			inet_csk_reset_keepalive_timer(sk, tp->keepalive_time);
1109 		}
1110 		return;
1111 	}
1112 
1113 	/* If keepalive was set by setsockopt, back up the parameters */
1114 	if (sock_flag(sk, SOCK_KEEPOPEN)) {
1115 		ntp->keepalive_time_bak = tp->keepalive_time;
1116 		ntp->keepalive_probes_bak = tp->keepalive_probes;
1117 		ntp->keepalive_intvl_bak = tp->keepalive_intvl;
1118 		nip_dbg("HZ=%u, backup normal time/probes/intvl [%u, %u, %u]",
1119 			HZ, tp->keepalive_time, tp->keepalive_probes, tp->keepalive_intvl);
1120 	}
1121 
1122 	/* Switch the parameters to the NewIP values */
1123 	ret = tcp_nip_keepalive_para_update(sk, get_nip_keepalive_time(),
1124 					    get_nip_keepalive_intvl(),
1125 					    NIP_KEEPALIVE_PROBES);
1126 	if (ret != 0) {
1127 		nip_dbg("fail, HZ=%u, time/probes/intvl [%u, %u, %u]",
1128 			HZ, tp->keepalive_time, tp->keepalive_probes, tp->keepalive_intvl);
1129 		return;
1130 	}
1131 
1132 	nip_dbg("ok, HZ=%u, time/probes/intvl [%u, %u, %u]",
1133 		HZ, tp->keepalive_time, tp->keepalive_probes, tp->keepalive_intvl);
1134 	ntp->nip_keepalive_enable = true;
1135 #endif
1136 }
1137 
1138 void tcp_nip_keepalive_disable(struct sock *sk)
1139 {
1140 #if IS_ENABLED(CONFIG_NEWIP_FAST_KEEPALIVE)
1141 	struct tcp_sock *tp = tcp_sk(sk);
1142 	struct tcp_nip_common *ntp = &tcp_nip_sk(sk)->common;
1143 
1144 	if (!ntp->nip_keepalive_enable)
1145 		return;
1146 
1147 	if (!sock_flag(sk, SOCK_KEEPOPEN)) {
1148 		ntp->nip_keepalive_enable = false;
1149 		nip_dbg("ok, HZ=%u, normal ka has disable", HZ);
1150 		return;
1151 	}
1152 
1153 	if (ntp->idle_ka_probes_out < get_nip_idle_ka_probes_out())
1154 		return;
1155 
1156 	/* Switch from NewIP keepalive back to the normal keepalive parameters */
1157 	if (ntp->keepalive_time_bak) {
1158 		nip_dbg("HZ=%u, change normal time/probes/intvl [%u, %u, %u] to [%u, %u, %u]",
1159 			HZ, tp->keepalive_time, tp->keepalive_probes,
1160 			tp->keepalive_intvl, ntp->keepalive_time_bak, ntp->keepalive_probes_bak,
1161 			ntp->keepalive_intvl_bak);
1162 		tp->keepalive_time = ntp->keepalive_time_bak;
1163 		tp->keepalive_probes = ntp->keepalive_probes_bak;
1164 		tp->keepalive_intvl = ntp->keepalive_intvl_bak;
1165 		inet_csk_reset_keepalive_timer(sk, tp->keepalive_time);
1166 		return;
1167 	}
1168 
1169 	ntp->keepalive_time_bak = 0;
1170 	ntp->keepalive_probes_bak = 0;
1171 	ntp->keepalive_intvl_bak = 0;
1172 
1173 	/* enable keepalive (SO_KEEPALIVE) */
1174 	if (sk->sk_prot->keepalive)
1175 		sk->sk_prot->keepalive(sk, 0);
1176 	sock_valbool_flag(sk, SOCK_KEEPOPEN, 0);
1177 
1178 	nip_dbg("ok, HZ=%u, idle_ka_probes_out=%u", HZ, get_nip_idle_ka_probes_out());
1179 	ntp->nip_keepalive_enable = false;
1180 #endif
1181 }
1182 
1183 static void _tcp_sock_priv_init(struct sock *sk)
1184 {
1185 	struct tcp_sock *tp = tcp_sk(sk);
1186 	struct tcp_nip_common *ntp = &tcp_nip_sk(sk)->common;
1187 
1188 	memset(ntp, 0, sizeof(*ntp));
1189 	ntp->nip_ssthresh = get_nip_ssthresh_default();
1190 	tp->sacked_out = 0;
1191 	tp->rcv_tstamp = 0;
1192 	tp->selective_acks[0].start_seq = 0;
1193 	tp->selective_acks[0].end_seq = 0;
1194 	tp->keepalive_time = 0;
1195 	tp->keepalive_probes = 0;
1196 	tp->keepalive_intvl = 0;
1197 }
1198 
1199 static void tcp_sock_priv_init(struct sock *sk)
1200 {
1201 	_tcp_sock_priv_init(sk);
1202 }
1203 
1204 static void nip_icsk_ca_init(struct sock *sk)
1205 {
1206 }
1207 
1208 static void nip_icsk_ca_release(struct sock *sk)
1209 {
1210 }
1211 
1212 static u32 nip_icsk_ca_ssthresh(struct sock *sk)
1213 {
1214 	return 0;
1215 }
1216 
1217 static void nip_icsk_ca_cong_avoid(struct sock *sk, u32 ack, u32 acked)
1218 {
1219 }
1220 
1221 static void nip_icsk_ca_set_state(struct sock *sk, u8 new_state)
1222 {
1223 }
1224 
1225 static void nip_icsk_ca_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
1226 {
1227 }
1228 
1229 static void nip_icsk_ca_in_ack_event(struct sock *sk, u32 flags)
1230 {
1231 }
1232 
1233 static u32 nip_icsk_ca_undo_cwnd(struct sock *sk)
1234 {
1235 	return 0;
1236 }
1237 
1238 static void nip_icsk_ca_pkts_acked(struct sock *sk, const struct ack_sample *sample)
1239 {
1240 }
1241 
1242 static u32 nip_icsk_ca_min_tso_segs(struct sock *sk)
1243 {
1244 	return 0;
1245 }
1246 
1247 static u32 nip_icsk_ca_sndbuf_expand(struct sock *sk)
1248 {
1249 	return 0;
1250 }
1251 
1252 static void nip_icsk_ca_cong_control(struct sock *sk, const struct rate_sample *rs)
1253 {
1254 }
1255 
1256 static size_t nip_icsk_ca_get_info(struct sock *sk, u32 ext, int *attr,
1257 				   union tcp_cc_info *info)
1258 {
1259 	return 0;
1260 }
1261 
1262 static int nip_icsk_ulp_init(struct sock *sk)
1263 {
1264 	return -EINVAL;
1265 }
1266 
1267 static void nip_icsk_ulp_update(struct sock *sk, struct proto *p,
1268 				void (*write_space)(struct sock *sk))
1269 {
1270 }
1271 
1272 static void nip_icsk_ulp_release(struct sock *sk)
1273 {
1274 }
1275 
1276 static int nip_icsk_ulp_get_info(const struct sock *sk, struct sk_buff *skb)
1277 {
1278 	return -EINVAL;
1279 }
1280 
1281 static size_t nip_icsk_ulp_get_info_size(const struct sock *sk)
1282 {
1283 	return 0;
1284 }
1285 
1286 static void nip_icsk_ulp_clone(const struct request_sock *req, struct sock *newsk,
1287 			       const gfp_t priority)
1288 {
1289 }
1290 
1291 static struct module nip_owner;
1292 
1293 struct tcp_ulp_ops nip_icsk_ulp_ops = {
1294 	.init			= nip_icsk_ulp_init,
1295 	.update			= nip_icsk_ulp_update,
1296 	.release		= nip_icsk_ulp_release,
1297 	.get_info		= nip_icsk_ulp_get_info,
1298 	.get_info_size		= nip_icsk_ulp_get_info_size,
1299 	.clone			= nip_icsk_ulp_clone,
1300 	.owner			= &nip_owner,
1301 };
1302 
1303 struct tcp_congestion_ops nip_icsk_ca_ops = {
1304 	.init			= nip_icsk_ca_init,
1305 	.release		= nip_icsk_ca_release,
1306 	.ssthresh		= nip_icsk_ca_ssthresh,
1307 	.cong_avoid		= nip_icsk_ca_cong_avoid,
1308 	.set_state		= nip_icsk_ca_set_state,
1309 	.cwnd_event		= nip_icsk_ca_cwnd_event,
1310 	.in_ack_event		= nip_icsk_ca_in_ack_event,
1311 	.undo_cwnd		= nip_icsk_ca_undo_cwnd,
1312 	.pkts_acked		= nip_icsk_ca_pkts_acked,
1313 	.min_tso_segs		= nip_icsk_ca_min_tso_segs,
1314 	.sndbuf_expand		= nip_icsk_ca_sndbuf_expand,
1315 	.cong_control		= nip_icsk_ca_cong_control,
1316 	.get_info		= nip_icsk_ca_get_info,
1317 };
1318 
1319 static void nip_icsk_clean_acked(struct sock *sk, u32 acked_seq)
1320 {
1321 }
1322 
1323 static void inet_connection_sock_pre_init(struct inet_connection_sock *icsk)
1324 {
1325 	icsk->icsk_ca_ops = &nip_icsk_ca_ops;
1326 	icsk->icsk_ulp_ops = &nip_icsk_ulp_ops;
1327 	icsk->icsk_clean_acked = nip_icsk_clean_acked;
1328 }
1329 
1330 #ifdef CONFIG_TCP_MD5SIG
1331 struct tcp_md5sig_key *nip_md5_lookup(const struct sock *sk,
1332 				      const struct sock *addr_sk)
1333 {
1334 	return NULL;
1335 }
1336 
1337 int nip_md5_parse(struct sock *sk, int optname, sockptr_t optval,
1338 		  int optlen)
1339 {
1340 	return -EINVAL;
1341 }
1342 
1343 const struct tcp_sock_af_ops nip_af_specific = {
1344 	.md5_lookup = nip_md5_lookup,
1345 	.calc_md5_hash = nip_calc_md5_hash,
1346 	.md5_parse = nip_md5_parse,
1347 };
1348 
1349 struct tcp_md5sig_info	__rcu nip_md5sig_info;
1350 #endif
1351 
1352 static void tcp_sock_pre_init(struct tcp_sock *tp)
1353 {
1354 #ifdef CONFIG_TCP_MD5SIG
1355 	tp->af_specific = &nip_af_specific;
1356 	tp->md5sig_info = &nip_md5sig_info;
1357 #endif
1358 }
1359 
1360 /* Function
1361  *    Initialize the sock information in TCP
1362  * Parameter
1363  *    sk: Sock to be initialized
1364  * Note: Currently, this function does not initialize the timer, prequeue, or congestion
1365  * control, does not enable fast retransmission, and sets no function to adjust the MSS.
1366  */
1367 static int tcp_nip_init_sock(struct sock *sk)
1368 {
1369 	struct inet_connection_sock *icsk = inet_csk(sk);
1370 	struct tcp_sock *tp = tcp_sk(sk);
1371 
1372 	tcp_sock_priv_init(sk);
1373 
1374 	tp->out_of_order_queue = RB_ROOT;
1375 	tcp_nip_init_xmit_timers(sk);
1376 	INIT_LIST_HEAD(&tp->tsq_node);
1377 
1378 	inet_connection_sock_pre_init(icsk);
1379 	tcp_sock_pre_init(tp);
1380 	icsk->icsk_rto = get_nip_rto() == 0 ? TCP_TIMEOUT_INIT : (HZ / get_nip_rto());
1381 	icsk->icsk_rto_min = TCP_RTO_MIN;
1382 	icsk->icsk_delack_max = TCP_DELACK_MAX;
1383 	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
1384 	minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
1385 
1386 	tp->snd_cwnd = TCP_INIT_CWND;
1387 	tp->app_limited = ~0U;
1388 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1389 	tp->snd_cwnd_clamp = ~0;
1390 	tp->mss_cache = TCP_MSS_DEFAULT;
1391 
1392 	tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering);
1393 	tp->tsoffset = 0;
1394 	sk->sk_state = TCP_CLOSE;
1395 	sk->sk_write_space = sk_stream_write_space;
1396 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1397 
1398 	icsk->icsk_sync_mss = tcp_nip_sync_mss;
1399 
1400 	WRITE_ONCE(sk->sk_sndbuf, get_nip_sndbuf()); // sock_net(sk)->ipv4.sysctl_tcp_wmem[1]
1401 	WRITE_ONCE(sk->sk_rcvbuf, get_nip_rcvbuf()); // sock_net(sk)->ipv4.sysctl_tcp_rmem[1]
1402 
1403 	local_bh_disable();
1404 	sk_sockets_allocated_inc(sk);
1405 	local_bh_enable();
1406 
1407 	icsk->icsk_af_ops = &newip_specific;
1408 
1409 	return 0;
1410 }
1411 
1412 static void skb_nip_entail(struct sock *sk, struct sk_buff *skb)
1413 {
1414 	struct tcp_sock *tp = tcp_sk(sk);
1415 	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
1416 
1417 	skb->csum    = 0;
1418 	tcb->seq     = tp->write_seq;
1419 	tcb->end_seq = tp->write_seq;
1420 	tcb->tcp_flags = TCPHDR_ACK;
1421 	tcb->sacked  = 0;
1422 
1423 	tcp_nip_add_write_queue_tail(sk, skb);
1424 
1425 	sk->sk_wmem_queued += skb->truesize;
1426 	sk_mem_charge(sk, skb->truesize);
1427 }
1428 
1429 static unsigned int tcp_nip_xmit_size_goal(struct sock *sk, u32 mss_now,
1430 					   int large_allowed)
1431 {
1432 	struct tcp_sock *tp = tcp_sk(sk);
1433 	u32 new_size_goal = NIP_MIN_MTU;
1434 	u32 size_goal;
1435 
1436 	if (!large_allowed || !mss_now)
1437 		return mss_now;
1438 
1439 	/* Note : tcp_tso_autosize() will eventually split this later */
1440 	if (sk->sk_gso_max_size > MAX_TCP_HEADER + 1)
1441 		new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
1442 	new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
1443 
1444 	/* We try hard to avoid divides here */
1445 	size_goal = tp->gso_segs * mss_now;
1446 	if (unlikely(new_size_goal < size_goal ||
1447 		     new_size_goal >= size_goal + mss_now)) {
1448 		tp->gso_segs = min_t(u16, new_size_goal / mss_now,
1449 				     sk->sk_gso_max_segs);
1450 		size_goal = tp->gso_segs * mss_now;
1451 	}
1452 
1453 	return max(size_goal, mss_now);
1454 }
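/*
 * Illustrative example (editorial note): with mss_now = M, the goal computed above
 * is a whole number of segments, size_goal = gso_segs * M, where gso_segs is
 * roughly min(new_size_goal / M, sk_gso_max_segs); the final max() guarantees the
 * caller can always send at least one MSS-sized segment.
 */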
1455 
1456 int tcp_nip_send_mss(struct sock *sk, int *size_goal, int flags)
1457 {
1458 	int mss_now;
1459 
1460 	mss_now = tcp_nip_current_mss(sk);
1461 	*size_goal = tcp_nip_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
1462 	return mss_now;
1463 }
1464 
1465 int tcp_nip_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
1466 {
1467 	struct tcp_sock *tp = tcp_sk(sk);
1468 	struct sk_buff *skb;
1469 	int flags;
1470 	int err;
1471 	int copied = 0;
1472 	int mss_now = 0;
1473 	int size_goal;
1474 	bool process_backlog = false;
1475 	long timeo;
1476 
1477 	lock_sock(sk);
1478 
1479 	flags = msg->msg_flags;
1480 
1481 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1482 
1483 	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1484 	    !tcp_passive_fastopen(sk)) {
1485 		err = sk_stream_wait_connect(sk, &timeo);
1486 		if (err != 0)
1487 			goto do_error;
1488 	}
1489 
1490 	/* This should be in poll */
1491 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1492 
1493 	copied = 0;
1494 
1495 restart:
1496 	mss_now = tcp_nip_send_mss(sk, &size_goal, flags);
1497 
1498 	nip_dbg("mss_now=%d", mss_now);
1499 
1500 	err = -EPIPE;
1501 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1502 		goto do_error;
1503 
1504 	while (msg_data_left(msg)) {
1505 		int copy = 0;
1506 		int max = mss_now;
1507 
1508 		bool first_skb;
1509 
1510 		if (!sk_stream_memory_free(sk))
1511 			goto wait_for_sndbuf;
1512 
1513 		if (process_backlog && sk_flush_backlog(sk)) {
1514 			process_backlog = false;
1515 			goto restart;
1516 		}
1517 		first_skb = skb_queue_empty(&sk->sk_write_queue);
1518 		skb = sk_stream_alloc_skb(sk, mss_now, sk->sk_allocation, first_skb);
1519 		if (!skb)
1520 			goto wait_for_memory;
1521 
1522 		skb->tstamp = 0;
1523 		process_backlog = true;
1524 
1525 		skb_nip_entail(sk, skb);
1526 		copy = mss_now;
1527 		max = mss_now;
1528 
1529 		/* Try to append data to the end of skb. */
1530 		if (copy > msg_data_left(msg))
1531 			copy = msg_data_left(msg);
1532 
1533 		if (skb_availroom(skb) > 0) {
1534 			/* We have some space in skb head. Superb! */
1535 			copy = min_t(int, copy, skb_availroom(skb));
1536 			err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
1537 			if (err)
1538 				goto do_fault;
1539 		} else {
1540 			nip_dbg("msg too big, tcp cannot divide packet now");
1541 			goto out;
1542 		}
1543 
1544 		if (!copied)
1545 			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1546 
1547 		tp->write_seq += copy;
1548 		TCP_SKB_CB(skb)->end_seq += copy;
1549 		tcp_skb_pcount_set(skb, 0);
1550 		copied += copy;
1551 		if (!msg_data_left(msg)) {
1552 			if (unlikely(flags & MSG_EOR))
1553 				TCP_SKB_CB(skb)->eor = 1;
1554 			goto out;
1555 		}
1556 
1557 		continue;
1558 
1559 wait_for_sndbuf:
1560 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1561 wait_for_memory:
1562 		if (copied)
1563 			tcp_nip_push(sk, flags & ~MSG_MORE, mss_now,
1564 				     TCP_NAGLE_PUSH, size_goal);
1565 
1566 		err = sk_stream_wait_memory(sk, &timeo);
1567 		if (err != 0)
1568 			goto do_error;
1569 
1570 		mss_now = tcp_nip_send_mss(sk, &size_goal, flags);
1571 	}
1572 
1573 out:
1574 	if (copied)
1575 		tcp_nip_push(sk, flags, mss_now, tp->nonagle, size_goal);
1576 	release_sock(sk);
1577 	return copied;
1578 
1579 do_fault:
1580 	if (!skb->len) {
1581 		tcp_nip_modify_send_head(sk, skb);
1582 		tcp_unlink_write_queue(skb, sk);
1583 		sk_wmem_free_skb(sk, skb);
1584 	}
1585 
1586 do_error:
1587 	if (copied)
1588 		goto out;
1589 
1590 	err = sk_stream_error(sk, flags, err);
1591 	/* make sure we wake any epoll edge trigger waiter */
1592 	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
1593 		sk->sk_write_space(sk);
1594 	release_sock(sk);
1595 	return err;
1596 }
1597 
1598 /* Clean up the receive buffer for full frames taken by the user,
1599  * then send an ACK if necessary.  COPIED is the number of bytes
1600  * tcp_recvmsg has given to the user so far, it speeds up the
1601  * calculation of whether or not we must ACK for the sake of
1602  * a window update.
1603  */
1604 void tcp_nip_cleanup_rbuf(struct sock *sk, int copied)
1605 {
1606 	struct tcp_sock *tp = tcp_sk(sk);
1607 	bool time_to_ack = false;
1608 
1609 	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1610 
1611 	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1612 	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X",
1613 	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1614 
1615 	if (inet_csk_ack_scheduled(sk)) {
1616 		const struct inet_connection_sock *icsk = inet_csk(sk);
1617 
1618 		if (tp->rcv_nxt - tp->rcv_wup > (get_ack_num() *
1619 			TCP_ACK_NUM_MULTIPLIER * icsk->icsk_ack.rcv_mss) ||
1620 		    /* If this read emptied read buffer, we send ACK, if
1621 		     * connection is not bidirectional, user drained
1622 		     * receive buffer and there was a small segment
1623 		     * in queue.
1624 		     */
1625 		    (copied > 0 &&
1626 		     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1627 		      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1628 		       !inet_csk_in_pingpong_mode(sk))) &&
1629 		      !atomic_read(&sk->sk_rmem_alloc)))
1630 			time_to_ack = true;
1631 	}
1632 
1633 	/* We send an ACK if we can now advertise a non-zero window
1634 	 * which has been raised "significantly".
1635 	 *
1636 	 * Even if window raised up to infinity, do not send window open ACK
1637 	 * in states, where we will not receive more. It is useless.
1638 	 */
1639 	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1640 		__u32 rcv_window_now = tcp_receive_window(tp);
1641 
1642 		/* Optimize, __nip_tcp_select_window() is not cheap. */
1643 		if (TCP_WINDOW_RAISE_THRESHOLD * rcv_window_now <= tp->window_clamp) {
1644 			__u32 new_window = __nip_tcp_select_window(sk);
1645 
1646 			/* Send ACK now, if this read freed lots of space
1647 			 * in our buffer. Certainly, new_window is new window.
1648 			 * We can advertise it now, if it is not less than current one.
1649 			 * "Lots" means "at least twice" here.
1650 			 */
1651 			if (new_window && new_window >= TCP_WINDOW_RAISE_THRESHOLD * rcv_window_now)
1652 				time_to_ack = true;
1653 		}
1654 	}
1655 	if (time_to_ack)
1656 		tcp_nip_send_ack(sk);
1657 }
1658 
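/* NewIP TCP recvmsg: walk sk_receive_queue and copy data into @msg,
 * blocking until the rcvlowat target is met unless the socket is
 * non-blocking; consumed data is acknowledged via tcp_nip_cleanup_rbuf().
 */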
1659 int tcp_nip_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
1660 		    int flags, int *addr_len)
1661 {
1662 	struct tcp_sock *tp = tcp_sk(sk);
1663 	int copied = 0;
1664 	u32 *seq;
1665 	unsigned long used;
1666 	int err = 0;
1667 	int target;
1668 	long timeo;
1669 	size_t len_tmp = len;
1670 	struct sk_buff *skb, *last;
1671 
1672 	lock_sock(sk);
1673 
1674 	if (sk->sk_state == TCP_LISTEN)
1675 		goto out;
1676 
1677 	timeo = sock_rcvtimeo(sk, nonblock);
1678 
1679 	seq = &tp->copied_seq;
1680 
1681 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len_tmp);
1682 
1683 	do {
1684 		u32 offset;
1685 		/* Next get a buffer. */
1686 		last = skb_peek_tail(&sk->sk_receive_queue);
1687 		skb_queue_walk(&sk->sk_receive_queue, skb) {
1688 			last = skb;
1689 			/* Now that we have two receive queues this
1690 			 * shouldn't happen.
1691 			 */
1692 			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
1693 				 "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X",
1694 				 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
1695 				 flags))
1696 				break;
1697 			offset = *seq - TCP_SKB_CB(skb)->seq;
1698 			if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
1699 				pr_err_once("found a SYN, please report");
1700 				offset--;
1701 			}
1702 			if (offset < skb->len)
1703 				goto found_ok_skb;
1704 			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1705 				goto found_fin_ok;
1706 			/* Reaching an skb that has already been fully consumed (and
1707 			 * carries no FIN) should only happen when MSG_PEEK is set in flags
1708 			 */
1709 			WARN(!(flags & MSG_PEEK),
1710 			     "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X",
1711 			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
1712 		}
1713 
1714 		/* At this point the whole sk_receive_queue has been walked */
1715 		/* Stop reading once the target is met and the backlog holds no more data */
1716 		if (copied >= target && !sk->sk_backlog.tail)
1717 			break;
1718 
1719 		if (copied) {
1720 			if (sk->sk_err ||
1721 			    sk->sk_state == TCP_CLOSE ||
1722 			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1723 			    !timeo ||
1724 			    signal_pending(current))
1725 				break;
1726 		} else {
1727 			if (sock_flag(sk, SOCK_DONE))
1728 				break;
1729 
1730 			if (sk->sk_err) {
1731 				copied = sock_error(sk);
1732 				break;
1733 			}
1734 
1735 			if (sk->sk_shutdown & RCV_SHUTDOWN)
1736 				break;
1737 
1738 			if (sk->sk_state == TCP_CLOSE) {
1739 				if (!sock_flag(sk, SOCK_DONE)) {
1740 					/* This occurs when user tries to read
1741 					 * from never connected socket.
1742 					 */
1743 					copied = -ENOTCONN;
1744 					break;
1745 				}
1746 				break;
1747 			}
1748 
1749 			if (!timeo) {
1750 				copied = -EAGAIN;
1751 				break;
1752 			}
1753 
1754 			if (signal_pending(current)) {
1755 				copied = sock_intr_errno(timeo);
1756 				break;
1757 			}
1758 		}
1759 
1760 		tcp_nip_cleanup_rbuf(sk, copied);
1761 
1762 		if (copied >= target) {
1763 			/* Do not sleep, just process backlog. */
1764 			release_sock(sk);
1765 			lock_sock(sk);
1766 		} else {
1767 			nip_dbg("not enough data in receive queue, wait");
1768 			sk_wait_data(sk, &timeo, last);
1769 		}
1770 		continue;
1771 found_ok_skb:
1772 		used = skb->len - offset;
1773 		if (len_tmp < used)
1774 			used = len_tmp;
1775 		nip_dbg("copy data into msg, len=%lu", used);
1776 		if (!(flags & MSG_TRUNC)) {
1777 			err = skb_copy_datagram_msg(skb, offset, msg, used);
1778 			if (err) {
1779 				nip_dbg("copy data failed");
1780 				if (!copied)
1781 					copied = -EFAULT;
1782 				break;
1783 			}
1784 		}
1785 		*seq += used;
1786 		len_tmp -= used;
1787 		copied += used;
1788 
1789 		if (used + offset < skb->len)
1790 			continue;
1791 
1792 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1793 			goto found_fin_ok;
1794 		if (!(flags & MSG_PEEK))
1795 			sk_eat_skb(sk, skb);
1796 		continue;
1797 
1798 found_fin_ok:
1799 		/* Process the FIN. */
1800 		++*seq;
1801 		if (!(flags & MSG_PEEK))
1802 			sk_eat_skb(sk, skb);
1803 		break;
1804 	} while (len_tmp > 0);
1805 
1806 	/* Clean up data we have read: This will do ACK frames. */
1807 	tcp_nip_cleanup_rbuf(sk, copied);
1808 
1809 	release_sock(sk);
1810 	return copied;
1811 
1812 out:
1813 	release_sock(sk);
1814 	return err;
1815 }
1816 
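/* Purge all skbs queued on the out-of-order receive rbtree. */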
1817 static void skb_nip_rbtree_purge(struct sock *sk)
1818 {
1819 	struct tcp_sock *tp = tcp_sk(sk);
1820 
1821 	skb_rbtree_purge(&tp->out_of_order_queue);
1822 }
1823 
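/* Per-socket teardown: stop the transmit timers, purge the write and
 * out-of-order queues, and release the bound port and any saved SYN.
 */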
1824 void tcp_nip_destroy_sock(struct sock *sk)
1825 {
1826 	struct tcp_sock *tp = tcp_sk(sk);
1827 
1828 	tcp_nip_clear_xmit_timers(sk);
1829 
1830 	tcp_nip_write_queue_purge(sk);
1831 
1832 	skb_nip_rbtree_purge(sk);
1833 
1834 	if (inet_csk(sk)->icsk_bind_hash)
1835 		inet_put_port(sk);
1836 
1837 	tcp_saved_syn_free(tp);
1838 	local_bh_disable();
1839 	sk_sockets_allocated_dec(sk);
1840 	local_bh_enable();
1841 }
1842 
1843 /* Function
1844  *    The sock handler for the LISTEN and ESTABLISHED states, called by tcp_nip_rcv
1845  * Parameter
1846  *    skb: Packet received from the network layer
1847  *    sk: The sock instance to be processed
1848  */
1849 static int tcp_nip_do_rcv(struct sock *sk, struct sk_buff *skb)
1850 {
1851 	nip_dbg("received newip tcp skb, sk_state=%d", sk->sk_state);
1852 
1853 	if (sk->sk_state == TCP_ESTABLISHED) {
1854 		struct dst_entry *dst;
1855 
1856 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1857 						lockdep_sock_is_held(sk));
1858 		if (dst) {
1859 			/* Triggered when processing newly received skb after deleting routes */
1860 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1861 			    !dst->ops->check(dst, 0)) {
1862 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1863 				dst_release(dst);
1864 			}
1865 		}
1866 		tcp_nip_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1867 		return 0;
1868 	}
1869 
1870 	/* The connection is established in cookie mode to defend against SYN-flood attacks */
1871 	if (sk->sk_state == TCP_LISTEN)
1872 		nip_dbg("found TCP_LISTEN SOCK");
1873 
1874 	if (tcp_nip_rcv_state_process(sk, skb))
1875 		goto discard;
1876 	return 0;
1877 
1878 discard:
1879 	kfree_skb(skb);
1880 	return 0;
1881 }
1882 
1883 /* Function:
1884  *    Fill the TCP header fields from the skb into the TCP private control block.
1885  *    The header fields in the skb are in network byte order, so they are
1886  *    converted to host byte order here and stored in the control block
1887  *    for convenient later use.
1888  * Parameter:
1889  *    skb: Packet delivered by the network layer
1890  *    th: TCP header of the packet
1891  */
1892 static void tcp_nip_fill_cb(struct sk_buff *skb, const struct tcphdr *th)
1893 {
1894 	barrier();
1895 
1896 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1897 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1898 				    skb->len - th->doff * TCP_NUM_4);
1899 
1900 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1901 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1902 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1903 	TCP_SKB_CB(skb)->sacked = 0;
1904 }
1905 
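/* Queue an skb on the socket backlog while the socket is owned by user
 * context. Returns true (after unlocking the socket) when the backlog
 * limit is exceeded, in which case the caller drops the skb.
 */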
1906 static bool tcp_nip_add_backlog(struct sock *sk, struct sk_buff *skb)
1907 {
1908 	u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1909 
1910 	/* Only socket owner can try to collapse/prune rx queues
1911 	 * to reduce memory overhead, so add a little headroom here.
1912 	 * Few sockets backlog are possibly concurrently non empty.
1913 	 */
1914 	limit += TCP_BACKLOG_HEADROOM;
1915 
1916 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1917 	 * we can fix skb->truesize to its real value to avoid future drops.
1918 	 * This is valid because skb is not yet charged to the socket.
1919 	 * It has been noticed pure SACK packets were sometimes dropped
1920 	 * (if cooked by drivers without copybreak feature).
1921 	 */
1922 	skb_condense(skb);
1923 
1924 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1925 		bh_unlock_sock(sk);
1926 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1927 		nip_dbg("insert backlog fail");
1928 		return true;
1929 	}
1930 	return false;
1931 }
1932 
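/* Basic sanity checks on an incoming skb: a pullable TCP header, a packet
 * addressed to this host, and a valid TCP checksum.
 * Returns 0 if the skb may be processed further, -EINVAL otherwise.
 */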
1933 static int nip_skb_precheck(struct sk_buff *skb)
1934 {
1935 	if (!pskb_may_pull(skb, sizeof(struct tcphdr))) {
1936 		nip_dbg("invalid tcp packet length, drop the packet(skb->len=%u)", skb->len);
1937 		return -EINVAL;
1938 	}
1939 
1940 	if (skb->pkt_type != PACKET_HOST) {
1941 		nip_dbg("unknown pkt-type(%u), drop skb", skb->pkt_type);
1942 		return -EINVAL;
1943 	}
1944 
1945 	if (!nip_get_tcp_input_checksum(skb)) {
1946 		nip_dbg("checksum fail, drop skb");
1947 		return -EINVAL;
1948 	}
1949 
1950 	return 0;
1951 }
1952 
1953 /* Function
1954  *    TCP is the gateway from the network layer to the transport layer
1955  *    and receives data packets from the network layer
1956  * Parameter
1957  *    skb:Packets delivered by the network layer
1958  */
1959 static int tcp_nip_rcv(struct sk_buff *skb)
1960 {
1961 	const struct tcphdr *th;
1962 	bool refcounted;
1963 	struct sock *sk = NULL;
1964 	int ret;
1965 	int dif = skb->skb_iif;
1966 
1967 	if (nip_skb_precheck(skb))
1968 		goto discard_it;
1969 
1970 	th = (const struct tcphdr *)skb->data;
1971 
1972 	if (unlikely(th->doff < sizeof(struct tcphdr) / TCP_NUM_4)) {
1973 		nip_dbg("invalid tcp header length (doff too small), drop skb");
1974 		goto discard_it;
1975 	}
1976 	if (!pskb_may_pull(skb, th->doff * 4))
1977 		goto discard_it;
1978 
1979 	sk = __ninet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th),
1980 				th->source, th->dest, dif, &refcounted);
1981 	if (!sk) {
1982 		nip_dbg("can't find related sock for skb, will send reset");
1983 		goto no_tcp_socket;
1984 	}
1985 
1986 	if (sk->sk_state == TCP_TIME_WAIT) {
1987 		/* TIME_WAIT sockets are not handled separately; the skb is simply dropped */
1988 		nip_dbg("sk_state is TCP_TIME_WAIT, drop skb");
1989 		goto discard_it;
1990 	}
1991 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1992 		struct request_sock *req = inet_reqsk(sk);
1993 		struct sock *nsk;
1994 
1995 		nip_dbg("TCP server entering the third handshake, sk->sk_state:%d", sk->sk_state);
1996 		sk = req->rsk_listener;
1997 
1998 		sock_hold(sk);
1999 		refcounted = true;
2000 		nsk = NULL;
2001 		/* A new sock must be created and moved to TCP_SYN_RECV,
2002 		 * which is then set to ESTABLISHED
2003 		 */
2004 		if (!tcp_filter(sk, skb)) {
2005 			th = (const struct tcphdr *)skb->data;
2006 			tcp_nip_fill_cb(skb, th);
2007 			nsk = tcp_nip_check_req(sk, skb, req);
2008 		}
2009 		if (!nsk || nsk == sk) {
2010 			nip_dbg("invalid skb or failed to create newsk, drop skb");
2011 			reqsk_put(req);
2012 			goto discard_and_relse;
2013 		}
2014 		if (tcp_nip_child_process(sk, nsk, skb)) {
2015 			nip_dbg("child process fail, drop skb");
2016 			goto discard_and_relse;
2017 		} else {
2018 			sock_put(sk);
2019 			return 0;
2020 		}
2021 	}
2022 
2023 	tcp_nip_fill_cb(skb, th);
2024 
2025 	if (tcp_filter(sk, skb)) {
2026 		nip_dbg("tcp filter fail, drop skb");
2027 		goto discard_and_relse;
2028 	}
2029 	th = (const struct tcphdr *)skb->data;
2030 	skb->dev = NULL;
2031 
2032 	if (sk->sk_state == TCP_LISTEN) {
2033 		nip_dbg("TCP server entering the first handshake, sk->sk_state:%d", sk->sk_state);
2034 		ret = tcp_nip_do_rcv(sk, skb);
2035 		goto put_and_return;
2036 	}
2037 	bh_lock_sock_nested(sk);
2038 
2039 	ret = 0;
2040 	if (!sock_owned_by_user(sk)) {
2041 		ret = tcp_nip_do_rcv(sk, skb);
2042 	} else {
2043 		nip_dbg("sock locked by user, put packet into backlog");
2044 		if (tcp_nip_add_backlog(sk, skb)) {
2045 			nip_dbg("add backlog fail, drop skb");
2046 			goto discard_and_relse;
2047 		}
2048 	}
2049 
2050 	bh_unlock_sock(sk);
2051 
2052 put_and_return:
2053 	if (refcounted)
2054 		sock_put(sk);
2055 	return ret ? -1 : 0;
2056 
2057 no_tcp_socket:
2058 	tcp_nip_send_reset(NULL, skb);
2059 
2060 discard_it:
2061 	kfree_skb(skb);
2062 	nip_sock_debug(sk, __func__);
2063 	return 0;
2064 
2065 discard_and_relse:
2066 	sk_drops_add(sk, skb);
2067 	nip_sock_debug(sk, __func__);
2068 	if (refcounted)
2069 		sock_put(sk);
2070 	goto discard_it;
2071 }
2072 
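/* Early demux: look up an established NewIP socket for the incoming skb
 * so that the cached receive dst can be attached before the main
 * receive path runs.
 */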
2073 static void tcp_nip_early_demux(struct sk_buff *skb)
2074 {
2075 	const struct tcphdr *th;
2076 	struct sock *sk;
2077 
2078 	if (skb->pkt_type != PACKET_HOST)
2079 		return;
2080 
2081 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
2082 		return;
2083 
2084 	th = tcp_hdr(skb);
2085 	if (th->doff < sizeof(struct tcphdr) / BYTES_PER_TCP_HEADER)
2086 		return;
2087 
2088 	sk = __ninet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
2089 					&nipcb(skb)->srcaddr, th->source,
2090 					&nipcb(skb)->dstaddr, ntohs(th->dest), skb->skb_iif);
2091 	if (sk) {
2092 		skb->sk = sk;
2093 		skb->destructor = sock_edemux;
2094 		if (sk_fullsock(sk)) {
2095 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
2096 
2097 			if (dst)
2098 				dst = dst_check(dst, 0);
2099 			if (dst && inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) {
2100 				nip_dbg("find sock in ehash, set dst for skb");
2101 				skb_dst_set_noref(skb, dst);
2102 			}
2103 		}
2104 	}
2105 }
2106 
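/* Move the socket to TCP_CLOSE, stop its timers and, once the socket is
 * orphaned (SOCK_DEAD), destroy it and drop the final reference.
 */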
2107 void tcp_nip_done(struct sock *sk)
2108 {
2109 	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
2110 
2111 	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
2112 		TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
2113 
2114 	tcp_set_state(sk, TCP_CLOSE);
2115 	inet_csk_clear_xmit_timers(sk);
2116 	if (req)
2117 		reqsk_fastopen_remove(sk, req, false);
2118 
2119 	sk->sk_shutdown = SHUTDOWN_MASK;
2120 
2121 	if (!sock_flag(sk, SOCK_DEAD)) {
2122 		sk->sk_state_change(sk);
2123 	} else {
2124 		WARN_ON(sk->sk_state != TCP_CLOSE);
2125 		WARN_ON(!sock_flag(sk, SOCK_DEAD));
2126 
2127 		/* It cannot be in hash table! */
2128 		WARN_ON(!sk_unhashed(sk));
2129 
2130 		/* If inet_sk(sk)->inet_num is non-zero, the socket must be bound */
2131 		WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash);
2132 		sk->sk_prot->destroy(sk);
2133 
2134 		sk_nip_stream_kill_queues(sk);
2135 
2136 		local_bh_disable();
2137 		this_cpu_dec(*sk->sk_prot->orphan_count);
2138 		local_bh_enable();
2139 		sock_put(sk);
2140 		nip_dbg("close sock done");
2141 	}
2142 }
2143 
2144 /* Function
2145  *    Disconnect from the peer, non-blocking
2146  *    Release the read/write queues, send an RST when required, clear the timers
2147  * Parameter
2148  *    sk: Transmission control block
2149  */
2150 int tcp_nip_disconnect(struct sock *sk, int flags)
2151 {
2152 	struct inet_sock *inet = inet_sk(sk);
2153 	struct inet_connection_sock *icsk = inet_csk(sk);
2154 	struct tcp_sock *tp = tcp_sk(sk);
2155 	int err = 0;
2156 	int old_state = sk->sk_state;
2157 	u32 sk_ack_backlog;
2158 
2159 	nip_dbg("old_state=%u", old_state);
2160 	if (old_state != TCP_CLOSE)
2161 		tcp_set_state(sk, TCP_CLOSE);
2162 
2163 	if (old_state == TCP_LISTEN) {
2164 		sk_ack_backlog = READ_ONCE(sk->sk_ack_backlog);
2165 		inet_csk_listen_stop(sk);
2166 		nip_dbg("sk_state CLOSE, sk_ack_backlog=%u to %u, sk_max_ack_backlog=%u",
2167 			sk_ack_backlog, READ_ONCE(sk->sk_ack_backlog),
2168 			READ_ONCE(sk->sk_max_ack_backlog));
2169 	} else if (tcp_nip_need_reset(old_state) || (tp->snd_nxt != tp->write_seq &&
2170 		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2171 		tcp_nip_send_active_reset(sk, gfp_any());
2172 		sk->sk_err = ECONNRESET;
2173 	} else if (old_state == TCP_SYN_SENT) {
2174 		sk->sk_err = ECONNRESET;
2175 	}
2176 
2177 	tcp_nip_clear_xmit_timers(sk);
2178 	__skb_queue_purge(&sk->sk_receive_queue);
2179 	tcp_write_queue_purge(sk);
2180 
2181 	_tcp_sock_priv_init(sk);
2182 
2183 	inet->inet_dport = 0;
2184 	sk->sk_shutdown = 0;
2185 	sock_reset_flag(sk, SOCK_DONE);
2186 	tp->srtt_us = 0;
2187 	tp->write_seq += tp->max_window + TCP_NUM_2;
2188 	if (tp->write_seq == 0)
2189 		tp->write_seq = 1;
2190 	tp->snd_cwnd = TCP_NUM_2;
2191 	icsk->icsk_backoff = 0;
2192 	icsk->icsk_probes_out = 0;
2193 	icsk->icsk_probes_tstamp = 0;
2194 	icsk->icsk_rto = get_nip_rto() == 0 ? TCP_TIMEOUT_INIT : (HZ / get_nip_rto());
2195 	icsk->icsk_rto_min = TCP_RTO_MIN;
2196 	icsk->icsk_delack_max = TCP_DELACK_MAX;
2197 	tp->packets_out = 0;
2198 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
2199 	tp->snd_cwnd_cnt = 0;
2200 	tp->window_clamp = 0;
2201 	tp->delivered = 0;
2202 	tcp_clear_retrans(tp);
2203 	tp->total_retrans = 0;
2204 	inet_csk_delack_init(sk);
2205 
2206 	icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
2207 	sk->sk_send_head = NULL;
2208 	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2209 	__sk_dst_reset(sk);
2210 	dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL));
2211 	tp->segs_in = 0;
2212 	tp->segs_out = 0;
2213 	tp->bytes_acked = 0;
2214 	tp->bytes_received = 0;
2215 	tp->data_segs_in = 0;
2216 	tp->data_segs_out = 0;
2217 
2218 	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
2219 
2220 	if (sk->sk_frag.page) {
2221 		put_page(sk->sk_frag.page);
2222 		sk->sk_frag.page = NULL;
2223 		sk->sk_frag.offset = 0;
2224 	}
2225 
2226 	sk->sk_error_report(sk);
2227 	return err;
2228 }
2229 
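/* Thin wrapper around inet_csk_accept() that additionally logs the
 * accept-queue state for debugging.
 */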
2230 struct sock *ninet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
2231 {
2232 	struct sock *newsk;
2233 	u32 sk_ack_backlog_last = READ_ONCE(sk->sk_ack_backlog);
2234 	u32 sk_max_ack_backlog = READ_ONCE(sk->sk_max_ack_backlog);
2235 
2236 	newsk = inet_csk_accept(sk, flags, err, kern);
2237 	nip_dbg("accept %s, sk_ack_backlog_last=%u, sk_max_ack_backlog=%u, err=%d",
2238 		(newsk ? "ok" : "fail"), sk_ack_backlog_last, sk_max_ack_backlog,
2239 		*err);
2240 
2241 	return newsk;
2242 }
2243 
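/* sendpage() is not supported for NewIP TCP sockets. */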
2244 static int tcp_nip_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
2245 			    int flags)
2246 {
2247 	return -EINVAL;
2248 }
2249 
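/* pre_connect() is not supported for NewIP TCP sockets. */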
2250 static int tcp_nip_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
2251 {
2252 	return -EINVAL;
2253 }
2254 
2255 struct proto tcp_nip_prot = {
2256 	.name			= "NIP_TCP",
2257 	.owner			= THIS_MODULE,
2258 	.close			= tcp_nip_close,
2259 	.pre_connect		= tcp_nip_pre_connect,
2260 	.connect		= tcp_nip_connect,
2261 	.disconnect		= tcp_nip_disconnect,
2262 	.accept			= ninet_csk_accept,
2263 	.ioctl			= tcp_ioctl,
2264 	.init			= tcp_nip_init_sock,
2265 	.destroy		= tcp_nip_destroy_sock,
2266 	.shutdown		= tcp_nip_shutdown,
2267 	.setsockopt		= tcp_setsockopt,
2268 	.getsockopt		= tcp_getsockopt,
2269 	.keepalive		= tcp_set_keepalive,
2270 	.recvmsg		= tcp_nip_recvmsg,
2271 	.sendmsg		= tcp_nip_sendmsg,
2272 	.sendpage		= tcp_nip_sendpage,
2273 	.backlog_rcv		= tcp_nip_do_rcv,
2274 	.release_cb		= tcp_nip_release_cb,
2275 	.hash			= ninet_hash,
2276 	.unhash			= ninet_unhash,
2277 	.get_port		= inet_csk_get_port,
2278 	.sockets_allocated	= &tcp_sockets_allocated,
2279 	.orphan_count		= &tcp_orphan_count,
2280 	.memory_allocated	= &tcp_memory_allocated,
2281 	.memory_pressure	= &tcp_memory_pressure,
2282 	.sysctl_mem		= sysctl_tcp_mem,
2283 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2284 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2285 	.max_header		= MAX_TCP_HEADER,
2286 	.obj_size		= sizeof(struct tcp_nip_sock),
2287 	.rsk_prot		= &tcp_nip_request_sock_ops,
2288 	.h.hashinfo		= &tcp_hashinfo,
2289 	.no_autobind		= true,
2290 };
2291 
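/* Errors reported by the NewIP layer are currently ignored for TCP sockets. */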
2292 static void tcp_nip_err_handler(struct sk_buff *skb,
2293 				struct ninet_skb_parm *opt,
2294 				u8 type, u8 code, int offset, __be32 info)
2295 {
2296 }
2297 
2298 static const struct ninet_protocol tcp_nip_protocol = {
2299 	.early_demux		= tcp_nip_early_demux,
2300 	.handler		= tcp_nip_rcv,
2301 	.err_handler		= tcp_nip_err_handler,
2302 	.flags			= 0,
2303 };
2304 
2305 static struct inet_protosw tcp_nip_protosw = {
2306 	.type		=	SOCK_STREAM,
2307 	.protocol	=	IPPROTO_TCP,
2308 	.prot		=	&tcp_nip_prot,
2309 	.ops		=	&ninet_stream_ops,
2310 	.flags		=	INET_PROTOSW_PERMANENT |
2311 				INET_PROTOSW_ICSK,
2312 };
2313 
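/* Register the NewIP TCP protocol handler and its protosw entry. */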
2314 int __init tcp_nip_init(void)
2315 {
2316 	int ret;
2317 
2318 	ret = ninet_add_protocol(&tcp_nip_protocol, IPPROTO_TCP);
2319 	if (ret)
2320 		goto out;
2321 
2322 	/* register ninet protocol */
2323 	ret = ninet_register_protosw(&tcp_nip_protosw);
2324 	if (ret)
2325 		goto out_nip_tcp_protocol;
2326 
2327 out:
2328 	return ret;
2329 
2330 out_nip_tcp_protocol:
2331 	ninet_del_protocol(&tcp_nip_protocol, IPPROTO_TCP);
2332 	goto out;
2333 }
2334 
2335 /* The __exit tag should only be added to a function that is called
2336  * exclusively during the exit phase; otherwise unnecessary warnings
2337  * and errors can result.
2338  */
2339 void tcp_nip_exit(void)
2340 {
2341 	ninet_unregister_protosw(&tcp_nip_protosw);
2342 	ninet_del_protocol(&tcp_nip_protocol, IPPROTO_TCP);
2343 }
2344