• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1From abeef0770f76cd0eff8e5c6e50de0b280079d7f0 Mon Sep 17 00:00:00 2001
2From: jiangheng <jiangheng14@huawei.com>
3Date: Mon, 13 Mar 2023 19:25:42 +0800
4Subject: [PATCH] fix tso small packet drop in kernel server
5
6---
7 src/core/tcp_out.c     | 254 +++++++++++++++++++++--------------------
8 src/include/lwipopts.h |   2 +
9 2 files changed, 130 insertions(+), 126 deletions(-)
10
11diff --git a/src/core/tcp_out.c b/src/core/tcp_out.c
12index 8a0d653..b1c317d 100644
13--- a/src/core/tcp_out.c
14+++ b/src/core/tcp_out.c
15@@ -1312,60 +1312,33 @@ tcp_build_wnd_scale_option(u32_t *opts)
16 #endif
17
18 #if GAZELLE_ENABLE
19-static struct tcp_seg *tcp_output_over(struct tcp_pcb *pcb, struct tcp_seg *seg, struct tcp_seg *useg)
20-{
21-  if (TCP_TCPLEN(seg) > 0) {
22-    seg->next = NULL;
23-    if (useg == NULL) {
24-      pcb->unacked = seg;
25-      pcb->last_unacked = seg;
26-      useg = seg;
27-    } else {
28-      if (TCP_SEQ_LT(lwip_ntohl(seg->tcphdr->seqno), lwip_ntohl(useg->tcphdr->seqno))) {
29-        /* add segment to before tail of unacked list, keeping the list sorted */
30-        struct tcp_seg **cur_seg = &(pcb->unacked);
31-        while (*cur_seg &&
32-              TCP_SEQ_LT(lwip_ntohl((*cur_seg)->tcphdr->seqno), lwip_ntohl(seg->tcphdr->seqno))) {
33-          cur_seg = &((*cur_seg)->next );
34-        }
35-        seg->next = (*cur_seg);
36-        (*cur_seg) = seg;
37-      } else {
38-        /* add segment to tail of unacked list */
39-        useg->next = seg;
40-        useg = seg;
41-        pcb->last_unacked = seg;
42-      }
43-    }
44-  } else {
45-    tcp_seg_free(seg);
46-  }
47-
48-  return useg;
49-}
50-static err_t tcp_output_seg(struct tcp_pcb *pcb, struct tcp_seg *seg, struct netif *netif, u32_t snd_nxt)
51-{
52-  if (pcb->state != SYN_SENT) {
53-    TCPH_SET_FLAG(seg->tcphdr, TCP_ACK);
54-  }
55-
56-  err_t err = tcp_output_segment(seg, pcb, netif);
57-  if (err != ERR_OK) {
58-    /* segment could not be sent, for whatever reason */
59-    tcp_set_flags(pcb, TF_NAGLEMEMERR);
60-    return err;
61-  }
62-
63-  if (pcb->state != SYN_SENT) {
64-    tcp_clear_flags(pcb, TF_ACK_DELAY | TF_ACK_NOW);
65-  }
66-
67-  if (TCP_SEQ_LT(pcb->snd_nxt, snd_nxt)) {
68-      pcb->snd_nxt = snd_nxt;
69-  }
70-
71-  return ERR_OK;
72-}
73+u32_t start_seqno = 0; /* NOTE(review): file-scope, non-static; TCP_INIT_SEGMENT is its only writer in this patch and no reader is visible here -- confirm it is still needed */
74+#define TCP_INIT_SEGMENT(tem_seg, _pcb, _p, _hdrflags, _seqno, _optflags) /* initialise tem_seg around pbuf _p and prepend a TCP header to it */ \
75+do { \
76+  struct tcp_seg *_seg = tem_seg; \
77+  u8_t _optlen; \
78+  rte_prefetch2(_p); /* presumably a DPDK cache-prefetch hint for the pbuf -- confirm */ \
79+  /* NOTE(review): arguments are expanded unparenthesised, several of them more than once; pass side-effect-free expressions */ \
80+  _optlen = LWIP_TCP_OPT_LENGTH_SEGMENT(_optflags, _pcb); \
81+  _seg->flags = _optflags; \
82+  _seg->next = NULL; \
83+  _seg->p = _p; \
84+  _seg->len = _p->tot_len - _optlen; /* total pbuf length minus the option bytes */ \
85+  /* build TCP header: add TCP_HLEN bytes of header to the pbuf (the return value of pbuf_add_header is not checked) */ \
86+  pbuf_add_header(_p, TCP_HLEN); \
87+  _seg->tcphdr = (struct tcp_hdr *)_seg->p->payload; \
88+  _seg->tcphdr->src = lwip_htons(_pcb->local_port); \
89+  _seg->tcphdr->dest = lwip_htons(_pcb->remote_port); \
90+ /* Only the ports, seqno, header length/flags and urgp are written by this macro; */ \
91+ /* ackno, wnd and chksum are left untouched -- presumably filled in by tcp_output_segment(), confirm. */ \
92+  _seg->tcphdr->seqno = lwip_htonl(_seqno); \
93+  /* latch the first non-zero _seqno seen into the global start_seqno */ \
94+  if (start_seqno == 0) {\
95+      start_seqno = _seqno; \
96+  } \
97+  TCPH_HDRLEN_FLAGS_SET(_seg->tcphdr, (5 + _optlen / 4), _hdrflags); /* header length: 5 fixed words plus _optlen / 4 option words */ \
98+  _seg->tcphdr->urgp = 0; \
99+} while(0)
100 #endif
101 /**
102  * @ingroup tcp_raw
103@@ -1471,97 +1444,127 @@ tcp_output(struct tcp_pcb *pcb)
104   pcb->persist_backoff = 0;
105
106   /* useg should point to last segment on unacked queue */
107-  useg = pcb->last_unacked;
108+  useg = pcb->unacked;
109+  if (useg != NULL) {
110+    for (; useg->next != NULL; useg = useg->next);
111+  }
112
113   /* data available and window allows it to be sent? */
114-
115-  u32_t send_len = 0;
116 #if GAZELLE_ENABLE
117   if ((get_eth_params_tx_ol() & DEV_TX_OFFLOAD_TCP_TSO) && pcb->need_tso_send) {
118-    while(seg && send_len < 0xffff) {
119-      /**
120-       * 1) walk unsent queue, find all seg witch wait to send. chain buf in these segs.
121-       * 2) create new segment, send and free new segment.
122-       * 3) update snd_nxt, unacked queue, and unsent queue
123-       */
124-      struct tcp_seg *start_seg = seg;
125-      struct pbuf *first_pbuf = NULL;
126-      struct pbuf *pre_pbuf = NULL;
127-      u8_t pbuf_chain_len = 0;
128-      u32_t next_seqno = lwip_ntohl(seg->tcphdr->seqno);
129-      while (seg != NULL && pbuf_chain_len < GAZELLE_TCP_MAX_PBUF_CHAIN_LEN) {
130+     uint16_t send_pkt = 0;
131+
132+     do {
133+        struct tcp_seg * start_seg = seg;
134+        struct pbuf *new_pbuf = NULL;
135+
136+        struct pbuf *tmp_pbuf = NULL;
137         u32_t seg_seqno = lwip_ntohl(seg->tcphdr->seqno);
138-        if (seg_seqno - pcb->lastack + seg->len > wnd) {
139-          if (first_pbuf)
140-            break;
141-          else
142-            goto output_done;
143+        u32_t last_seg_seqno = seg_seqno;
144+
145+        struct tcp_seg *last_seg = NULL;
146+        u16_t last_seg_len = 0;
147+        u8_t pbuf_chain_len = 0;
148+        while (seg != NULL && seg_seqno - pcb->lastack + seg->len <= wnd && pbuf_chain_len < GAZELLE_TCP_MAX_PBUF_CHAIN_LEN) {
149+            if (last_seg_len != 0 && (last_seg_len + seg->len < 1460) && seg->len < GAZELLE_TCP_MIN_TSO_SEG_LEN) {
150+                break;
151+            }
152+
153+            if ((tcp_do_output_nagle(pcb) == 0) &&
154+                ((pcb->flags & (TF_NAGLEMEMERR | TF_FIN)) == 0)) {
155+                break;
156+            }
157+            if (last_seg_seqno + last_seg_len == seg_seqno) {
158+                pbuf_remove_header(seg->p, seg->p->tot_len - seg->len);
159+                if (new_pbuf == NULL) {
160+                    new_pbuf = seg->p;
161+                    tmp_pbuf = new_pbuf;
162+                } else {
163+                    new_pbuf->tot_len += seg->p->len;
164+                    tmp_pbuf->next = seg->p;
165+                    tmp_pbuf = tmp_pbuf->next;
166+                }
167+            } else {
168+                break;
169+            }
170+
171+            last_seg = seg;
172+            last_seg_len = seg->len;
173+            last_seg_seqno = seg_seqno;
174+            seg = seg->next;
175+            seg_seqno = (seg != NULL) ? lwip_ntohl(seg->tcphdr->seqno) : seg_seqno;
176+            pbuf_chain_len++;
177         }
178
179-        if ((tcp_do_output_nagle(pcb) == 0) && ((pcb->flags & (TF_NAGLEMEMERR | TF_FIN)) == 0)) {
180-          if (first_pbuf)
181-            break;
182-          else
183-            goto output_done;
184+        // nothing was chained (send window exhausted or Nagle held the first segment back): stop sending
185+        if (new_pbuf == NULL) {
186+            goto end_loop;
187         }
188
189-        if (seg->len < TCP_MSS || next_seqno != seg_seqno || pbuf_chain_len >= GAZELLE_TCP_MAX_PBUF_CHAIN_LEN) {
190-          break;
191-        }
192-        if (first_pbuf == NULL && (seg->next == NULL || seg->next->len < TCP_MSS)) {
193-          break;
194-        }
195+        struct tcp_seg new_seg;
196+        TCP_INIT_SEGMENT(&new_seg, pcb, new_pbuf, 0, lwip_ntohl(start_seg->tcphdr->seqno), 0);
197
198-        pbuf_remove_header(seg->p, seg->p->tot_len - seg->len);
199-        if (first_pbuf == NULL) {
200-          first_pbuf = seg->p;
201-        } else {
202-          first_pbuf->tot_len += seg->p->len;
203-          pre_pbuf->next = seg->p;
204+        if (pcb->state != SYN_SENT) {
205+              TCPH_SET_FLAG(new_seg.tcphdr, TCP_ACK);
206         }
207
208-        send_len += seg->len;
209-        pre_pbuf = seg->p;
210-        next_seqno = seg_seqno + TCP_TCPLEN(seg);
211-        seg = seg->next;
212-        pcb->unsent = seg;
213-        pbuf_chain_len++;
214-      }
215-
216-      if (first_pbuf == NULL) {
217-        err = tcp_output_seg(pcb, seg, netif, next_seqno + seg->len);
218+        err = tcp_output_segment(&new_seg, pcb, netif);
219         if (err != ERR_OK) {
220-          if (pcb->unsent == NULL)
221-            pcb->last_unsent = NULL;
222-	  pcb->need_tso_send = 0;
223-          return err;
224+            /* segment could not be sent, for whatever reason */
225+            tcp_set_flags(pcb, TF_NAGLEMEMERR);
226+            return err;
227         }
228-        pcb->unsent = seg->next;
229-        useg = tcp_output_over(pcb, seg, useg);
230-        seg = pcb->unsent;
231-        continue;
232-      }
233-
234-      struct tcp_seg new_seg;
235-      tcp_init_segment(&new_seg, pcb, first_pbuf, 0, lwip_ntohl(start_seg->tcphdr->seqno), 0);
236
237-      err = tcp_output_seg(pcb, &new_seg, netif, next_seqno);
238+        pcb->unsent = last_seg->next;
239+        if (pcb->state != SYN_SENT) {
240+            tcp_clear_flags(pcb, TF_ACK_DELAY | TF_ACK_NOW);
241+        }
242
243-      for (u32_t i = 0; i < pbuf_chain_len; i++) {
244-        struct tcp_seg *next_seg = start_seg->next;
245-        start_seg->p->next = NULL;
246-        useg = tcp_output_over(pcb, start_seg, useg);
247-        start_seg = next_seg;
248-      }
249+        snd_nxt = last_seg_seqno + TCP_TCPLEN(last_seg);
250+        if (TCP_SEQ_LT(pcb->snd_nxt, snd_nxt)) {
251+            pcb->snd_nxt = snd_nxt;
252+        }
253
254-      pbuf_remove_header(new_seg.p, new_seg.p->tot_len - new_seg.len - TCPH_HDRLEN_BYTES(new_seg.tcphdr));
255-      new_seg.p->tot_len = new_seg.p->len;
256-    }
257-  pcb->need_tso_send = 0;
258+        pbuf_remove_header(new_seg.p, new_seg.p->tot_len - new_seg.len - TCP_HLEN);
259+        new_seg.p->tot_len = new_seg.p->len;
260+
261+        for (int start = pbuf_chain_len; start > 0; start--) {
262+            struct tcp_seg *tmp_seg = start_seg;
263+            start_seg = start_seg->next;
264+            tmp_seg->p->next = NULL;
265+            if (TCP_TCPLEN(tmp_seg) > 0) {
266+                tmp_seg->next = NULL;
267+                if (pcb->unacked == NULL) {
268+                    pcb->unacked = tmp_seg;
269+                    useg = tmp_seg;
270+                } else {
271+                    if (TCP_SEQ_LT(lwip_ntohl(tmp_seg->tcphdr->seqno), lwip_ntohl(useg->tcphdr->seqno))) {
272+                        /* insert segment before the tail of the unacked list, keeping the list sorted */
273+                        struct tcp_seg **cur_seg = &(pcb->unacked);
274+                        while (*cur_seg &&
275+                            TCP_SEQ_LT(lwip_ntohl((*cur_seg)->tcphdr->seqno), lwip_ntohl(tmp_seg->tcphdr->seqno)))     {
276+                            cur_seg = &((*cur_seg)->next );
277+                        }
278+                        tmp_seg->next = (*cur_seg);
279+                        (*cur_seg) = tmp_seg;
280+                    } else {
281+                        /* add segment to tail of unacked list */
282+                        useg->next = tmp_seg;
283+                        useg = useg->next;
284+                    }
285+                }
286+            } else {
287+              tcp_seg_free(tmp_seg);
288+            }
289+        }
290+     } while(seg != NULL && lwip_ntohl(seg->tcphdr->seqno) - pcb->lastack + seg->len <= wnd && send_pkt++ < 10);
291+end_loop:
292+    pcb->need_tso_send = 0;
293   } else
294 #endif
295 {
296-  while (seg != NULL && send_len < 0xffff &&
297+  uint16_t send_pkt = 0;
298+  while (seg != NULL && send_pkt++ < 10 &&
299          lwip_ntohl(seg->tcphdr->seqno) - pcb->lastack + seg->len <= wnd) {
300     LWIP_ASSERT("RST not expected here!",
301                 (TCPH_FLAGS(seg->tcphdr) & TCP_RST) == 0);
302@@ -1576,7 +1579,6 @@ tcp_output(struct tcp_pcb *pcb)
303         ((pcb->flags & (TF_NAGLEMEMERR | TF_FIN)) == 0)) {
304       break;
305     }
306-    send_len += seg->len;
307 #if TCP_CWND_DEBUG
308     LWIP_DEBUGF(TCP_CWND_DEBUG, ("tcp_output: snd_wnd %"TCPWNDSIZE_F", cwnd %"TCPWNDSIZE_F", wnd %"U32_F", effwnd %"U32_F", seq %"U32_F", ack %"U32_F", i %"S16_F"\n",
309                                  pcb->snd_wnd, pcb->cwnd, wnd,
310diff --git a/src/include/lwipopts.h b/src/include/lwipopts.h
311index 742b4a9..0d2a6d9 100644
312--- a/src/include/lwipopts.h
313+++ b/src/include/lwipopts.h
314@@ -55,6 +55,8 @@
315
316 #define GAZELLE_TCP_MAX_PBUF_CHAIN_LEN 40
317
318+#define GAZELLE_TCP_MIN_TSO_SEG_LEN 256
319+
320 /*
321    ----------------------------------
322    ---------- NIC offloads ----------
323--
3242.33.0
325
326