• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "nnacl/fp16/pack_fp16.h"
18 #include <string.h>
19 
#ifdef ENABLE_ARM
// Packs a 3x3 depthwise-conv weight from NCHW into NC8HW8 while applying the
// 1D Winograd F(2,3) weight transform (G = [1 0 0; .5 .5 .5; .5 -.5 .5; 0 0 1])
// along the kernel-width axis: each 3-tap row becomes 4 transformed taps.
void PackWeightConvDw3x3Fp16(const void *src, void *dst, int channel) {
  // nchw to nc8hw8 with 1D F(2,3)
  for (int i = 0; i < channel; i++) {
    // one 3x3 kernel per channel in the source
    float16_t *src_kernel = (float16_t *)src + i * 9;
    // dst block: 8 channels x 3 rows x 4 transformed taps = 96 values per C8 group,
    // channel-minor (stride 1 within a group), taps 8 apart, rows 32 apart
    float16_t *dst_kernel = (float16_t *)dst + (i / 8) * 96 + i % 8;
    for (int y = 0; y < 3; y++) {
      float16_t g0 = src_kernel[3 * y];
      float16_t g1 = src_kernel[3 * y + 1];
      float16_t g2 = src_kernel[3 * y + 2];

      // G * g for this kernel row
      dst_kernel[32 * y] = g0;
      dst_kernel[32 * y + 8] = (float16_t)0.5 * (g0 + g1 + g2);
      dst_kernel[32 * y + 16] = (float16_t)0.5 * (g0 - g1 + g2);
      dst_kernel[32 * y + 24] = g2;
    }
  }
}
#endif
39 
// Im2col for fp16 convolution: gathers `real_cal_num` output-pixel patches,
// starting at flattened output index `block_index`, from an NHWC input into
// `packed_input` (one kernel_h x kernel_w x in_channel patch per output pixel).
// Kernel taps falling outside the image are skipped, i.e. those slots in
// packed_input are left untouched — presumably the caller pre-zeroes the
// buffer; TODO confirm against the callers.
void Im2ColPackUnitFp16(const float16_t *input_data, const ConvParameter *conv_param, float16_t *packed_input,
                        int real_cal_num, int block_index) {
  // input format : nhwc
  int kernel_h = conv_param->kernel_h_;
  int kernel_w = conv_param->kernel_w_;
  int kernel_plane = kernel_h * kernel_w;
  int stride_h = conv_param->stride_h_;
  int stride_w = conv_param->stride_w_;
  int pad_h = conv_param->pad_u_;
  int pad_w = conv_param->pad_l_;
  int dilation_h = conv_param->dilation_h_;
  int dilation_w = conv_param->dilation_w_;
  int in_channel = conv_param->input_channel_;
  int in_h = conv_param->input_h_;
  int in_w = conv_param->input_w_;
  int out_w = conv_param->output_w_;

  for (int i = 0; i < real_cal_num; i++) {
    int block_start = block_index + i;
    // top-left input coordinate of this output pixel's receptive field (may be negative)
    int input_h = block_start / out_w * stride_h - pad_h;
    int input_w = block_start % out_w * stride_w - pad_w;
    int input_stride = (input_h * in_w + input_w) * in_channel;
    // clamp the kernel tap range [kh_s, kh_e) x [kw_s, kw_e) to taps inside the image
    int kh_s = MSMAX(0, UP_DIV(-input_h, dilation_h));
    int kh_e = MSMIN(kernel_h, UP_DIV(in_h - input_h, dilation_h));
    int kw_s = MSMAX(0, UP_DIV(-input_w, dilation_w));
    int kw_e = MSMIN(kernel_w, UP_DIV(in_w - input_w, dilation_w));
    if (dilation_h == 1 && dilation_w == 1) {
      // undilated: all in-bounds taps of a kernel row are contiguous -> one memcpy per row
      for (int j = kh_s; j < kh_e; j++) {
        int input_y_stride = j * in_w * in_channel + input_stride;
        int input_x_stride = input_y_stride + kw_s * in_channel;
        int input_plane_offset = (j * kernel_w + kw_s) * in_channel + i * in_channel * kernel_plane;
        memcpy(packed_input + input_plane_offset, input_data + input_x_stride,
               (kw_e - kw_s) * in_channel * sizeof(float16_t));
      }  // kernel_h loop
    } else {
      // dilated: taps are strided, so copy one channel vector per tap
      for (int j = kh_s; j < kh_e; j++) {
        int input_y_stride = j * dilation_h * in_w * in_channel + input_stride;
        for (int n = kw_s; n < kw_e; n++) {
          int input_x_stride = input_y_stride + n * dilation_w * in_channel;
          int input_plane_offset = (j * kernel_w + n) * in_channel + i * in_channel * kernel_plane;
          memcpy(packed_input + input_plane_offset, input_data + input_x_stride, in_channel * sizeof(float16_t));
        }  // kernel_w loop
      }    // kernel_h loop
    }
  }  // tile num loop
}
86 
// Swaps the two spatial axes of an HWC tensor: dst[w][h][c] = src[h][w][c].
void PackHWCToWHCFp16(const float16_t *src, float16_t *dst, int height, int width, int channel) {
  size_t pixel_bytes = channel * sizeof(float16_t);
  for (int w = 0; w < width; ++w) {
    for (int h = 0; h < height; ++h) {
      const float16_t *src_pixel = src + (h * width + w) * channel;
      float16_t *dst_pixel = dst + (w * height + h) * channel;
      memcpy(dst_pixel, src_pixel, pixel_bytes);
    }
  }
}
94 
// Repacks an OHWI conv weight into per-output-channel C8 blocks: the input
// channel axis is split into groups of C8NUM, each group stored plane-major
// with C8NUM channels per kernel position. Padding slots of a partial final
// group are left untouched.
void PackWeightToC8Fp16(const float16_t *origin_weight_data, float16_t *packed_weight_data,
                        const ConvParameter *conv_param) {
  // origin weight format : ohwi
  int in_c = conv_param->input_channel_;
  int ic8 = UP_DIV(in_c, C8NUM);
  int out_c = conv_param->output_channel_;
  int plane = conv_param->kernel_h_ * conv_param->kernel_w_;

  for (int o = 0; o < out_c; o++) {
    const float16_t *src_oc = origin_weight_data + o * plane * in_c;
    float16_t *dst_oc = packed_weight_data + o * ic8 * plane * C8NUM;
    for (int k = 0; k < plane; k++) {
      const float16_t *src_k = src_oc + k * in_c;
      float16_t *dst_k = dst_oc + k * C8NUM;
      for (int i = 0; i < in_c; i++) {
        dst_k[(i / C8NUM) * plane * C8NUM + (i % C8NUM)] = src_k[i];
      }
    }
  }
}
119 
// Repacks an OHWI conv weight into per-output-channel C4 blocks. Note that
// the C4 block count is UP_DIV(in_c, C8NUM) * 2 — the input channels are
// padded to a multiple of C8NUM (not C4NUM) so the layout stays C8-aligned.
void PackWeightToC4Fp16(const float16_t *origin_weight_data, float16_t *packed_weight_data,
                        const ConvParameter *conv_param) {
  // origin weight format : ohwi
  int in_c = conv_param->input_channel_;
  int ic4 = UP_DIV(in_c, C8NUM) * 2;
  int out_c = conv_param->output_channel_;
  int plane = conv_param->kernel_h_ * conv_param->kernel_w_;

  for (int o = 0; o < out_c; o++) {
    const float16_t *src_oc = origin_weight_data + o * plane * in_c;
    float16_t *dst_oc = packed_weight_data + o * ic4 * plane * C4NUM;
    for (int k = 0; k < plane; k++) {
      const float16_t *src_k = src_oc + k * in_c;
      float16_t *dst_k = dst_oc + k * C4NUM;
      for (int i = 0; i < in_c; i++) {
        dst_k[(i / C4NUM) * plane * C4NUM + (i % C4NUM)] = src_k[i];
      }
    }
  }
}
145 
PackNHWCToNC4HW4Fp16(const void * src,void * dst,int batch,int plane,int channel)146 void PackNHWCToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int channel) {
147   int c4 = UP_DIV(channel, C4NUM);
148   for (int b = 0; b < batch; b++) {
149     int src_oc_offset = b * plane * channel;
150     int dst_oc_offset = b * plane * c4 * C4NUM;
151     for (int k = 0; k < plane; k++) {
152       int src_kernel_offset = src_oc_offset + k * channel;
153       int dst_kernel_offset = dst_oc_offset + k * C4NUM;
154       for (int i = 0; i < channel; i++) {
155         int c4_block_num = i / C4NUM;
156         int c4_block_rem = i % C4NUM;
157         int src_ic_offset = src_kernel_offset + i;
158         int dst_ic_offset = dst_kernel_offset + c4_block_num * plane * C4NUM + c4_block_rem;
159         ((float16_t *)dst + dst_ic_offset)[0] = ((float16_t *)src + src_ic_offset)[0];
160       }
161     }
162   }
163 }
164 
PackNCHWToNC4HW4Fp16(const void * src,void * dst,int batch,int plane,int channel)165 void PackNCHWToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int channel) {
166   int c4 = UP_DIV(channel, C4NUM);
167   for (int b = 0; b < batch; b++) {
168     int src_offset = b * plane * channel;
169     int dst_offset = b * plane * c4 * C4NUM;
170     for (int c = 0; c < channel; c++) {
171       int c4_block_num = c / C4NUM;
172       int c4_block_rem = c % C4NUM;
173       int src_c_offset = src_offset + c * plane;
174       int dst_c_offset = dst_offset + c4_block_num * plane * C4NUM;
175       for (int k = 0; k < plane; k++) {
176         int src_kernel_offset = src_c_offset + k;
177         int dst_kernel_offset = dst_c_offset + C4NUM * k + c4_block_rem;
178         ((float16_t *)dst + dst_kernel_offset)[0] = ((float16_t *)src + src_kernel_offset)[0];
179       }
180     }
181   }
182 }
183 
// NHWC -> NCHW: transposes each batch's plane x channel matrix. Work is split
// across `thread_count` workers along the hw axis in whole tiles of `hw_tile`
// rows (16 on arm64, 8 elsewhere); `task_id` selects this worker's slice, and
// the last worker also takes the ragged tail. thread_count <= 0 means
// single-threaded over the whole plane.
void PackNHWCToNCHWFp16(const void *src, void *dst, int batches, int plane, int channel, int task_id,
                        int thread_count) {
#ifdef ENABLE_ARM64
  // Transpose16x8 in arm64
  const int hw_tile = C16NUM;
#else
  // Transpose8x8 in others
  const int hw_tile = C8NUM;
#endif
  int hw_align = plane / hw_tile;  // full-tile count; rescaled to a row index below
  int task_start = 0;
  int task_end = plane;
  if (thread_count > 0) {
    // per-worker share, rounded up to whole tiles
    int offset_hw = UP_DIV(hw_align, thread_count) * hw_tile;
    task_start = offset_hw * task_id;
    int count = plane - task_start;
    if (count <= 0) {
      return;  // more workers than tiles: nothing left for this task
    }
    task_end = (task_id + 1) == thread_count ? plane : MSMIN(plane, task_start + offset_hw);
    // rows the tiled fast path may cover; a partial slice falls to the scalar loop
    hw_align = task_start + ((task_end - task_start) >= offset_hw ? offset_hw : 0);
  } else {
    hw_align *= hw_tile;
  }
  int c8 = channel / C8NUM * C8NUM;  // channels covered by full 8-wide columns
  int batch = plane * channel;
  for (int n = 0; n < batches; n++) {
    const float16_t *src_batch = (const float16_t *)src + n * batch;
    float16_t *dst_batch = (float16_t *)dst + n * batch;
    int hw = task_start;
    // tiled region: hw_tile x C8NUM sub-blocks via asm kernels where available
    for (; hw < hw_align; hw += hw_tile) {
      int c = 0;
      for (; c < c8; c += C8NUM) {
        const float16_t *src_ptr = src_batch + hw * channel + c;
        float16_t *dst_ptr = dst_batch + c * plane + hw;
#ifdef ENABLE_ARM64
        size_t src_stride = channel * sizeof(float16_t);
        size_t dst_stride = plane * sizeof(float16_t);
        Transpose16x8ARM64Fp16(src_ptr, dst_ptr, src_stride, dst_stride);
#elif defined(ENABLE_ARM82_A32)
        size_t src_stride = channel * sizeof(float16_t);
        size_t dst_stride = plane * sizeof(float16_t);
        Transpose8x8A32Fp16(src_ptr, dst_ptr, src_stride, dst_stride);
#else
        // portable scalar tile transpose
        for (int tr = 0; tr < hw_tile; tr++) {
          for (int tc = 0; tc < C8NUM; tc++) {
            dst_ptr[tc * plane + tr] = src_ptr[tr * channel + tc];
          }
        }
#endif
      }
      // leftover channels (< C8NUM) of this row tile, element-wise
      for (; c < channel; c++) {
        const float16_t *src_ptr = src_batch + hw * channel + c;
        float16_t *dst_ptr = dst_batch + c * plane + hw;
        for (size_t i = 0; i < hw_tile; i++) {
          dst_ptr[i] = src_ptr[i * channel];
        }
      }
    }
    // leftover rows of this worker's slice, element-wise
    for (; hw < task_end; hw++) {
      const float16_t *src_ptr = src_batch + hw * channel;
      float16_t *dst_ptr = dst_batch + hw;
      for (size_t i = 0; i < channel; i++) {
        dst_ptr[i * plane] = src_ptr[i];
      }
    }
  }
}
252 
// NCHW -> NHWC is the same permutation as NHWC -> NCHW with the plane and
// channel extents swapped, so delegate with the two sizes exchanged.
void PackNCHWToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count) {
  // Fix: `return <void expression>;` is a constraint violation in strict C
  // (C11 6.8.6.4p1); call the helper, then fall off the end of the function.
  PackNHWCToNCHWFp16(src, dst, batch, channel, plane, task_id, thread_count);
}
256 
PackNHWCToNHWC4Fp16(const void * src,void * dst,int batch,int plane,int channel)257 void PackNHWCToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int channel) {
258   int ic4 = UP_DIV(channel, C4NUM);
259   int c4_channel = ic4 * C4NUM;
260   int nhwc4_batch_unit_offset = ic4 * C4NUM * plane;
261   int ic_remainder_ = channel % C4NUM;
262   if (ic_remainder_ != 0) {
263     int nhwc4_batch_offset = 0;
264     for (int b = 0; b < batch; b++) {
265       int batch_offset = b * channel * plane;
266       for (int i = 0; i < plane; i++) {
267         float16_t *dst_per_plane = (float16_t *)dst + nhwc4_batch_offset + i * c4_channel;
268         memcpy(dst_per_plane, (float16_t *)src + batch_offset + i * channel, channel * sizeof(float16_t));
269         for (int j = channel; j < c4_channel; ++j) {
270           dst_per_plane[j] = 0;
271         }
272       }
273       nhwc4_batch_offset += nhwc4_batch_unit_offset;
274     }
275   } else {
276     size_t ori_input_size = batch * plane * channel * sizeof(float16_t);
277     memcpy(dst, src, ori_input_size);
278   }
279 }
280 
PackNHWCToNHWC8Fp16(const void * src,void * dst,int batch,int plane,int channel)281 void PackNHWCToNHWC8Fp16(const void *src, void *dst, int batch, int plane, int channel) {
282   int ic8 = UP_DIV(channel, C8NUM);
283   int c8_channel = ic8 * C8NUM;
284   int nhwc8_batch_unit_offset = ic8 * C8NUM * plane;
285   int ic_remainder_ = channel % C8NUM;
286   if (ic_remainder_ != 0) {
287     int nhwc8_batch_offset = 0;
288     for (int b = 0; b < batch; b++) {
289       int batch_offset = b * channel * plane;
290       for (int i = 0; i < plane; i++) {
291         float16_t *dst_per_plane = (float16_t *)dst + nhwc8_batch_offset + i * c8_channel;
292         memcpy(dst_per_plane, (float16_t *)src + batch_offset + i * channel, channel * sizeof(float16_t));
293         for (int j = channel; j < c8_channel; ++j) {
294           dst_per_plane[j] = 0;
295         }
296       }
297       nhwc8_batch_offset += nhwc8_batch_unit_offset;
298     }
299   } else {
300     size_t ori_input_size = batch * plane * channel * sizeof(float16_t);
301     memcpy(dst, src, ori_input_size);
302   }
303 }
304 
PackNHWC4ToNHWCFp16(const void * src,void * dst,int batch,int plane,int channel)305 void PackNHWC4ToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel) {
306   int c4 = UP_DIV(channel, C4NUM);
307   int ic_remainder_ = channel % C4NUM;
308   if (ic_remainder_ != 0) {
309     int nhwc_batch_unit_offset = channel * plane;
310     for (int b = 0; b < batch; b++) {
311       int batch_offset = b * c4 * C4NUM * plane;
312       for (int i = 0; i < plane; i++) {
313         memcpy((float16_t *)dst + b * nhwc_batch_unit_offset + i * channel,
314                (float16_t *)src + batch_offset + i * c4 * C4NUM, channel * sizeof(float16_t));
315       }
316     }
317   } else {
318     size_t ori_input_size = batch * plane * channel * sizeof(float16_t);
319     memcpy((float16_t *)dst, (float16_t *)src, ori_input_size);
320   }
321 }
322 
PackNCHWToNHWC4Fp16(const void * src,void * dst,int batch,int plane,int channel)323 void PackNCHWToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int channel) {
324   int nhwc4_batch_offset = 0;
325   int ic4 = UP_DIV(channel, C4NUM);
326   int nhwc4_batch_unit_offset = ic4 * C4NUM * plane;
327 
328   for (int b = 0; b < batch; b++) {
329     int batch_offset = b * channel * plane;
330     for (int c = 0; c < channel; c++) {
331       int src_c_offset = batch_offset + c * plane;
332       int dst_c_offset = nhwc4_batch_offset + c;
333       for (int i = 0; i < plane; i++) {
334         int src_plane_offset = src_c_offset + i;
335         int dst_plane_offset = dst_c_offset + i * ic4 * C4NUM;
336         ((float16_t *)dst)[dst_plane_offset] = ((float16_t *)src)[src_plane_offset];
337       }
338     }
339     nhwc4_batch_offset += nhwc4_batch_unit_offset;
340   }
341 }
342 
PackNC4HW4ToNHWC4Fp16(const void * src,void * dst,int batch,int plane,int channel)343 void PackNC4HW4ToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int channel) {
344   int c4 = UP_DIV(channel, C4NUM);
345   for (int b = 0; b < batch; b++) {
346     int src_offset = b * plane * c4 * C4NUM;
347     int dst_offset = b * plane * channel;
348     for (int c = 0; c < channel; c++) {
349       int c4_block_num = c / C4NUM;
350       int c4_block_res = c % C4NUM;
351       int src_c_offset = src_offset + c4_block_num * plane * C4NUM + c4_block_res;
352       int dst_c_offset = dst_offset + c4_block_num * C4NUM + c4_block_res;
353       for (int k = 0; k < plane; k++) {
354         int src_kernel_offset = src_c_offset + k * C4NUM;
355         int dst_kernel_offset = dst_c_offset + k * c4 * C4NUM;
356         ((float16_t *)dst + dst_kernel_offset)[0] = ((float16_t *)src + src_kernel_offset)[0];
357       }
358     }
359   }
360 }
361 
PackNC4HW4ToNHWCFp16(const void * src,void * dst,int batch,int plane,int channel)362 void PackNC4HW4ToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel) {
363   int c4 = UP_DIV(channel, C4NUM);
364   for (int b = 0; b < batch; b++) {
365     int src_offset = b * plane * c4 * C4NUM;
366     int dst_offset = b * plane * channel;
367     for (int c = 0; c < channel; c++) {
368       int c4_block_num = c / C4NUM;
369       int c4_block_res = c % C4NUM;
370       int src_c_offset = src_offset + c4_block_num * plane * C4NUM + c4_block_res;
371       int dst_c_offset = dst_offset + c;
372       for (int k = 0; k < plane; k++) {
373         int src_kernel_offset = src_c_offset + k * C4NUM;
374         int dst_kernel_offset = dst_c_offset + k * channel;
375         ((float16_t *)dst + dst_kernel_offset)[0] = ((float16_t *)src + src_kernel_offset)[0];
376       }
377     }
378   }
379 }
380 
PackNC4HW4ToNCHWFp16(const void * src,void * dst,int batch,int plane,int channel)381 void PackNC4HW4ToNCHWFp16(const void *src, void *dst, int batch, int plane, int channel) {
382   int c4 = UP_DIV(channel, C4NUM);
383   for (int b = 0; b < batch; b++) {
384     int src_offset = b * plane * c4 * C4NUM;
385     int dst_offset = b * plane * channel;
386     for (int c = 0; c < channel; c++) {
387       int c4_block_num = c / C4NUM;
388       int c4_block_res = c % C4NUM;
389       int src_c_offset = src_offset + c4_block_num * plane * C4NUM + c4_block_res;
390       int dst_c_offset = dst_offset + c * plane;
391       for (int k = 0; k < plane; k++) {
392         int src_kernel_offset = src_c_offset + k * C4NUM;
393         int dst_kernel_offset = dst_c_offset + k;
394         ((float16_t *)dst + dst_kernel_offset)[0] = ((float16_t *)src + src_kernel_offset)[0];
395       }
396     }
397   }
398 }
399 
// NCHW fp32 -> NC8HW8 fp16: scatters each channel plane into its C8 block at
// lane (c % C8NUM), narrowing every element to fp16.
void PackNCHWFp32ToNC8HW8Fp16(const float *src, float16_t *dst, int batch, int plane, int channel) {
  int c8 = UP_DIV(channel, C8NUM);
  for (int b = 0; b < batch; b++) {
    const float *src_batch = src + b * plane * channel;
    float16_t *dst_batch = dst + b * plane * c8 * C8NUM;
    for (int c = 0; c < channel; c++) {
      const float *src_chan = src_batch + c * plane;
      float16_t *dst_block = dst_batch + (c / C8NUM) * plane * C8NUM + (c % C8NUM);
      for (int p = 0; p < plane; p++) {
        dst_block[p * C8NUM] = (float16_t)src_chan[p];
      }
    }
  }
}
418 
// NCHW fp16 -> NC8HW8 fp16: same scatter as the fp32 variant, no conversion
// needed (source elements are already fp16).
void PackNCHWFp16ToNC8HW8Fp16(const float16_t *src, float16_t *dst, int batch, int plane, int channel) {
  int c8 = UP_DIV(channel, C8NUM);
  for (int b = 0; b < batch; b++) {
    const float16_t *src_batch = src + b * plane * channel;
    float16_t *dst_batch = dst + b * plane * c8 * C8NUM;
    for (int c = 0; c < channel; c++) {
      const float16_t *src_chan = src_batch + c * plane;
      float16_t *dst_block = dst_batch + (c / C8NUM) * plane * C8NUM + (c % C8NUM);
      for (int p = 0; p < plane; p++) {
        dst_block[p * C8NUM] = src_chan[p];
      }
    }
  }
}
437 
#ifdef Debug
// Debug-only helper: unpacks an NC8HW8 tensor back to NHWC by copying each C8
// block's valid channels into place. `batch` is unused — presumably a single
// batch is assumed; confirm with callers.
void PackNC8HW8ToNHWCFp16(const float16_t *src, float16_t *dst, int batch, int plane, int channel) {
  int block_count = UP_DIV(channel, C8NUM);
  const float16_t *src_cursor = src;
  for (int blk = 0; blk < block_count; blk++) {
    // the final block may hold fewer than C8NUM valid channels
    int cols = (blk == block_count - 1) ? channel - blk * C8NUM : C8NUM;
    float16_t *dst_cursor = dst + blk * C8NUM;
    for (int p = 0; p < plane; p++) {
      memcpy(dst_cursor, src_cursor, cols * sizeof(float16_t));
      src_cursor += cols;
      dst_cursor += channel;
    }
  }
}
#endif
454 
// NHWC fp32 -> NHWC8 fp16: narrows each element and writes it channel-minor
// with the channel axis padded to a multiple of C8NUM.
// NOTE(review): the padding lanes are not zeroed here — presumably the caller
// pre-initializes dst; confirm before relying on the pad values.
void PackNHWCFp32ToNHWC8Fp16(const float *src, float16_t *dst, int batch, int plane, int channel) {
  int c8_channel = UP_DIV(channel, C8NUM) * C8NUM;
  for (int b = 0; b < batch; b++) {
    const float *src_batch = src + b * plane * channel;
    float16_t *dst_batch = dst + b * plane * c8_channel;
    for (int p = 0; p < plane; p++) {
      const float *src_pixel = src_batch + p * channel;
      float16_t *dst_pixel = dst_batch + p * c8_channel;
      for (int c = 0; c < channel; c++) {
        dst_pixel[c] = (float16_t)src_pixel[c];
      }
    }
  }
}
469 
// NHWC fp32 -> C8HWN8 fp16: channel blocks outermost, then hw, then batch,
// then the 8 channel lanes; every element is narrowed to fp16.
void PackNHWCFp32ToC8HWN8Fp16(const float *src, float16_t *dst, int batch, int plane, int channel) {
  for (int n = 0; n < batch; n++) {
    const float *src_batch = src + n * plane * channel;
    for (int hw = 0; hw < plane; hw++) {
      const float *src_pixel = src_batch + hw * channel;
      for (int c = 0; c < channel; c++) {
        int dst_index = (c / C8NUM) * batch * plane * C8NUM + hw * batch * C8NUM + n * C8NUM + (c % C8NUM);
        dst[dst_index] = (float16_t)src_pixel[c];
      }
    }
  }
}
484 
// NHWC fp16 -> C8HWN8 fp16: same permutation as the fp32 variant, values
// copied without conversion.
void PackNHWCFp16ToC8HWN8Fp16(const float16_t *src, float16_t *dst, int batch, int plane, int channel) {
  for (int n = 0; n < batch; n++) {
    const float16_t *src_batch = src + n * plane * channel;
    for (int hw = 0; hw < plane; hw++) {
      const float16_t *src_pixel = src_batch + hw * channel;
      for (int c = 0; c < channel; c++) {
        int dst_index = (c / C8NUM) * batch * plane * C8NUM + hw * batch * C8NUM + n * C8NUM + (c % C8NUM);
        dst[dst_index] = src_pixel[c];
      }
    }
  }
}
499 
PackNHWC8Fp16ToNHWCFp32(const float16_t * src,float * dst,int batch,int plane,int channel)500 void PackNHWC8Fp16ToNHWCFp32(const float16_t *src, float *dst, int batch, int plane, int channel) {
501   int c8_channel = UP_DIV(channel, C8NUM) * C8NUM;
502   for (int b = 0; b < batch; b++) {
503     const float16_t *src_batch = src + b * plane * c8_channel;
504     float *dst_batch = dst + b * plane * channel;
505     for (int i = 0; i < plane; i++) {
506       const float16_t *src_plane = src_batch + i * c8_channel;
507       float *dst_plane = dst_batch + i * channel;
508       for (int c = 0; c < channel; c++) {
509         dst_plane[c] = (float16_t)(src_plane[c]);
510       }
511     }
512   }
513 }
514 
// NHWC8 fp16 -> NHWC fp16: drops the channel padding with one memcpy per pixel.
void PackNHWC8ToNHWCFp16(const float16_t *src, float16_t *dst, int batch, int plane, int channel) {
  int c8_channel = UP_DIV(channel, C8NUM) * C8NUM;
  size_t pixel_bytes = channel * sizeof(float16_t);
  for (int b = 0; b < batch; b++) {
    const float16_t *src_batch = src + b * plane * c8_channel;
    float16_t *dst_batch = dst + b * plane * channel;
    for (int p = 0; p < plane; p++) {
      memcpy(dst_batch + p * channel, src_batch + p * c8_channel, pixel_bytes);
    }
  }
}
527 
528 #ifdef ENABLE_ARM82_A32
// Transposes an 8x8 fp16 tile with A32 NEON VTRN/VSWP. src_stride and
// dst_stride are the row pitches of the source/destination in BYTES.
// NOTE(review): declared `inline` without `static` — assumes the build
// provides an external definition elsewhere; confirm before reusing alone.
inline void Transpose8x8A32Fp16(const float16_t *src, float16_t *dst, size_t src_stride, size_t dst_stride) {
  asm volatile(
    "mov r10, %[src]\n"
    "mov r12, %[dst]\n"
    // load rows 0-3
    "vld1.16 {q0}, [r10], %[src_stride]\n"
    "vld1.16 {q2}, [r10], %[src_stride]\n"
    "vld1.16 {q4}, [r10], %[src_stride]\n"
    "vld1.16 {q6}, [r10], %[src_stride]\n"

    // 16-bit transpose within row pairs 0/1 and 2/3
    "vtrn.16 d0, d4\n"
    "vtrn.16 d1, d5\n"
    "vtrn.16 d8, d12\n"
    "vtrn.16 d9, d13\n"

    // load rows 4-7
    "vld1.16 {q8}, [r10], %[src_stride]\n"
    "vld1.16 {q10}, [r10], %[src_stride]\n"
    "vld1.16 {q12}, [r10], %[src_stride]\n"
    "vld1.16 {q14}, [r10], %[src_stride]\n"

    // 32-bit transpose of the upper 4x4 quadrant pair
    "vtrn.32 d0, d8\n"
    "vtrn.32 d4, d12\n"
    "vtrn.32 d1, d9\n"
    "vtrn.32 d5, d13\n"

    "vtrn.16 d16, d20\n"
    "vtrn.16 d17, d21\n"
    "vtrn.16 d24, d28\n"
    "vtrn.16 d25, d29\n"

    "vtrn.32 d16, d24\n"
    "vtrn.32 d20, d28\n"
    "vtrn.32 d17, d25\n"
    "vtrn.32 d21, d29\n"

    // swap 64-bit halves to join the off-diagonal 4x4 quadrants
    "vswp d1, d16\n"
    "vswp d5, d20\n"
    "vswp d9, d24\n"
    "vswp d13, d28\n"

    "vst1.16 {q0}, [r12], %[dst_stride]\n"
    "vst1.16 {q2}, [r12], %[dst_stride]\n"
    "vst1.16 {q4}, [r12], %[dst_stride]\n"
    "vst1.16 {q6}, [r12], %[dst_stride]\n"

    "vst1.16 {q8}, [r12], %[dst_stride]\n"
    "vst1.16 {q10}, [r12], %[dst_stride]\n"
    "vst1.16 {q12}, [r12], %[dst_stride]\n"
    "vst1.16 {q14}, [r12], %[dst_stride]\n"

    :
    : [ dst ] "r"(dst), [ src ] "r"(src), [ src_stride ] "r"(src_stride), [ dst_stride ] "r"(dst_stride)
    : "r10", "r12", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
      "q15");
}
583 
// Transposes a 12x8 fp16 tile (12 source rows of 8 halfwords) into 8
// destination rows of 12 halfwords using A32 NEON. Each dst row is stored as
// one q register plus one d register ({qN, dM} = 12 halves). Strides are row
// pitches in BYTES.
inline void Transpose12x8A32Fp16(const float16_t *src_c, float16_t *dst_c, size_t src_stride, size_t dst_stride) {
  asm volatile(
    "mov r10, %[src_c]\n"
    "mov r12, %[dst_c]\n"

    // rows 0-3
    "vld1.16 {q0}, [r10], %[src_stride]\n"
    "vld1.16 {q2}, [r10], %[src_stride]\n"
    "vld1.16 {q4}, [r10], %[src_stride]\n"
    "vld1.16 {q6}, [r10], %[src_stride]\n"

    "vtrn.16 d0, d4\n"
    "vtrn.16 d1, d5\n"
    "vtrn.16 d8, d12\n"
    "vtrn.16 d9, d13\n"

    // rows 4-7
    "vld1.16 {q8}, [r10], %[src_stride]\n"
    "vld1.16 {q10}, [r10], %[src_stride]\n"
    "vld1.16 {q12}, [r10], %[src_stride]\n"
    "vld1.16 {q14}, [r10], %[src_stride]\n"

    "vtrn.32 d0, d8\n"
    "vtrn.32 d4, d12\n"
    "vtrn.32 d1, d9\n"
    "vtrn.32 d5, d13\n"

    "vtrn.16 d16, d20\n"
    "vtrn.16 d17, d21\n"
    "vtrn.16 d24, d28\n"
    "vtrn.16 d25, d29\n"

    // rows 8-11
    "vld1.16 {q1}, [r10], %[src_stride]\n"
    "vld1.16 {q3}, [r10], %[src_stride]\n"
    "vld1.16 {q5}, [r10], %[src_stride]\n"
    "vld1.16 {q7}, [r10], %[src_stride]\n"

    "vtrn.32 d16, d24\n"
    "vtrn.32 d20, d28\n"
    "vtrn.32 d17, d25\n"
    "vtrn.32 d21, d29\n"

    "vswp d1, d16\n"
    "vswp d5, d20\n"
    "vswp d9, d24\n"
    "vswp d13, d28\n"

    // transpose the trailing 4x8 strip (rows 8-11)
    "vtrn.16 d2, d6\n"
    "vtrn.16 d3, d7\n"
    "vtrn.16 d10, d14\n"
    "vtrn.16 d11, d15\n"

    "vtrn.32 d2, d10\n"
    "vtrn.32 d6, d14\n"
    "vtrn.32 d3, d11\n"
    "vtrn.32 d7, d15\n"

    // dst rows 0-3: 8 transposed halves + 4 from the trailing strip
    "vst1.16 {q0, d2}, [r12], %[dst_stride]\n"
    "vst1.16 {q2, d6}, [r12], %[dst_stride]\n"
    "vst1.16 {q4, d10}, [r12], %[dst_stride]\n"
    "vst1.16 {q6, d14}, [r12], %[dst_stride]\n"

    "vswp d3, d18\n"
    "vswp d7, d22\n"
    "vswp d11, d26\n"
    "vswp d15, d30\n"

    // dst rows 4-7
    "vst1.16 {q8, d18}, [r12], %[dst_stride]\n"
    "vst1.16 {q10, d22}, [r12], %[dst_stride]\n"
    "vst1.16 {q12, d26}, [r12], %[dst_stride]\n"
    "vst1.16 {q14, d30}, [r12], %[dst_stride]\n"

    :
    : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ src_stride ] "r"(src_stride), [ dst_stride ] "r"(dst_stride)
    : "r10", "r12", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
      "q15");
}
659 #endif
660 
661 #ifdef ENABLE_ARM64
// Transposes a 4x8 fp16 tile (4 source rows of 8 halfwords) using AArch64
// NEON zip/trn. Strides are row pitches in BYTES. dst_stride is doubled up
// front because each 8h store covers two 4-element destination rows; x11 and
// x10 then write even/odd row pairs, leapfrogging by 2 * (doubled) stride.
// ("tow_dst_stride" is a typo for "two" — kept as-is, it is only an operand name.)
inline void Transpose4x8ARM64Fp16(const float16_t *src_ptr, float16_t *dst_ptr, size_t src_stride, size_t dst_stride) {
  dst_stride += dst_stride;
  asm volatile(
    "mov x10, %[src_ptr]\n"
    "mov x11, %[dst_ptr]\n"

    "ld1 {v0.8h}, [x10], %[src_stride]\n"
    "ld1 {v1.8h}, [x10], %[src_stride]\n"
    "ld1 {v2.8h}, [x10], %[src_stride]\n"
    "ld1 {v3.8h}, [x10], %[src_stride]\n"
    // x10 becomes the second write cursor, offset one (doubled) row
    "add x10, x11, %[dst_stride]\n"

    "zip1 v4.8h, v0.8h, v1.8h\n"
    "zip1 v5.8h, v2.8h, v3.8h\n"

    "trn1 v6.4s, v4.4s, v5.4s\n"
    "trn2 v7.4s, v4.4s, v5.4s\n"

    "trn1 v24.2d, v6.2d, v7.2d\n"
    "trn2 v25.2d, v6.2d, v7.2d\n"

    "zip2 v8.8h, v0.8h, v1.8h\n"
    "zip2 v9.8h, v2.8h, v3.8h\n"

    "trn1 v10.4s, v8.4s, v9.4s\n"
    "trn2 v11.4s, v8.4s, v9.4s\n"

    "trn1 v26.2d, v10.2d, v11.2d\n"
    "trn2 v27.2d, v10.2d, v11.2d\n"

    "st1 {v24.8h}, [x11], %[tow_dst_stride]\n"
    "st1 {v25.8h}, [x10], %[tow_dst_stride]\n"
    "st1 {v26.8h}, [x11], %[tow_dst_stride]\n"
    "st1 {v27.8h}, [x10], %[tow_dst_stride]\n"
    :
    : [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ src_stride ] "r"(src_stride),
      [ dst_stride ] "r"(dst_stride), [ tow_dst_stride ] "r"(2 * dst_stride)
    : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v24", "v25", "v26",
      "v27");
}
702 
// Transposes an 8x8 fp16 tile using AArch64 NEON zip/trn. Strides are row
// pitches in BYTES. x11 writes even destination rows and x10 (offset by
// dst_stride) writes odd rows, both stepping by 2 * dst_stride.
inline void Transpose8x8ARM64Fp16(const float16_t *src_ptr, float16_t *dst_ptr, size_t src_stride, size_t dst_stride) {
  asm volatile(
    "mov x10, %[src_ptr]\n"
    "mov x11, %[dst_ptr]\n"

    "ld1 {v0.8h}, [x10], %[src_stride]\n"
    "ld1 {v1.8h}, [x10], %[src_stride]\n"
    "ld1 {v2.8h}, [x10], %[src_stride]\n"
    "ld1 {v3.8h}, [x10], %[src_stride]\n"
    "ld1 {v4.8h}, [x10], %[src_stride]\n"
    "ld1 {v5.8h}, [x10], %[src_stride]\n"
    "ld1 {v6.8h}, [x10], %[src_stride]\n"
    "ld1 {v7.8h}, [x10], %[src_stride]\n"
    // x10 becomes the odd-row write cursor
    "add x10, x11, %[dst_stride]\n"

    // low halves: 16-bit, 32-bit, then 64-bit interleave -> dst rows 0-3
    "zip1 v16.8h, v0.8h, v1.8h\n"
    "zip1 v17.8h, v2.8h, v3.8h\n"
    "zip1 v18.8h, v4.8h, v5.8h\n"
    "zip1 v19.8h, v6.8h, v7.8h\n"

    "trn1 v20.4s, v16.4s, v17.4s\n"
    "trn2 v21.4s, v16.4s, v17.4s\n"
    "trn1 v22.4s, v18.4s, v19.4s\n"
    "trn2 v23.4s, v18.4s, v19.4s\n"

    "trn1 v24.2d, v20.2d, v22.2d\n"
    "trn2 v26.2d, v20.2d, v22.2d\n"
    "trn1 v25.2d, v21.2d, v23.2d\n"
    "trn2 v27.2d, v21.2d, v23.2d\n"

    // high halves -> dst rows 4-7
    "zip2 v8.8h, v0.8h, v1.8h\n"
    "zip2 v9.8h, v2.8h, v3.8h\n"
    "zip2 v10.8h, v4.8h, v5.8h\n"
    "zip2 v11.8h, v6.8h, v7.8h\n"

    "trn1 v12.4s, v8.4s, v9.4s\n"
    "trn2 v13.4s, v8.4s, v9.4s\n"
    "trn1 v14.4s, v10.4s, v11.4s\n"
    "trn2 v15.4s, v10.4s, v11.4s\n"

    "trn1 v28.2d, v12.2d, v14.2d\n"
    "trn2 v30.2d, v12.2d, v14.2d\n"
    "trn1 v29.2d, v13.2d, v15.2d\n"
    "trn2 v31.2d, v13.2d, v15.2d\n"

    "st1 {v24.8h}, [x11], %[tow_dst_stride]\n"
    "st1 {v25.8h}, [x10], %[tow_dst_stride]\n"
    "st1 {v26.8h}, [x11], %[tow_dst_stride]\n"
    "st1 {v27.8h}, [x10], %[tow_dst_stride]\n"
    "st1 {v28.8h}, [x11], %[tow_dst_stride]\n"
    "st1 {v29.8h}, [x10], %[tow_dst_stride]\n"
    "st1 {v30.8h}, [x11], %[tow_dst_stride]\n"
    "st1 {v31.8h}, [x10], %[tow_dst_stride]\n"
    :
    : [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ src_stride ] "r"(src_stride),
      [ dst_stride ] "r"(dst_stride), [ tow_dst_stride ] "r"(2 * dst_stride)
    : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
      "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31");
}
763 
// Transpose a 12x8 tile of fp16 values into an 8x12 tile.
//
// src_ptr: 12 source rows of 8 fp16 values; consecutive rows are
//          src_stride BYTES apart (the asm post-indexes its load pointer
//          by the raw stride operand).
// dst_ptr: 8 destination rows of 12 fp16 values; consecutive rows are
//          dst_stride BYTES apart. The first 8 elements of each output
//          row are stored via x11 and the remaining 4 via x10 = x11 + 16.
void Transpose12x8ARM64Fp16(const float16_t *src_ptr, float16_t *dst_ptr, size_t src_stride, size_t dst_stride) {
#ifdef ENABLE_DEBUG
  // Scalar reference that mirrors the assembly below.
  // Fix: this branch previously indexed src with an undefined identifier
  // `col`, so ENABLE_DEBUG builds did not compile. The strides are byte
  // strides (the asm post-indexes by them), so convert to element counts
  // before indexing.
  size_t src_step = src_stride / sizeof(float16_t);
  size_t dst_step = dst_stride / sizeof(float16_t);
  for (int tr = 0; tr < C12NUM; tr++) {
    for (int tc = 0; tc < C8NUM; tc++) {
      dst_ptr[tc * dst_step + tr] = src_ptr[tr * src_step + tc];
    }
  }
#else
  asm volatile(
    "mov x10, %[src_ptr]\n"
    "mov x11, %[dst_ptr]\n"

    // Load the first 8 of the 12 source rows.
    "ld1 {v0.8h}, [x10], %[src_stride]\n"
    "ld1 {v1.8h}, [x10], %[src_stride]\n"
    "ld1 {v2.8h}, [x10], %[src_stride]\n"
    "ld1 {v3.8h}, [x10], %[src_stride]\n"
    "ld1 {v4.8h}, [x10], %[src_stride]\n"
    "ld1 {v5.8h}, [x10], %[src_stride]\n"
    "ld1 {v6.8h}, [x10], %[src_stride]\n"
    "ld1 {v7.8h}, [x10], %[src_stride]\n"

    // 16-bit interleave of the low halves of rows 0..7.
    "zip1 v16.8h, v0.8h, v1.8h\n"
    "zip1 v17.8h, v2.8h, v3.8h\n"
    "zip1 v18.8h, v4.8h, v5.8h\n"
    "zip1 v19.8h, v6.8h, v7.8h\n"

    // Load the remaining 4 source rows (8..11).
    "ld1 {v8.8h}, [x10], %[src_stride]\n"
    "ld1 {v9.8h}, [x10], %[src_stride]\n"
    "ld1 {v10.8h}, [x10], %[src_stride]\n"
    "ld1 {v11.8h}, [x10], %[src_stride]\n"

    // 32-bit and 64-bit transpose stages: v24..v27 become the first 8
    // columns' worth of the low output rows.
    "trn1 v20.4s, v16.4s, v17.4s\n"
    "trn2 v21.4s, v16.4s, v17.4s\n"
    "trn1 v22.4s, v18.4s, v19.4s\n"
    "trn2 v23.4s, v18.4s, v19.4s\n"

    "trn1 v24.2d, v20.2d, v22.2d\n"
    "trn2 v25.2d, v20.2d, v22.2d\n"
    "trn1 v26.2d, v21.2d, v23.2d\n"
    "trn2 v27.2d, v21.2d, v23.2d\n"

    // Rows 8..11 contribute only 4 elements per output row; the
    // self-TRN duplicates lanes so v28..v31 carry them in their low half.
    "zip1 v16.8h, v8.8h, v9.8h\n"
    "zip1 v17.8h, v10.8h, v11.8h\n"

    "trn1 v20.4s, v16.4s, v17.4s\n"
    "trn2 v21.4s, v16.4s, v17.4s\n"

    "trn1 v28.2d, v20.2d, v20.2d\n"
    "trn2 v29.2d, v20.2d, v20.2d\n"
    "trn1 v30.2d, v21.2d, v21.2d\n"
    "trn2 v31.2d, v21.2d, v21.2d\n"

    // x11 writes elements 0..7 of each output row, x10 = x11 + 16 bytes
    // writes elements 8..11; both advance by dst_stride per row.
    "add x10, x11, #16\n"
    "st1 {v24.8h}, [x11], %[dst_stride]\n"
    "st1 {v28.4h}, [x10], %[dst_stride]\n"
    "st1 {v26.8h}, [x11], %[dst_stride]\n"
    "st1 {v30.4h}, [x10], %[dst_stride]\n"
    "st1 {v25.8h}, [x11], %[dst_stride]\n"
    "st1 {v29.4h}, [x10], %[dst_stride]\n"
    "st1 {v27.8h}, [x11], %[dst_stride]\n"
    "st1 {v31.4h}, [x10], %[dst_stride]\n"

    // Same pipeline for the high halves (output rows 4..7).
    "zip2 v16.8h, v0.8h, v1.8h\n"
    "zip2 v17.8h, v2.8h, v3.8h\n"
    "zip2 v18.8h, v4.8h, v5.8h\n"
    "zip2 v19.8h, v6.8h, v7.8h\n"

    "trn1 v20.4s, v16.4s, v17.4s\n"
    "trn2 v21.4s, v16.4s, v17.4s\n"
    "trn1 v22.4s, v18.4s, v19.4s\n"
    "trn2 v23.4s, v18.4s, v19.4s\n"

    "trn1 v24.2d, v20.2d, v22.2d\n"
    "trn2 v25.2d, v20.2d, v22.2d\n"
    "trn1 v26.2d, v21.2d, v23.2d\n"
    "trn2 v27.2d, v21.2d, v23.2d\n"

    "zip2 v16.8h, v8.8h, v9.8h\n"
    "zip2 v17.8h, v10.8h, v11.8h\n"

    "trn1 v20.4s, v16.4s, v17.4s\n"
    "trn2 v21.4s, v16.4s, v17.4s\n"

    "trn1 v28.2d, v20.2d, v20.2d\n"
    "trn2 v29.2d, v20.2d, v20.2d\n"
    "trn1 v30.2d, v21.2d, v21.2d\n"
    "trn2 v31.2d, v21.2d, v21.2d\n"

    "st1 {v24.8h}, [x11], %[dst_stride]\n"
    "st1 {v28.4h}, [x10], %[dst_stride]\n"
    "st1 {v26.8h}, [x11], %[dst_stride]\n"
    "st1 {v30.4h}, [x10], %[dst_stride]\n"
    "st1 {v25.8h}, [x11], %[dst_stride]\n"
    "st1 {v29.4h}, [x10], %[dst_stride]\n"
    "st1 {v27.8h}, [x11], %[dst_stride]\n"
    "st1 {v31.4h}, [x10], %[dst_stride]\n"
    :
    : [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ src_stride ] "r"(src_stride), [ dst_stride ] "r"(dst_stride)
    : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18",
      "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#endif
}
866 
// Transpose a 16x8 tile of fp16 values into an 8x16 tile.
//
// src_ptr: 16 source rows of 8 fp16 values; consecutive rows are
//          src_stride BYTES apart (the asm post-indexes the load pointer
//          by the raw stride operand).
// dst_ptr: 8 destination rows of 16 fp16 values; consecutive rows are
//          dst_stride BYTES apart. Elements 0..7 of each output row come
//          from source rows 0..7, elements 8..15 from source rows 8..15.
inline void Transpose16x8ARM64Fp16(const float16_t *src_ptr, float16_t *dst_ptr, size_t src_stride, size_t dst_stride) {
  asm volatile(
    "mov x10, %[src_ptr]\n"
    "mov x11, %[dst_ptr]\n"

    // Load source rows 0..7.
    "ld1 {v0.8h}, [x10], %[src_stride]\n"
    "ld1 {v1.8h}, [x10], %[src_stride]\n"
    "ld1 {v2.8h}, [x10], %[src_stride]\n"
    "ld1 {v3.8h}, [x10], %[src_stride]\n"
    "ld1 {v4.8h}, [x10], %[src_stride]\n"
    "ld1 {v5.8h}, [x10], %[src_stride]\n"
    "ld1 {v6.8h}, [x10], %[src_stride]\n"
    "ld1 {v7.8h}, [x10], %[src_stride]\n"

    // 16-bit interleave of the low halves of rows 0..7.
    "zip1 v16.8h, v0.8h, v1.8h\n"
    "zip1 v17.8h, v2.8h, v3.8h\n"
    "zip1 v18.8h, v4.8h, v5.8h\n"
    "zip1 v19.8h, v6.8h, v7.8h\n"

    // Load source rows 8..15.
    "ld1 {v8.8h}, [x10], %[src_stride]\n"
    "ld1 {v9.8h}, [x10], %[src_stride]\n"
    "ld1 {v10.8h}, [x10], %[src_stride]\n"
    "ld1 {v11.8h}, [x10], %[src_stride]\n"
    "ld1 {v12.8h}, [x10], %[src_stride]\n"
    "ld1 {v13.8h}, [x10], %[src_stride]\n"
    "ld1 {v14.8h}, [x10], %[src_stride]\n"
    "ld1 {v15.8h}, [x10], %[src_stride]\n"

    // 32-bit then 64-bit transpose stages for rows 0..7:
    // v24..v27 hold elements 0..7 of the low output rows.
    "trn1 v20.4s, v16.4s, v17.4s\n"
    "trn2 v21.4s, v16.4s, v17.4s\n"
    "trn1 v22.4s, v18.4s, v19.4s\n"
    "trn2 v23.4s, v18.4s, v19.4s\n"

    "trn1 v24.2d, v20.2d, v22.2d\n"
    "trn2 v25.2d, v20.2d, v22.2d\n"
    "trn1 v26.2d, v21.2d, v23.2d\n"
    "trn2 v27.2d, v21.2d, v23.2d\n"

    // Same stages for rows 8..15: v28..v31 hold elements 8..15.
    "zip1 v16.8h, v8.8h, v9.8h\n"
    "zip1 v17.8h, v10.8h, v11.8h\n"
    "zip1 v18.8h, v12.8h, v13.8h\n"
    "zip1 v19.8h, v14.8h, v15.8h\n"

    "trn1 v20.4s, v16.4s, v17.4s\n"
    "trn2 v21.4s, v16.4s, v17.4s\n"
    "trn1 v22.4s, v18.4s, v19.4s\n"
    "trn2 v23.4s, v18.4s, v19.4s\n"

    "trn1 v28.2d, v20.2d, v22.2d\n"
    "trn2 v29.2d, v20.2d, v22.2d\n"
    "trn1 v30.2d, v21.2d, v23.2d\n"
    "trn2 v31.2d, v21.2d, v23.2d\n"

    // x11 writes elements 0..7 of each output row, x10 = x11 + 16 bytes
    // writes elements 8..15; both advance by dst_stride per row.
    "add x10, x11, #16\n"
    "st1 {v24.8h}, [x11], %[dst_stride]\n"
    "st1 {v28.8h}, [x10], %[dst_stride]\n"
    "st1 {v26.8h}, [x11], %[dst_stride]\n"
    "st1 {v30.8h}, [x10], %[dst_stride]\n"
    "st1 {v25.8h}, [x11], %[dst_stride]\n"
    "st1 {v29.8h}, [x10], %[dst_stride]\n"
    "st1 {v27.8h}, [x11], %[dst_stride]\n"
    "st1 {v31.8h}, [x10], %[dst_stride]\n"

    // Repeat the whole pipeline on the high halves (output rows 4..7).
    "zip2 v16.8h, v0.8h, v1.8h\n"
    "zip2 v17.8h, v2.8h, v3.8h\n"
    "zip2 v18.8h, v4.8h, v5.8h\n"
    "zip2 v19.8h, v6.8h, v7.8h\n"

    "trn1 v20.4s, v16.4s, v17.4s\n"
    "trn2 v21.4s, v16.4s, v17.4s\n"
    "trn1 v22.4s, v18.4s, v19.4s\n"
    "trn2 v23.4s, v18.4s, v19.4s\n"

    "trn1 v24.2d, v20.2d, v22.2d\n"
    "trn2 v25.2d, v20.2d, v22.2d\n"
    "trn1 v26.2d, v21.2d, v23.2d\n"
    "trn2 v27.2d, v21.2d, v23.2d\n"

    "zip2 v16.8h, v8.8h, v9.8h\n"
    "zip2 v17.8h, v10.8h, v11.8h\n"
    "zip2 v18.8h, v12.8h, v13.8h\n"
    "zip2 v19.8h, v14.8h, v15.8h\n"

    "trn1 v20.4s, v16.4s, v17.4s\n"
    "trn2 v21.4s, v16.4s, v17.4s\n"
    "trn1 v22.4s, v18.4s, v19.4s\n"
    "trn2 v23.4s, v18.4s, v19.4s\n"

    "trn1 v28.2d, v20.2d, v22.2d\n"
    "trn2 v29.2d, v20.2d, v22.2d\n"
    "trn1 v30.2d, v21.2d, v23.2d\n"
    "trn2 v31.2d, v21.2d, v23.2d\n"

    "st1 {v24.8h}, [x11], %[dst_stride]\n"
    "st1 {v28.8h}, [x10], %[dst_stride]\n"
    "st1 {v26.8h}, [x11], %[dst_stride]\n"
    "st1 {v30.8h}, [x10], %[dst_stride]\n"
    "st1 {v25.8h}, [x11], %[dst_stride]\n"
    "st1 {v29.8h}, [x10], %[dst_stride]\n"
    "st1 {v27.8h}, [x11], %[dst_stride]\n"
    "st1 {v31.8h}, [x10], %[dst_stride]\n"
    :
    : [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ src_stride ] "r"(src_stride), [ dst_stride ] "r"(dst_stride)
    : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
      "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31");
}
974 #endif
975