/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "nnacl/fp16/pack_fp16.h"
#include <string.h>

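/*
 * Packs a depthwise 3x3 kernel from NCHW into NC8HW8 while applying the 1D
 * Winograd F(2,3) weight transform G * g to each kernel row, where
 *   G = [[1, 0, 0], [0.5, 0.5, 0.5], [0.5, -0.5, 0.5], [0, 0, 1]].
 * Each tile of 8 channels therefore stores 3 (rows) * 4 (transformed taps)
 * * 8 (channels) = 96 values, which is where the "* 96" and "32 * y" strides
 * below come from. These layout notes are inferred from the index arithmetic,
 * not from external documentation.
 */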
#ifdef ENABLE_ARM
void PackWeightConvDw3x3Fp16(const void *src, void *dst, int channel) {
  // nchw to nc8hw8 with 1D F(2,3)
  for (int i = 0; i < channel; i++) {
    const float16_t *src_kernel = (const float16_t *)src + i * 9;
    float16_t *dst_kernel = (float16_t *)dst + (i / 8) * 96 + i % 8;
    for (int y = 0; y < 3; y++) {
      float16_t g0 = src_kernel[3 * y];
      float16_t g1 = src_kernel[3 * y + 1];
      float16_t g2 = src_kernel[3 * y + 2];

      dst_kernel[32 * y] = g0;
      dst_kernel[32 * y + 8] = (float16_t)0.5 * (g0 + g1 + g2);
      dst_kernel[32 * y + 16] = (float16_t)0.5 * (g0 - g1 + g2);
      dst_kernel[32 * y + 24] = g2;
    }
  }
}
#endif

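/*
 * im2col for NHWC fp16 input: for each of the real_cal_num output pixels in
 * the current tile (starting at block_index), copies the corresponding
 * kernel_h * kernel_w * in_channel input patch into packed_input, clipping
 * the kernel window against the input borders (kh_s/kh_e, kw_s/kw_e). Border
 * taps that fall outside the input are simply not written, so the caller is
 * presumably expected to zero packed_input beforehand; that assumption comes
 * from the clipping logic here, not from this file.
 */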
void Im2ColPackUnitFp16(const float16_t *input_data, const ConvParameter *conv_param, float16_t *packed_input,
                        int real_cal_num, int block_index) {
  // input format : nhwc
  int kernel_h = conv_param->kernel_h_;
  int kernel_w = conv_param->kernel_w_;
  int kernel_plane = kernel_h * kernel_w;
  int stride_h = conv_param->stride_h_;
  int stride_w = conv_param->stride_w_;
  int pad_h = conv_param->pad_u_;
  int pad_w = conv_param->pad_l_;
  int dilation_h = conv_param->dilation_h_;
  int dilation_w = conv_param->dilation_w_;
  int in_channel = conv_param->input_channel_;
  int in_h = conv_param->input_h_;
  int in_w = conv_param->input_w_;
  int out_w = conv_param->output_w_;

  for (int i = 0; i < real_cal_num; i++) {
    int block_start = block_index + i;
    int input_h = block_start / out_w * stride_h - pad_h;
    int input_w = block_start % out_w * stride_w - pad_w;
    int input_stride = (input_h * in_w + input_w) * in_channel;
    int kh_s = MSMAX(0, UP_DIV(-input_h, dilation_h));
    int kh_e = MSMIN(kernel_h, UP_DIV(in_h - input_h, dilation_h));
    int kw_s = MSMAX(0, UP_DIV(-input_w, dilation_w));
    int kw_e = MSMIN(kernel_w, UP_DIV(in_w - input_w, dilation_w));
    if (dilation_h == 1 && dilation_w == 1) {
      for (int j = kh_s; j < kh_e; j++) {
        int input_y_stride = j * in_w * in_channel + input_stride;
        int input_x_stride = input_y_stride + kw_s * in_channel;
        int input_plane_offset = (j * kernel_w + kw_s) * in_channel + i * in_channel * kernel_plane;
        memcpy(packed_input + input_plane_offset, input_data + input_x_stride,
               (kw_e - kw_s) * in_channel * sizeof(float16_t));
      }  // kernel_h loop
    } else {
      for (int j = kh_s; j < kh_e; j++) {
        int input_y_stride = j * dilation_h * in_w * in_channel + input_stride;
        for (int n = kw_s; n < kw_e; n++) {
          int input_x_stride = input_y_stride + n * dilation_w * in_channel;
          int input_plane_offset = (j * kernel_w + n) * in_channel + i * in_channel * kernel_plane;
          memcpy(packed_input + input_plane_offset, input_data + input_x_stride, in_channel * sizeof(float16_t));
        }  // kernel_w loop
      }  // kernel_h loop
    }
  }  // tile num loop
}

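/* Transposes the two spatial axes of an HWC tensor (HWC -> WHC), moving whole
 * channel vectors with memcpy. */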
void PackHWCToWHCFp16(const float16_t *src, float16_t *dst, int height, int width, int channel) {
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      memcpy(dst + (j * height + i) * channel, src + (i * width + j) * channel, channel * sizeof(float16_t));
    }
  }
}

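/*
 * Repacks an OHWI convolution weight so the input channels are tiled in
 * blocks of C8NUM. Reading off the index arithmetic, the element for
 * (o, k, i) lands at
 *   o * ic8 * kernel_plane * C8NUM + (i / C8NUM) * kernel_plane * C8NUM
 *     + k * C8NUM + (i % C8NUM),
 * i.e. an O-(IC8)-HW-8 layout. Tail positions of a partial channel block are
 * not written, so the destination is presumably zero-initialized by the
 * caller (an assumption, as above).
 */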
void PackWeightToC8Fp16(const float16_t *origin_weight_data, float16_t *packed_weight_data,
                        const ConvParameter *conv_param) {
  // origin weight format : ohwi
  int input_channel = conv_param->input_channel_;
  int ic8 = UP_DIV(input_channel, C8NUM);
  int output_channel = conv_param->output_channel_;
  int kernel_plane = conv_param->kernel_h_ * conv_param->kernel_w_;

  for (int k = 0; k < kernel_plane; k++) {
    int src_kernel_offset = k * input_channel;
    int dst_kernel_offset = k * C8NUM;
    for (int o = 0; o < output_channel; o++) {
      int src_oc_offset = src_kernel_offset + o * kernel_plane * input_channel;
      int dst_oc_offset = dst_kernel_offset + o * ic8 * kernel_plane * C8NUM;
      for (int i = 0; i < input_channel; i++) {
        int c8_block_num = i / C8NUM;
        int c8_block_rem = i % C8NUM;
        int src_ic_offset = src_oc_offset + i;
        int dst_ic_offset = dst_oc_offset + c8_block_num * kernel_plane * C8NUM + c8_block_rem;
        (packed_weight_data + dst_ic_offset)[0] = (origin_weight_data + src_ic_offset)[0];
      }
    }
  }
}

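/* C4 variant of the packing above. Note that ic4 is computed as ic8 * 2
 * rather than UP_DIV(input_channel, C4NUM), so the packed buffer stays padded
 * to a multiple of 8 input channels even though it is tiled by 4. */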
void PackWeightToC4Fp16(const float16_t *origin_weight_data, float16_t *packed_weight_data,
                        const ConvParameter *conv_param) {
  // origin weight format : ohwi
  int input_channel = conv_param->input_channel_;
  int ic8 = UP_DIV(input_channel, C8NUM);
  int ic4 = ic8 * 2;
  int output_channel = conv_param->output_channel_;
  int kernel_plane = conv_param->kernel_h_ * conv_param->kernel_w_;

  for (int k = 0; k < kernel_plane; k++) {
    int src_kernel_offset = k * input_channel;
    int dst_kernel_offset = k * C4NUM;
    for (int o = 0; o < output_channel; o++) {
      int src_oc_offset = src_kernel_offset + o * kernel_plane * input_channel;
      int dst_oc_offset = dst_kernel_offset + o * ic4 * kernel_plane * C4NUM;
      for (int i = 0; i < input_channel; i++) {
        int c4_block_num = i / C4NUM;
        int c4_block_rem = i % C4NUM;
        int src_ic_offset = src_oc_offset + i;
        int dst_ic_offset = dst_oc_offset + c4_block_num * kernel_plane * C4NUM + c4_block_rem;
        (packed_weight_data + dst_ic_offset)[0] = (origin_weight_data + src_ic_offset)[0];
      }
    }
  }
}

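/*
 * NHWC -> NC4HW4: channels are grouped into blocks of C4NUM, each block
 * stored plane-major, i.e. element (b, p, c) goes to
 *   b * plane * c4 * C4NUM + (c / C4NUM) * plane * C4NUM + p * C4NUM
 *     + (c % C4NUM).
 * Tail positions of a partial channel block are not written (assumed
 * pre-zeroed by the caller, as elsewhere in this file).
 */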
void PackNHWCToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int channel) {
  int c4 = UP_DIV(channel, C4NUM);
  for (int b = 0; b < batch; b++) {
    int src_oc_offset = b * plane * channel;
    int dst_oc_offset = b * plane * c4 * C4NUM;
    for (int k = 0; k < plane; k++) {
      int src_kernel_offset = src_oc_offset + k * channel;
      int dst_kernel_offset = dst_oc_offset + k * C4NUM;
      for (int i = 0; i < channel; i++) {
        int c4_block_num = i / C4NUM;
        int c4_block_rem = i % C4NUM;
        int src_ic_offset = src_kernel_offset + i;
        int dst_ic_offset = dst_kernel_offset + c4_block_num * plane * C4NUM + c4_block_rem;
        ((float16_t *)dst + dst_ic_offset)[0] = ((const float16_t *)src + src_ic_offset)[0];
      }
    }
  }
}

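/* Same NC4HW4 destination layout as above, but from an NCHW source, so the
 * inner loop walks a contiguous source plane instead of a channel vector. */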
void PackNCHWToNC4HW4Fp16(const void *src, void *dst, int batch, int plane, int channel) {
  int c4 = UP_DIV(channel, C4NUM);
  for (int b = 0; b < batch; b++) {
    int src_offset = b * plane * channel;
    int dst_offset = b * plane * c4 * C4NUM;
    for (int c = 0; c < channel; c++) {
      int c4_block_num = c / C4NUM;
      int c4_block_rem = c % C4NUM;
      int src_c_offset = src_offset + c * plane;
      int dst_c_offset = dst_offset + c4_block_num * plane * C4NUM;
      for (int k = 0; k < plane; k++) {
        int src_kernel_offset = src_c_offset + k;
        int dst_kernel_offset = dst_c_offset + C4NUM * k + c4_block_rem;
        ((float16_t *)dst + dst_kernel_offset)[0] = ((const float16_t *)src + src_kernel_offset)[0];
      }
    }
  }
}

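/*
 * NHWC -> NCHW transpose, parallelizable over the plane (HW) axis. The plane
 * is processed in tiles of hw_tile rows (16 on ARM64, 8 elsewhere) so that
 * full 8-channel blocks can use the assembly transposes defined below;
 * leftover rows and channels fall back to scalar loops. When thread_count is
 * positive, each task_id handles one contiguous chunk of tiles and the last
 * thread also takes the plane tail.
 */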
void PackNHWCToNCHWFp16(const void *src, void *dst, int batches, int plane, int channel, int task_id,
                        int thread_count) {
#ifdef ENABLE_ARM64
  // Transpose16x8 in arm64
  const int hw_tile = C16NUM;
#else
  // Transpose8x8 in others
  const int hw_tile = C8NUM;
#endif
  int hw_align = plane / hw_tile;
  int task_start = 0;
  int task_end = plane;
  if (thread_count > 0) {
    int offset_hw = UP_DIV(hw_align, thread_count) * hw_tile;
    task_start = offset_hw * task_id;
    int count = plane - task_start;
    if (count <= 0) {
      return;
    }
    task_end = (task_id + 1) == thread_count ? plane : MSMIN(plane, task_start + offset_hw);
    hw_align = task_start + ((task_end - task_start) >= offset_hw ? offset_hw : 0);
  } else {
    hw_align *= hw_tile;
  }
  int c8 = channel / C8NUM * C8NUM;
  int batch = plane * channel;
  for (int n = 0; n < batches; n++) {
    const float16_t *src_batch = (const float16_t *)src + n * batch;
    float16_t *dst_batch = (float16_t *)dst + n * batch;
    int hw = task_start;
    for (; hw < hw_align; hw += hw_tile) {
      int c = 0;
      for (; c < c8; c += C8NUM) {
        const float16_t *src_ptr = src_batch + hw * channel + c;
        float16_t *dst_ptr = dst_batch + c * plane + hw;
#ifdef ENABLE_ARM64
        size_t src_stride = channel * sizeof(float16_t);
        size_t dst_stride = plane * sizeof(float16_t);
        Transpose16x8ARM64Fp16(src_ptr, dst_ptr, src_stride, dst_stride);
#elif defined(ENABLE_ARM82_A32)
        size_t src_stride = channel * sizeof(float16_t);
        size_t dst_stride = plane * sizeof(float16_t);
        Transpose8x8A32Fp16(src_ptr, dst_ptr, src_stride, dst_stride);
#else
        for (int tr = 0; tr < hw_tile; tr++) {
          for (int tc = 0; tc < C8NUM; tc++) {
            dst_ptr[tc * plane + tr] = src_ptr[tr * channel + tc];
          }
        }
#endif
      }
      for (; c < channel; c++) {
        const float16_t *src_ptr = src_batch + hw * channel + c;
        float16_t *dst_ptr = dst_batch + c * plane + hw;
        for (int i = 0; i < hw_tile; i++) {
          dst_ptr[i] = src_ptr[i * channel];
        }
      }
    }
    for (; hw < task_end; hw++) {
      const float16_t *src_ptr = src_batch + hw * channel;
      float16_t *dst_ptr = dst_batch + hw;
      for (int i = 0; i < channel; i++) {
        dst_ptr[i * plane] = src_ptr[i];
      }
    }
  }
}

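/* NCHW -> NHWC is the same plane/channel transpose with the two axes swapped,
 * so it simply forwards to PackNHWCToNCHWFp16. */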
void PackNCHWToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count) {
  return PackNHWCToNCHWFp16(src, dst, batch, channel, plane, task_id, thread_count);
}

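/*
 * NHWC -> NHWC4: pads each pixel's channel vector up to a multiple of C4NUM
 * with explicit zeros, e.g. a 3-channel pixel {a, b, c} becomes {a, b, c, 0}.
 * When the channel count is already a multiple of 4 the two layouts are
 * bitwise identical, so a single memcpy suffices. The C8NUM variant that
 * follows uses the same scheme.
 */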
void PackNHWCToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int channel) {
  int ic4 = UP_DIV(channel, C4NUM);
  int c4_channel = ic4 * C4NUM;
  int nhwc4_batch_unit_offset = ic4 * C4NUM * plane;
  int ic_remainder_ = channel % C4NUM;
  if (ic_remainder_ != 0) {
    int nhwc4_batch_offset = 0;
    for (int b = 0; b < batch; b++) {
      int batch_offset = b * channel * plane;
      for (int i = 0; i < plane; i++) {
        float16_t *dst_per_plane = (float16_t *)dst + nhwc4_batch_offset + i * c4_channel;
        memcpy(dst_per_plane, (const float16_t *)src + batch_offset + i * channel, channel * sizeof(float16_t));
        for (int j = channel; j < c4_channel; ++j) {
          dst_per_plane[j] = 0;
        }
      }
      nhwc4_batch_offset += nhwc4_batch_unit_offset;
    }
  } else {
    size_t ori_input_size = batch * plane * channel * sizeof(float16_t);
    memcpy(dst, src, ori_input_size);
  }
}

void PackNHWCToNHWC8Fp16(const void *src, void *dst, int batch, int plane, int channel) {
  int ic8 = UP_DIV(channel, C8NUM);
  int c8_channel = ic8 * C8NUM;
  int nhwc8_batch_unit_offset = ic8 * C8NUM * plane;
  int ic_remainder_ = channel % C8NUM;
  if (ic_remainder_ != 0) {
    int nhwc8_batch_offset = 0;
    for (int b = 0; b < batch; b++) {
      int batch_offset = b * channel * plane;
      for (int i = 0; i < plane; i++) {
        float16_t *dst_per_plane = (float16_t *)dst + nhwc8_batch_offset + i * c8_channel;
        memcpy(dst_per_plane, (const float16_t *)src + batch_offset + i * channel, channel * sizeof(float16_t));
        for (int j = channel; j < c8_channel; ++j) {
          dst_per_plane[j] = 0;
        }
      }
      nhwc8_batch_offset += nhwc8_batch_unit_offset;
    }
  } else {
    size_t ori_input_size = batch * plane * channel * sizeof(float16_t);
    memcpy(dst, src, ori_input_size);
  }
}

void PackNHWC4ToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel) {
  int c4 = UP_DIV(channel, C4NUM);
  int ic_remainder_ = channel % C4NUM;
  if (ic_remainder_ != 0) {
    int nhwc_batch_unit_offset = channel * plane;
    for (int b = 0; b < batch; b++) {
      int batch_offset = b * c4 * C4NUM * plane;
      for (int i = 0; i < plane; i++) {
        memcpy((float16_t *)dst + b * nhwc_batch_unit_offset + i * channel,
               (const float16_t *)src + batch_offset + i * c4 * C4NUM, channel * sizeof(float16_t));
      }
    }
  } else {
    size_t ori_input_size = batch * plane * channel * sizeof(float16_t);
    memcpy(dst, src, ori_input_size);
  }
}

void PackNCHWToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int channel) {
  int nhwc4_batch_offset = 0;
  int ic4 = UP_DIV(channel, C4NUM);
  int nhwc4_batch_unit_offset = ic4 * C4NUM * plane;

  for (int b = 0; b < batch; b++) {
    int batch_offset = b * channel * plane;
    for (int c = 0; c < channel; c++) {
      int src_c_offset = batch_offset + c * plane;
      int dst_c_offset = nhwc4_batch_offset + c;
      for (int i = 0; i < plane; i++) {
        int src_plane_offset = src_c_offset + i;
        int dst_plane_offset = dst_c_offset + i * ic4 * C4NUM;
        ((float16_t *)dst)[dst_plane_offset] = ((const float16_t *)src)[src_plane_offset];
      }
    }
    nhwc4_batch_offset += nhwc4_batch_unit_offset;
  }
}

void PackNC4HW4ToNHWC4Fp16(const void *src, void *dst, int batch, int plane, int channel) {
  int c4 = UP_DIV(channel, C4NUM);
  for (int b = 0; b < batch; b++) {
    int src_offset = b * plane * c4 * C4NUM;
    int dst_offset = b * plane * channel;
    for (int c = 0; c < channel; c++) {
      int c4_block_num = c / C4NUM;
      int c4_block_res = c % C4NUM;
      int src_c_offset = src_offset + c4_block_num * plane * C4NUM + c4_block_res;
      int dst_c_offset = dst_offset + c4_block_num * C4NUM + c4_block_res;
      for (int k = 0; k < plane; k++) {
        int src_kernel_offset = src_c_offset + k * C4NUM;
        int dst_kernel_offset = dst_c_offset + k * c4 * C4NUM;
        ((float16_t *)dst + dst_kernel_offset)[0] = ((const float16_t *)src + src_kernel_offset)[0];
      }
    }
  }
}

void PackNC4HW4ToNHWCFp16(const void *src, void *dst, int batch, int plane, int channel) {
  int c4 = UP_DIV(channel, C4NUM);
  for (int b = 0; b < batch; b++) {
    int src_offset = b * plane * c4 * C4NUM;
    int dst_offset = b * plane * channel;
    for (int c = 0; c < channel; c++) {
      int c4_block_num = c / C4NUM;
      int c4_block_res = c % C4NUM;
      int src_c_offset = src_offset + c4_block_num * plane * C4NUM + c4_block_res;
      int dst_c_offset = dst_offset + c;
      for (int k = 0; k < plane; k++) {
        int src_kernel_offset = src_c_offset + k * C4NUM;
        int dst_kernel_offset = dst_c_offset + k * channel;
        ((float16_t *)dst + dst_kernel_offset)[0] = ((const float16_t *)src + src_kernel_offset)[0];
      }
    }
  }
}

void PackNC4HW4ToNCHWFp16(const void *src, void *dst, int batch, int plane, int channel) {
  int c4 = UP_DIV(channel, C4NUM);
  for (int b = 0; b < batch; b++) {
    int src_offset = b * plane * c4 * C4NUM;
    int dst_offset = b * plane * channel;
    for (int c = 0; c < channel; c++) {
      int c4_block_num = c / C4NUM;
      int c4_block_res = c % C4NUM;
      int src_c_offset = src_offset + c4_block_num * plane * C4NUM + c4_block_res;
      int dst_c_offset = dst_offset + c * plane;
      for (int k = 0; k < plane; k++) {
        int src_kernel_offset = src_c_offset + k * C4NUM;
        int dst_kernel_offset = dst_c_offset + k;
        ((float16_t *)dst + dst_kernel_offset)[0] = ((const float16_t *)src + src_kernel_offset)[0];
      }
    }
  }
}

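/* NCHW fp32 -> NC8HW8 fp16: retiles channels into blocks of C8NUM and
 * narrows each element from float to float16_t in the same pass. The fp16
 * variant below performs the identical retiling without the conversion. */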
void PackNCHWFp32ToNC8HW8Fp16(const float *src, float16_t *dst, int batch, int plane, int channel) {
  int c8 = UP_DIV(channel, C8NUM);
  for (int b = 0; b < batch; b++) {
    int src_offset = b * plane * channel;
    int dst_offset = b * plane * c8 * C8NUM;
    for (int c = 0; c < channel; c++) {
      int c8_block_num = c / C8NUM;
      int c8_block_rem = c % C8NUM;
      int src_c_offset = src_offset + c * plane;
      int dst_c_offset = dst_offset + c8_block_num * plane * C8NUM;
      for (int k = 0; k < plane; k++) {
        int src_kernel_offset = src_c_offset + k;
        int dst_kernel_offset = dst_c_offset + C8NUM * k + c8_block_rem;
        (dst + dst_kernel_offset)[0] = (float16_t)(src + src_kernel_offset)[0];
      }
    }
  }
}

void PackNCHWFp16ToNC8HW8Fp16(const float16_t *src, float16_t *dst, int batch, int plane, int channel) {
  int c8 = UP_DIV(channel, C8NUM);
  for (int b = 0; b < batch; b++) {
    int src_offset = b * plane * channel;
    int dst_offset = b * plane * c8 * C8NUM;
    for (int c = 0; c < channel; c++) {
      int c8_block_num = c / C8NUM;
      int c8_block_rem = c % C8NUM;
      int src_c_offset = src_offset + c * plane;
      int dst_c_offset = dst_offset + c8_block_num * plane * C8NUM;
      for (int k = 0; k < plane; k++) {
        int src_kernel_offset = src_c_offset + k;
        int dst_kernel_offset = dst_c_offset + C8NUM * k + c8_block_rem;
        (dst + dst_kernel_offset)[0] = (src + src_kernel_offset)[0];
      }
    }
  }
}

#ifdef Debug
void PackNC8HW8ToNHWCFp16(const float16_t *src, float16_t *dst, int batch, int plane, int channel) {
  int block = UP_DIV(channel, C8NUM);
  int last_block_idx = block - 1;
  int last_src_col = channel - last_block_idx * C8NUM;
  for (int i = 0; i < block; i++) {
    int src_col = (i != last_block_idx) ? C8NUM : last_src_col;
    float16_t *dst_cur = dst + i * C8NUM;
    for (int j = 0; j < plane; j++) {
      memcpy(dst_cur, src, src_col * sizeof(float16_t));
      src += src_col;
      dst_cur += channel;
    }
  }
}
#endif

void PackNHWCFp32ToNHWC8Fp16(const float *src, float16_t *dst, int batch, int plane, int channel) {
  int c8_channel = UP_DIV(channel, C8NUM) * C8NUM;
  for (int b = 0; b < batch; b++) {
    float16_t *dst_batch = dst + b * plane * c8_channel;
    const float *src_batch = src + b * plane * channel;
    for (int i = 0; i < plane; i++) {
      float16_t *dst_plane = dst_batch + i * c8_channel;
      const float *src_plane = src_batch + i * channel;
      for (int c = 0; c < channel; c++) {
        dst_plane[c] = (float16_t)(src_plane[c]);
      }
    }
  }
}

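/*
 * NHWC fp32 -> C8HWN8 fp16: the destination index
 *   (c / C8NUM) * batch * plane * C8NUM + hw * batch * C8NUM + n * C8NUM
 *     + (c % C8NUM)
 * makes the batch dimension innermost within each 8-channel block. Judging
 * by the naming alone (not verified here), this is the layout consumed by
 * the fp16 deconvolution/matmul kernels in this package.
 */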
void PackNHWCFp32ToC8HWN8Fp16(const float *src, float16_t *dst, int batch, int plane, int channel) {
  for (int n = 0; n < batch; n++) {
    for (int hw = 0; hw < plane; hw++) {
      for (int c = 0; c < channel; c++) {
        int c8div = c / C8NUM;
        int c8mod = c % C8NUM;
        int src_index = n * plane * channel + hw * channel + c;
        int dst_index = c8div * batch * plane * C8NUM + hw * batch * C8NUM + n * C8NUM + c8mod;
        dst[dst_index] = (float16_t)(src[src_index]);
      }
    }
  }
}

void PackNHWCFp16ToC8HWN8Fp16(const float16_t *src, float16_t *dst, int batch, int plane, int channel) {
  for (int n = 0; n < batch; n++) {
    for (int hw = 0; hw < plane; hw++) {
      for (int c = 0; c < channel; c++) {
        int c8div = c / C8NUM;
        int c8mod = c % C8NUM;
        int src_index = n * plane * channel + hw * channel + c;
        int dst_index = c8div * batch * plane * C8NUM + hw * batch * C8NUM + n * C8NUM + c8mod;
        dst[dst_index] = src[src_index];
      }
    }
  }
}

void PackNHWC8Fp16ToNHWCFp32(const float16_t *src, float *dst, int batch, int plane, int channel) {
  int c8_channel = UP_DIV(channel, C8NUM) * C8NUM;
  for (int b = 0; b < batch; b++) {
    const float16_t *src_batch = src + b * plane * c8_channel;
    float *dst_batch = dst + b * plane * channel;
    for (int i = 0; i < plane; i++) {
      const float16_t *src_plane = src_batch + i * c8_channel;
      float *dst_plane = dst_batch + i * channel;
      for (int c = 0; c < channel; c++) {
        dst_plane[c] = (float)(src_plane[c]);
      }
    }
  }
}

void PackNHWC8ToNHWCFp16(const float16_t *src, float16_t *dst, int batch, int plane, int channel) {
  int c8_channel = UP_DIV(channel, C8NUM) * C8NUM;
  for (int b = 0; b < batch; b++) {
    const float16_t *src_batch = src + b * plane * c8_channel;
    float16_t *dst_batch = dst + b * plane * channel;
    for (int i = 0; i < plane; i++) {
      const float16_t *src_plane = src_batch + i * c8_channel;
      float16_t *dst_plane = dst_batch + i * channel;
      memcpy(dst_plane, src_plane, channel * sizeof(float16_t));
    }
  }
}

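/*
 * The A32 routines below implement 8x8 and 12x8 fp16 block transposes with
 * NEON vtrn/vswp ladders. Both strides are byte strides, matching the
 * post-indexed vld1/vst1 addressing. A scalar equivalent of the 8x8 variant,
 * for reference:
 *   for (int r = 0; r < 8; r++)
 *     for (int c = 0; c < 8; c++)
 *       dst[c * dst_stride / sizeof(float16_t) + r] =
 *         src[r * src_stride / sizeof(float16_t) + c];
 */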
#ifdef ENABLE_ARM82_A32
inline void Transpose8x8A32Fp16(const float16_t *src, float16_t *dst, size_t src_stride, size_t dst_stride) {
  asm volatile(
    "mov r10, %[src]\n"
    "mov r12, %[dst]\n"
    "vld1.16 {q0}, [r10], %[src_stride]\n"
    "vld1.16 {q2}, [r10], %[src_stride]\n"
    "vld1.16 {q4}, [r10], %[src_stride]\n"
    "vld1.16 {q6}, [r10], %[src_stride]\n"

    "vtrn.16 d0, d4\n"
    "vtrn.16 d1, d5\n"
    "vtrn.16 d8, d12\n"
    "vtrn.16 d9, d13\n"

    "vld1.16 {q8}, [r10], %[src_stride]\n"
    "vld1.16 {q10}, [r10], %[src_stride]\n"
    "vld1.16 {q12}, [r10], %[src_stride]\n"
    "vld1.16 {q14}, [r10], %[src_stride]\n"

    "vtrn.32 d0, d8\n"
    "vtrn.32 d4, d12\n"
    "vtrn.32 d1, d9\n"
    "vtrn.32 d5, d13\n"

    "vtrn.16 d16, d20\n"
    "vtrn.16 d17, d21\n"
    "vtrn.16 d24, d28\n"
    "vtrn.16 d25, d29\n"

    "vtrn.32 d16, d24\n"
    "vtrn.32 d20, d28\n"
    "vtrn.32 d17, d25\n"
    "vtrn.32 d21, d29\n"

    "vswp d1, d16\n"
    "vswp d5, d20\n"
    "vswp d9, d24\n"
    "vswp d13, d28\n"

    "vst1.16 {q0}, [r12], %[dst_stride]\n"
    "vst1.16 {q2}, [r12], %[dst_stride]\n"
    "vst1.16 {q4}, [r12], %[dst_stride]\n"
    "vst1.16 {q6}, [r12], %[dst_stride]\n"

    "vst1.16 {q8}, [r12], %[dst_stride]\n"
    "vst1.16 {q10}, [r12], %[dst_stride]\n"
    "vst1.16 {q12}, [r12], %[dst_stride]\n"
    "vst1.16 {q14}, [r12], %[dst_stride]\n"

    :
    : [ dst ] "r"(dst), [ src ] "r"(src), [ src_stride ] "r"(src_stride), [ dst_stride ] "r"(dst_stride)
    : "r10", "r12", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
      "q15");
}

inline void Transpose12x8A32Fp16(const float16_t *src_c, float16_t *dst_c, size_t src_stride, size_t dst_stride) {
  asm volatile(
    "mov r10, %[src_c]\n"
    "mov r12, %[dst_c]\n"

    "vld1.16 {q0}, [r10], %[src_stride]\n"
    "vld1.16 {q2}, [r10], %[src_stride]\n"
    "vld1.16 {q4}, [r10], %[src_stride]\n"
    "vld1.16 {q6}, [r10], %[src_stride]\n"

    "vtrn.16 d0, d4\n"
    "vtrn.16 d1, d5\n"
    "vtrn.16 d8, d12\n"
    "vtrn.16 d9, d13\n"

    "vld1.16 {q8}, [r10], %[src_stride]\n"
    "vld1.16 {q10}, [r10], %[src_stride]\n"
    "vld1.16 {q12}, [r10], %[src_stride]\n"
    "vld1.16 {q14}, [r10], %[src_stride]\n"

    "vtrn.32 d0, d8\n"
    "vtrn.32 d4, d12\n"
    "vtrn.32 d1, d9\n"
    "vtrn.32 d5, d13\n"

    "vtrn.16 d16, d20\n"
    "vtrn.16 d17, d21\n"
    "vtrn.16 d24, d28\n"
    "vtrn.16 d25, d29\n"

    "vld1.16 {q1}, [r10], %[src_stride]\n"
    "vld1.16 {q3}, [r10], %[src_stride]\n"
    "vld1.16 {q5}, [r10], %[src_stride]\n"
    "vld1.16 {q7}, [r10], %[src_stride]\n"

    "vtrn.32 d16, d24\n"
    "vtrn.32 d20, d28\n"
    "vtrn.32 d17, d25\n"
    "vtrn.32 d21, d29\n"

    "vswp d1, d16\n"
    "vswp d5, d20\n"
    "vswp d9, d24\n"
    "vswp d13, d28\n"

    "vtrn.16 d2, d6\n"
    "vtrn.16 d3, d7\n"
    "vtrn.16 d10, d14\n"
    "vtrn.16 d11, d15\n"

    "vtrn.32 d2, d10\n"
    "vtrn.32 d6, d14\n"
    "vtrn.32 d3, d11\n"
    "vtrn.32 d7, d15\n"

    "vst1.16 {q0, d2}, [r12], %[dst_stride]\n"
    "vst1.16 {q2, d6}, [r12], %[dst_stride]\n"
    "vst1.16 {q4, d10}, [r12], %[dst_stride]\n"
    "vst1.16 {q6, d14}, [r12], %[dst_stride]\n"

    "vswp d3, d18\n"
    "vswp d7, d22\n"
    "vswp d11, d26\n"
    "vswp d15, d30\n"

    "vst1.16 {q8, d18}, [r12], %[dst_stride]\n"
    "vst1.16 {q10, d22}, [r12], %[dst_stride]\n"
    "vst1.16 {q12, d26}, [r12], %[dst_stride]\n"
    "vst1.16 {q14, d30}, [r12], %[dst_stride]\n"

    :
    : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ src_stride ] "r"(src_stride), [ dst_stride ] "r"(dst_stride)
    : "r10", "r12", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
      "q15");
}
#endif

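/*
 * The ARM64 transposes use the usual zip1/zip2 (16-bit), trn1/trn2 (32-bit),
 * then trn1/trn2 (64-bit) ladder to transpose 8x8 fp16 sub-blocks entirely in
 * registers; the 12x8 and 16x8 variants run two such passes with interleaved
 * stores. Strides are byte strides, as with the A32 versions above.
 */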
#ifdef ENABLE_ARM64
inline void Transpose4x8ARM64Fp16(const float16_t *src_ptr, float16_t *dst_ptr, size_t src_stride, size_t dst_stride) {
  dst_stride += dst_stride;
  asm volatile(
    "mov x10, %[src_ptr]\n"
    "mov x11, %[dst_ptr]\n"

    "ld1 {v0.8h}, [x10], %[src_stride]\n"
    "ld1 {v1.8h}, [x10], %[src_stride]\n"
    "ld1 {v2.8h}, [x10], %[src_stride]\n"
    "ld1 {v3.8h}, [x10], %[src_stride]\n"
    "add x10, x11, %[dst_stride]\n"

    "zip1 v4.8h, v0.8h, v1.8h\n"
    "zip1 v5.8h, v2.8h, v3.8h\n"

    "trn1 v6.4s, v4.4s, v5.4s\n"
    "trn2 v7.4s, v4.4s, v5.4s\n"

    "trn1 v24.2d, v6.2d, v7.2d\n"
    "trn2 v25.2d, v6.2d, v7.2d\n"

    "zip2 v8.8h, v0.8h, v1.8h\n"
    "zip2 v9.8h, v2.8h, v3.8h\n"

    "trn1 v10.4s, v8.4s, v9.4s\n"
    "trn2 v11.4s, v8.4s, v9.4s\n"

    "trn1 v26.2d, v10.2d, v11.2d\n"
    "trn2 v27.2d, v10.2d, v11.2d\n"

    "st1 {v24.8h}, [x11], %[two_dst_stride]\n"
    "st1 {v25.8h}, [x10], %[two_dst_stride]\n"
    "st1 {v26.8h}, [x11], %[two_dst_stride]\n"
    "st1 {v27.8h}, [x10], %[two_dst_stride]\n"
    :
    : [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ src_stride ] "r"(src_stride),
      [ dst_stride ] "r"(dst_stride), [ two_dst_stride ] "r"(2 * dst_stride)
    : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v24", "v25", "v26",
      "v27");
}

inline void Transpose8x8ARM64Fp16(const float16_t *src_ptr, float16_t *dst_ptr, size_t src_stride, size_t dst_stride) {
  asm volatile(
    "mov x10, %[src_ptr]\n"
    "mov x11, %[dst_ptr]\n"

    "ld1 {v0.8h}, [x10], %[src_stride]\n"
    "ld1 {v1.8h}, [x10], %[src_stride]\n"
    "ld1 {v2.8h}, [x10], %[src_stride]\n"
    "ld1 {v3.8h}, [x10], %[src_stride]\n"
    "ld1 {v4.8h}, [x10], %[src_stride]\n"
    "ld1 {v5.8h}, [x10], %[src_stride]\n"
    "ld1 {v6.8h}, [x10], %[src_stride]\n"
    "ld1 {v7.8h}, [x10], %[src_stride]\n"
    "add x10, x11, %[dst_stride]\n"

    "zip1 v16.8h, v0.8h, v1.8h\n"
    "zip1 v17.8h, v2.8h, v3.8h\n"
    "zip1 v18.8h, v4.8h, v5.8h\n"
    "zip1 v19.8h, v6.8h, v7.8h\n"

    "trn1 v20.4s, v16.4s, v17.4s\n"
    "trn2 v21.4s, v16.4s, v17.4s\n"
    "trn1 v22.4s, v18.4s, v19.4s\n"
    "trn2 v23.4s, v18.4s, v19.4s\n"

    "trn1 v24.2d, v20.2d, v22.2d\n"
    "trn2 v26.2d, v20.2d, v22.2d\n"
    "trn1 v25.2d, v21.2d, v23.2d\n"
    "trn2 v27.2d, v21.2d, v23.2d\n"

    "zip2 v8.8h, v0.8h, v1.8h\n"
    "zip2 v9.8h, v2.8h, v3.8h\n"
    "zip2 v10.8h, v4.8h, v5.8h\n"
    "zip2 v11.8h, v6.8h, v7.8h\n"

    "trn1 v12.4s, v8.4s, v9.4s\n"
    "trn2 v13.4s, v8.4s, v9.4s\n"
    "trn1 v14.4s, v10.4s, v11.4s\n"
    "trn2 v15.4s, v10.4s, v11.4s\n"

    "trn1 v28.2d, v12.2d, v14.2d\n"
    "trn2 v30.2d, v12.2d, v14.2d\n"
    "trn1 v29.2d, v13.2d, v15.2d\n"
    "trn2 v31.2d, v13.2d, v15.2d\n"

    "st1 {v24.8h}, [x11], %[two_dst_stride]\n"
    "st1 {v25.8h}, [x10], %[two_dst_stride]\n"
    "st1 {v26.8h}, [x11], %[two_dst_stride]\n"
    "st1 {v27.8h}, [x10], %[two_dst_stride]\n"
    "st1 {v28.8h}, [x11], %[two_dst_stride]\n"
    "st1 {v29.8h}, [x10], %[two_dst_stride]\n"
    "st1 {v30.8h}, [x11], %[two_dst_stride]\n"
    "st1 {v31.8h}, [x10], %[two_dst_stride]\n"
    :
    : [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ src_stride ] "r"(src_stride),
      [ dst_stride ] "r"(dst_stride), [ two_dst_stride ] "r"(2 * dst_stride)
    : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
      "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31");
}

void Transpose12x8ARM64Fp16(const float16_t *src_ptr, float16_t *dst_ptr, size_t src_stride, size_t dst_stride) {
#ifdef ENABLE_DEBUG
  // Scalar reference path. src_stride is a byte stride (it feeds the
  // post-indexed ld1 in the asm below), so convert it to an element count;
  // the destination row pitch of C12NUM assumes the 12-column packed output
  // used by the packed-matrix callers.
  size_t src_col = src_stride / sizeof(float16_t);
  for (int tr = 0; tr < C12NUM; tr++) {
    for (int tc = 0; tc < C8NUM; tc++) {
      dst_ptr[tc * C12NUM + tr] = src_ptr[tr * src_col + tc];
    }
  }
#else
  asm volatile(
    "mov x10, %[src_ptr]\n"
    "mov x11, %[dst_ptr]\n"

    "ld1 {v0.8h}, [x10], %[src_stride]\n"
    "ld1 {v1.8h}, [x10], %[src_stride]\n"
    "ld1 {v2.8h}, [x10], %[src_stride]\n"
    "ld1 {v3.8h}, [x10], %[src_stride]\n"
    "ld1 {v4.8h}, [x10], %[src_stride]\n"
    "ld1 {v5.8h}, [x10], %[src_stride]\n"
    "ld1 {v6.8h}, [x10], %[src_stride]\n"
    "ld1 {v7.8h}, [x10], %[src_stride]\n"

    "zip1 v16.8h, v0.8h, v1.8h\n"
    "zip1 v17.8h, v2.8h, v3.8h\n"
    "zip1 v18.8h, v4.8h, v5.8h\n"
    "zip1 v19.8h, v6.8h, v7.8h\n"

    "ld1 {v8.8h}, [x10], %[src_stride]\n"
    "ld1 {v9.8h}, [x10], %[src_stride]\n"
    "ld1 {v10.8h}, [x10], %[src_stride]\n"
    "ld1 {v11.8h}, [x10], %[src_stride]\n"

    "trn1 v20.4s, v16.4s, v17.4s\n"
    "trn2 v21.4s, v16.4s, v17.4s\n"
    "trn1 v22.4s, v18.4s, v19.4s\n"
    "trn2 v23.4s, v18.4s, v19.4s\n"

    "trn1 v24.2d, v20.2d, v22.2d\n"
    "trn2 v25.2d, v20.2d, v22.2d\n"
    "trn1 v26.2d, v21.2d, v23.2d\n"
    "trn2 v27.2d, v21.2d, v23.2d\n"

    "zip1 v16.8h, v8.8h, v9.8h\n"
    "zip1 v17.8h, v10.8h, v11.8h\n"

    "trn1 v20.4s, v16.4s, v17.4s\n"
    "trn2 v21.4s, v16.4s, v17.4s\n"

    "trn1 v28.2d, v20.2d, v20.2d\n"
    "trn2 v29.2d, v20.2d, v20.2d\n"
    "trn1 v30.2d, v21.2d, v21.2d\n"
    "trn2 v31.2d, v21.2d, v21.2d\n"

    "add x10, x11, #16\n"
    "st1 {v24.8h}, [x11], %[dst_stride]\n"
    "st1 {v28.4h}, [x10], %[dst_stride]\n"
    "st1 {v26.8h}, [x11], %[dst_stride]\n"
    "st1 {v30.4h}, [x10], %[dst_stride]\n"
    "st1 {v25.8h}, [x11], %[dst_stride]\n"
    "st1 {v29.4h}, [x10], %[dst_stride]\n"
    "st1 {v27.8h}, [x11], %[dst_stride]\n"
    "st1 {v31.4h}, [x10], %[dst_stride]\n"

    "zip2 v16.8h, v0.8h, v1.8h\n"
    "zip2 v17.8h, v2.8h, v3.8h\n"
    "zip2 v18.8h, v4.8h, v5.8h\n"
    "zip2 v19.8h, v6.8h, v7.8h\n"

    "trn1 v20.4s, v16.4s, v17.4s\n"
    "trn2 v21.4s, v16.4s, v17.4s\n"
    "trn1 v22.4s, v18.4s, v19.4s\n"
    "trn2 v23.4s, v18.4s, v19.4s\n"

    "trn1 v24.2d, v20.2d, v22.2d\n"
    "trn2 v25.2d, v20.2d, v22.2d\n"
    "trn1 v26.2d, v21.2d, v23.2d\n"
    "trn2 v27.2d, v21.2d, v23.2d\n"

    "zip2 v16.8h, v8.8h, v9.8h\n"
    "zip2 v17.8h, v10.8h, v11.8h\n"

    "trn1 v20.4s, v16.4s, v17.4s\n"
    "trn2 v21.4s, v16.4s, v17.4s\n"

    "trn1 v28.2d, v20.2d, v20.2d\n"
    "trn2 v29.2d, v20.2d, v20.2d\n"
    "trn1 v30.2d, v21.2d, v21.2d\n"
    "trn2 v31.2d, v21.2d, v21.2d\n"

    "st1 {v24.8h}, [x11], %[dst_stride]\n"
    "st1 {v28.4h}, [x10], %[dst_stride]\n"
    "st1 {v26.8h}, [x11], %[dst_stride]\n"
    "st1 {v30.4h}, [x10], %[dst_stride]\n"
    "st1 {v25.8h}, [x11], %[dst_stride]\n"
    "st1 {v29.4h}, [x10], %[dst_stride]\n"
    "st1 {v27.8h}, [x11], %[dst_stride]\n"
    "st1 {v31.4h}, [x10], %[dst_stride]\n"
    :
    : [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ src_stride ] "r"(src_stride),
      [ dst_stride ] "r"(dst_stride)
    : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18",
      "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#endif
}

inline void Transpose16x8ARM64Fp16(const float16_t *src_ptr, float16_t *dst_ptr, size_t src_stride,
                                   size_t dst_stride) {
  asm volatile(
    "mov x10, %[src_ptr]\n"
    "mov x11, %[dst_ptr]\n"

    "ld1 {v0.8h}, [x10], %[src_stride]\n"
    "ld1 {v1.8h}, [x10], %[src_stride]\n"
    "ld1 {v2.8h}, [x10], %[src_stride]\n"
    "ld1 {v3.8h}, [x10], %[src_stride]\n"
    "ld1 {v4.8h}, [x10], %[src_stride]\n"
    "ld1 {v5.8h}, [x10], %[src_stride]\n"
    "ld1 {v6.8h}, [x10], %[src_stride]\n"
    "ld1 {v7.8h}, [x10], %[src_stride]\n"

    "zip1 v16.8h, v0.8h, v1.8h\n"
    "zip1 v17.8h, v2.8h, v3.8h\n"
    "zip1 v18.8h, v4.8h, v5.8h\n"
    "zip1 v19.8h, v6.8h, v7.8h\n"

    "ld1 {v8.8h}, [x10], %[src_stride]\n"
    "ld1 {v9.8h}, [x10], %[src_stride]\n"
    "ld1 {v10.8h}, [x10], %[src_stride]\n"
    "ld1 {v11.8h}, [x10], %[src_stride]\n"
    "ld1 {v12.8h}, [x10], %[src_stride]\n"
    "ld1 {v13.8h}, [x10], %[src_stride]\n"
    "ld1 {v14.8h}, [x10], %[src_stride]\n"
    "ld1 {v15.8h}, [x10], %[src_stride]\n"

    "trn1 v20.4s, v16.4s, v17.4s\n"
    "trn2 v21.4s, v16.4s, v17.4s\n"
    "trn1 v22.4s, v18.4s, v19.4s\n"
    "trn2 v23.4s, v18.4s, v19.4s\n"

    "trn1 v24.2d, v20.2d, v22.2d\n"
    "trn2 v25.2d, v20.2d, v22.2d\n"
    "trn1 v26.2d, v21.2d, v23.2d\n"
    "trn2 v27.2d, v21.2d, v23.2d\n"

    "zip1 v16.8h, v8.8h, v9.8h\n"
    "zip1 v17.8h, v10.8h, v11.8h\n"
    "zip1 v18.8h, v12.8h, v13.8h\n"
    "zip1 v19.8h, v14.8h, v15.8h\n"

    "trn1 v20.4s, v16.4s, v17.4s\n"
    "trn2 v21.4s, v16.4s, v17.4s\n"
    "trn1 v22.4s, v18.4s, v19.4s\n"
    "trn2 v23.4s, v18.4s, v19.4s\n"

    "trn1 v28.2d, v20.2d, v22.2d\n"
    "trn2 v29.2d, v20.2d, v22.2d\n"
    "trn1 v30.2d, v21.2d, v23.2d\n"
    "trn2 v31.2d, v21.2d, v23.2d\n"

    "add x10, x11, #16\n"
    "st1 {v24.8h}, [x11], %[dst_stride]\n"
    "st1 {v28.8h}, [x10], %[dst_stride]\n"
    "st1 {v26.8h}, [x11], %[dst_stride]\n"
    "st1 {v30.8h}, [x10], %[dst_stride]\n"
    "st1 {v25.8h}, [x11], %[dst_stride]\n"
    "st1 {v29.8h}, [x10], %[dst_stride]\n"
    "st1 {v27.8h}, [x11], %[dst_stride]\n"
    "st1 {v31.8h}, [x10], %[dst_stride]\n"

    "zip2 v16.8h, v0.8h, v1.8h\n"
    "zip2 v17.8h, v2.8h, v3.8h\n"
    "zip2 v18.8h, v4.8h, v5.8h\n"
    "zip2 v19.8h, v6.8h, v7.8h\n"

    "trn1 v20.4s, v16.4s, v17.4s\n"
    "trn2 v21.4s, v16.4s, v17.4s\n"
    "trn1 v22.4s, v18.4s, v19.4s\n"
    "trn2 v23.4s, v18.4s, v19.4s\n"

    "trn1 v24.2d, v20.2d, v22.2d\n"
    "trn2 v25.2d, v20.2d, v22.2d\n"
    "trn1 v26.2d, v21.2d, v23.2d\n"
    "trn2 v27.2d, v21.2d, v23.2d\n"

    "zip2 v16.8h, v8.8h, v9.8h\n"
    "zip2 v17.8h, v10.8h, v11.8h\n"
    "zip2 v18.8h, v12.8h, v13.8h\n"
    "zip2 v19.8h, v14.8h, v15.8h\n"

    "trn1 v20.4s, v16.4s, v17.4s\n"
    "trn2 v21.4s, v16.4s, v17.4s\n"
    "trn1 v22.4s, v18.4s, v19.4s\n"
    "trn2 v23.4s, v18.4s, v19.4s\n"

    "trn1 v28.2d, v20.2d, v22.2d\n"
    "trn2 v29.2d, v20.2d, v22.2d\n"
    "trn1 v30.2d, v21.2d, v23.2d\n"
    "trn2 v31.2d, v21.2d, v23.2d\n"

    "st1 {v24.8h}, [x11], %[dst_stride]\n"
    "st1 {v28.8h}, [x10], %[dst_stride]\n"
    "st1 {v26.8h}, [x11], %[dst_stride]\n"
    "st1 {v30.8h}, [x10], %[dst_stride]\n"
    "st1 {v25.8h}, [x11], %[dst_stride]\n"
    "st1 {v29.8h}, [x10], %[dst_stride]\n"
    "st1 {v27.8h}, [x11], %[dst_stride]\n"
    "st1 {v31.8h}, [x10], %[dst_stride]\n"
    :
    : [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ src_stride ] "r"(src_stride),
      [ dst_stride ] "r"(dst_stride)
    : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
      "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31");
}
#endif