/*
 * Copyright 2024 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <arm_neon.h>
#include <cstring>

#include "ultrahdr/editorhelper.h"

namespace ultrahdr {

// Reverse the order of all elements in a 128-bit vector: vrev64q_* reverses
// the elements within each 64-bit half, then vextq_* swaps the two halves.
#define vrev128q_u8(src, dst) \
  dst = vrev64q_u8(src);      \
  dst = vextq_u8(dst, dst, 8);

#define vrev128q_u16(src, dst) \
  dst = vrev64q_u16(src);      \
  dst = vextq_u16(dst, dst, 4);

#define vrev128q_u32(src, dst) \
  dst = vrev64q_u32(src);      \
  dst = vextq_u32(dst, dst, 2);

#define vrev128q_u64(a) a = vextq_u64(a, a, 1)

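// Horizontal mirror (flip left-right): each row is reversed using the widest
// available NEON block, with a scalar loop for the leftover elements. The
// uint16/uint32/uint64 variants below follow the same pattern with
// correspondingly narrower block widths.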
static void mirror_buffer_horizontal_neon_uint8_t(uint8_t* src_buffer, uint8_t* dst_buffer,
                                                  int src_w, int src_h, int src_stride,
                                                  int dst_stride) {
  uint8_t* src_row = src_buffer;
  uint8_t* dst_row = dst_buffer;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row += dst_stride) {
    uint8_t* src_blk = src_row + src_w;
    uint8_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 64 <= src_w; src_blk -= 64, dst_blk += 64, j += 64) {
      uint8x16x4_t s0 = vld1q_u8_x4(src_blk - 64);
      uint8x16x4_t d0;
      vrev128q_u8(s0.val[0], d0.val[3]);
      vrev128q_u8(s0.val[1], d0.val[2]);
      vrev128q_u8(s0.val[2], d0.val[1]);
      vrev128q_u8(s0.val[3], d0.val[0]);
      vst1q_u8_x4(dst_blk, d0);
    }

    for (; j + 32 <= src_w; src_blk -= 32, dst_blk += 32, j += 32) {
      uint8x16x2_t s0 = vld1q_u8_x2(src_blk - 32);
      uint8x16x2_t d0;
      vrev128q_u8(s0.val[0], d0.val[1]);
      vrev128q_u8(s0.val[1], d0.val[0]);
      vst1q_u8_x2(dst_blk, d0);
    }

    for (; j + 16 <= src_w; src_blk -= 16, dst_blk += 16, j += 16) {
      uint8x16_t s0 = vld1q_u8(src_blk - 16);
      vrev128q_u8(s0, s0);
      vst1q_u8(dst_blk, s0);
    }

    for (; j + 8 <= src_w; src_blk -= 8, dst_blk += 8, j += 8) {
      uint8x8_t s0 = vld1_u8(src_blk - 8);
      s0 = vrev64_u8(s0);
      vst1_u8(dst_blk, s0);
    }

    for (int k = 0; k < src_w - j; k++) {
      dst_blk[k] = src_row[src_w - j - k - 1];
    }
  }
}

static void mirror_buffer_horizontal_neon_uint16_t(uint16_t* src_buffer, uint16_t* dst_buffer,
                                                   int src_w, int src_h, int src_stride,
                                                   int dst_stride) {
  uint16_t* src_row = src_buffer;
  uint16_t* dst_row = dst_buffer;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row += dst_stride) {
    uint16_t* src_blk = src_row + src_w;
    uint16_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 32 <= src_w; src_blk -= 32, dst_blk += 32, j += 32) {
      uint16x8x4_t s0 = vld1q_u16_x4(src_blk - 32);
      uint16x8x4_t d0;
      vrev128q_u16(s0.val[0], d0.val[3]);
      vrev128q_u16(s0.val[1], d0.val[2]);
      vrev128q_u16(s0.val[2], d0.val[1]);
      vrev128q_u16(s0.val[3], d0.val[0]);
      vst1q_u16_x4(dst_blk, d0);
    }

    for (; j + 16 <= src_w; src_blk -= 16, dst_blk += 16, j += 16) {
      uint16x8x2_t s0 = vld1q_u16_x2(src_blk - 16);
      uint16x8x2_t d0;
      vrev128q_u16(s0.val[0], d0.val[1]);
      vrev128q_u16(s0.val[1], d0.val[0]);
      vst1q_u16_x2(dst_blk, d0);
    }

    for (; j + 8 <= src_w; src_blk -= 8, dst_blk += 8, j += 8) {
      uint16x8_t s0 = vld1q_u16(src_blk - 8);
      vrev128q_u16(s0, s0);
      vst1q_u16(dst_blk, s0);
    }

    for (int k = 0; k < src_w - j; k++) {
      dst_blk[k] = src_row[src_w - j - k - 1];
    }
  }
}

static void mirror_buffer_horizontal_neon_uint32_t(uint32_t* src_buffer, uint32_t* dst_buffer,
                                                   int src_w, int src_h, int src_stride,
                                                   int dst_stride) {
  uint32_t* src_row = src_buffer;
  uint32_t* dst_row = dst_buffer;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row += dst_stride) {
    uint32_t* src_blk = src_row + src_w;
    uint32_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 16 <= src_w; src_blk -= 16, dst_blk += 16, j += 16) {
      uint32x4x4_t s0 = vld1q_u32_x4(src_blk - 16);
      uint32x4x4_t d0;
      vrev128q_u32(s0.val[0], d0.val[3]);
      vrev128q_u32(s0.val[1], d0.val[2]);
      vrev128q_u32(s0.val[2], d0.val[1]);
      vrev128q_u32(s0.val[3], d0.val[0]);
      vst1q_u32_x4(dst_blk, d0);
    }

    for (; j + 8 <= src_w; src_blk -= 8, dst_blk += 8, j += 8) {
      uint32x4x2_t s0 = vld1q_u32_x2(src_blk - 8);
      uint32x4x2_t d0;
      vrev128q_u32(s0.val[0], d0.val[1]);
      vrev128q_u32(s0.val[1], d0.val[0]);
      vst1q_u32_x2(dst_blk, d0);
    }

    for (; j + 4 <= src_w; src_blk -= 4, dst_blk += 4, j += 4) {
      uint32x4_t s0 = vld1q_u32(src_blk - 4);
      vrev128q_u32(s0, s0);
      vst1q_u32(dst_blk, s0);
    }

    for (int k = 0; k < src_w - j; k++) {
      dst_blk[k] = src_row[src_w - j - k - 1];
    }
  }
}

static void mirror_buffer_horizontal_neon_uint64_t(uint64_t* src_buffer, uint64_t* dst_buffer,
                                                   int src_w, int src_h, int src_stride,
                                                   int dst_stride) {
  uint64_t* src_row = src_buffer;
  uint64_t* dst_row = dst_buffer;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row += dst_stride) {
    uint64_t* src_blk = src_row + src_w;
    uint64_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 2 <= src_w; src_blk -= 2, dst_blk += 2, j += 2) {
      uint64x2_t s0 = vld1q_u64(src_blk - 2);
      vrev128q_u64(s0);
      vst1q_u64(dst_blk, s0);
    }
    for (int k = 0; k < src_w - j; k++) {
      dst_blk[k] = src_row[src_w - j - k - 1];
    }
  }
}

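// Vertical mirror (flip top-bottom): rows are copied unchanged, writing the
// destination from its last row upwards.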
static void mirror_buffer_vertical_neon_uint8_t(uint8_t* src_buffer, uint8_t* dst_buffer, int src_w,
                                                int src_h, int src_stride, int dst_stride) {
  uint8_t* src_row = src_buffer;
  uint8_t* dst_row = dst_buffer + (src_h - 1) * dst_stride;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row -= dst_stride) {
    uint8_t* src_blk = src_row;
    uint8_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 64 <= src_w; src_blk += 64, dst_blk += 64, j += 64) {
      uint8x16x4_t s0 = vld1q_u8_x4(src_blk);
      vst1q_u8_x4(dst_blk, s0);
    }

    for (; j + 32 <= src_w; src_blk += 32, dst_blk += 32, j += 32) {
      uint8x16x2_t s0 = vld1q_u8_x2(src_blk);
      vst1q_u8_x2(dst_blk, s0);
    }

    for (; j + 16 <= src_w; src_blk += 16, dst_blk += 16, j += 16) {
      uint8x16_t s0 = vld1q_u8(src_blk);
      vst1q_u8(dst_blk, s0);
    }

    for (; j + 8 <= src_w; src_blk += 8, dst_blk += 8, j += 8) {
      uint8x8_t s0 = vld1_u8(src_blk);
      vst1_u8(dst_blk, s0);
    }

    if (j < src_w) memcpy(dst_blk, src_blk, src_w - j);
  }
}

static void mirror_buffer_vertical_neon_uint16_t(uint16_t* src_buffer, uint16_t* dst_buffer,
                                                 int src_w, int src_h, int src_stride,
                                                 int dst_stride) {
  uint16_t* src_row = src_buffer;
  uint16_t* dst_row = dst_buffer + (src_h - 1) * dst_stride;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row -= dst_stride) {
    uint16_t* src_blk = src_row;
    uint16_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 32 <= src_w; src_blk += 32, dst_blk += 32, j += 32) {
      uint16x8x4_t s0 = vld1q_u16_x4(src_blk);
      vst1q_u16_x4(dst_blk, s0);
    }

    for (; j + 16 <= src_w; src_blk += 16, dst_blk += 16, j += 16) {
      uint16x8x2_t s0 = vld1q_u16_x2(src_blk);
      vst1q_u16_x2(dst_blk, s0);
    }

    for (; j + 8 <= src_w; src_blk += 8, dst_blk += 8, j += 8) {
      uint16x8_t s0 = vld1q_u16(src_blk);
      vst1q_u16(dst_blk, s0);
    }

    if (j < src_w) memcpy(dst_blk, src_blk, (src_w - j) * sizeof(uint16_t));
  }
}

static void mirror_buffer_vertical_neon_uint32_t(uint32_t* src_buffer, uint32_t* dst_buffer,
                                                 int src_w, int src_h, int src_stride,
                                                 int dst_stride) {
  uint32_t* src_row = src_buffer;
  uint32_t* dst_row = dst_buffer + (src_h - 1) * dst_stride;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row -= dst_stride) {
    uint32_t* src_blk = src_row;
    uint32_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 16 <= src_w; src_blk += 16, dst_blk += 16, j += 16) {
      uint32x4x4_t s0 = vld1q_u32_x4(src_blk);
      vst1q_u32_x4(dst_blk, s0);
    }

    for (; j + 8 <= src_w; src_blk += 8, dst_blk += 8, j += 8) {
      uint32x4x2_t s0 = vld1q_u32_x2(src_blk);
      vst1q_u32_x2(dst_blk, s0);
    }

    for (; j + 4 <= src_w; src_blk += 4, dst_blk += 4, j += 4) {
      uint32x4_t s0 = vld1q_u32(src_blk);
      vst1q_u32(dst_blk, s0);
    }

    if (j < src_w) memcpy(dst_blk, src_blk, (src_w - j) * sizeof(uint32_t));
  }
}

static void mirror_buffer_vertical_neon_uint64_t(uint64_t* src_buffer, uint64_t* dst_buffer,
                                                 int src_w, int src_h, int src_stride,
                                                 int dst_stride) {
  uint64_t* src_row = src_buffer;
  uint64_t* dst_row = dst_buffer + (src_h - 1) * dst_stride;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row -= dst_stride) {
    uint64_t* src_blk = src_row;
    uint64_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 2 <= src_w; src_blk += 2, dst_blk += 2, j += 2) {
      uint64x2_t s0 = vld1q_u64(src_blk);
      vst1q_u64(dst_blk, s0);
    }

    if (j < src_w) memcpy(dst_blk, src_blk, (src_w - j) * sizeof(uint64_t));
  }
}

static INLINE void transpose_u8_8x8(uint8x8_t* a0, uint8x8_t* a1, uint8x8_t* a2, uint8x8_t* a3,
                                    uint8x8_t* a4, uint8x8_t* a5, uint8x8_t* a6, uint8x8_t* a7) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16  40 50 42 52 44 54 46 56
  // b0.val[1]: 01 11 03 13 05 15 07 17  41 51 43 53 45 55 47 57
  // b1.val[0]: 20 30 22 32 24 34 26 36  60 70 62 72 64 74 66 76
  // b1.val[1]: 21 31 23 33 25 35 27 37  61 71 63 73 65 75 67 77

  const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
  const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34  40 50 60 70 44 54 64 74
  // c0.val[1]: 02 12 22 32 06 16 26 36  42 52 62 72 46 56 66 76
  // c1.val[0]: 01 11 21 31 05 15 25 35  41 51 61 71 45 55 65 75
  // c1.val[1]: 03 13 23 33 07 17 27 37  43 53 63 73 47 57 67 77

  const uint16x8x2_t c0 =
      vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]), vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 =
      vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]), vreinterpretq_u16_u8(b1.val[1]));

  // Unzip 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  // d0.val[1]: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  // d1.val[0]: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  // d1.val[1]: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  const uint32x4x2_t d0 =
      vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]), vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 =
      vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]), vreinterpretq_u32_u16(c1.val[1]));

  *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
  *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
  *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
  *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
  *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
  *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
  *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
  *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}

static INLINE void reverse_uint8x8_regs(uint8x8_t* a0, uint8x8_t* a1, uint8x8_t* a2, uint8x8_t* a3,
                                        uint8x8_t* a4, uint8x8_t* a5, uint8x8_t* a6,
                                        uint8x8_t* a7) {
  *a0 = vrev64_u8(*a0);
  *a1 = vrev64_u8(*a1);
  *a2 = vrev64_u8(*a2);
  *a3 = vrev64_u8(*a3);
  *a4 = vrev64_u8(*a4);
  *a5 = vrev64_u8(*a5);
  *a6 = vrev64_u8(*a6);
  *a7 = vrev64_u8(*a7);
}

static INLINE uint16x8x2_t vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
  uint16x8x2_t b0;

#if (defined(__arm64__) && defined(__APPLE__)) || defined(__aarch64__)
  b0.val[0] =
      vreinterpretq_u16_u64(vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
  b0.val[1] =
      vreinterpretq_u16_u64(vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
#else
  b0.val[0] =
      vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)), vreinterpret_u16_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                           vreinterpret_u16_u32(vget_high_u32(a1)));
#endif
  return b0;
}

static INLINE void transpose_u16_8x8(uint16x8_t* a0, uint16x8_t* a1, uint16x8_t* a2, uint16x8_t* a3,
                                     uint16x8_t* a4, uint16x8_t* a5, uint16x8_t* a6,
                                     uint16x8_t* a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77
  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
  const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
  const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77
  const uint32x4x2_t c0 =
      vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]), vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 =
      vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]), vreinterpretq_u32_u16(b1.val[1]));
  const uint32x4x2_t c2 =
      vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]), vreinterpretq_u32_u16(b3.val[0]));
  const uint32x4x2_t c3 =
      vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]), vreinterpretq_u32_u16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77
  const uint16x8x2_t d0 = vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
  const uint16x8x2_t d1 = vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
  const uint16x8x2_t d2 = vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
  const uint16x8x2_t d3 = vtrnq_u64_to_u16(c1.val[1], c3.val[1]);

  *a0 = d0.val[0];
  *a1 = d1.val[0];
  *a2 = d2.val[0];
  *a3 = d3.val[0];
  *a4 = d0.val[1];
  *a5 = d1.val[1];
  *a6 = d2.val[1];
  *a7 = d3.val[1];
}

static INLINE void reverse_uint16x8_regs(uint16x8_t* a0, uint16x8_t* a1, uint16x8_t* a2,
                                         uint16x8_t* a3, uint16x8_t* a4, uint16x8_t* a5,
                                         uint16x8_t* a6, uint16x8_t* a7) {
  vrev128q_u16(*a0, *a0);
  vrev128q_u16(*a1, *a1);
  vrev128q_u16(*a2, *a2);
  vrev128q_u16(*a3, *a3);
  vrev128q_u16(*a4, *a4);
  vrev128q_u16(*a5, *a5);
  vrev128q_u16(*a6, *a6);
  vrev128q_u16(*a7, *a7);
}

static INLINE uint32x4x2_t vtrnq_u64_to_u32(uint32x4_t a0, uint32x4_t a1) {
  uint32x4x2_t b0;
#if (defined(__arm64__) && defined(__APPLE__)) || defined(__aarch64__)
  b0.val[0] =
      vreinterpretq_u32_u64(vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
  b0.val[1] =
      vreinterpretq_u32_u64(vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
#else
  b0.val[0] = vcombine_u32(vget_low_u32(a0), vget_low_u32(a1));
  b0.val[1] = vcombine_u32(vget_high_u32(a0), vget_high_u32(a1));
#endif
  return b0;
}

static INLINE void transpose_u32_4x4(uint32x4_t* a0, uint32x4_t* a1, uint32x4_t* a2,
                                     uint32x4_t* a3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const uint32x4x2_t b0 = vtrnq_u32(*a0, *a1);
  const uint32x4x2_t b1 = vtrnq_u32(*a2, *a3);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const uint32x4x2_t c0 = vtrnq_u64_to_u32(b0.val[0], b1.val[0]);
  const uint32x4x2_t c1 = vtrnq_u64_to_u32(b0.val[1], b1.val[1]);

  *a0 = c0.val[0];
  *a1 = c1.val[0];
  *a2 = c0.val[1];
  *a3 = c1.val[1];
}

static INLINE void reverse_uint32x4_regs(uint32x4_t* a0, uint32x4_t* a1, uint32x4_t* a2,
                                         uint32x4_t* a3) {
  vrev128q_u32(*a0, *a0);
  vrev128q_u32(*a1, *a1);
  vrev128q_u32(*a2, *a2);
  vrev128q_u32(*a3, *a3);
}

static INLINE void rotate90_u64_2x2(uint64x2_t* a0, uint64x2_t* a1) {
  uint64x2_t b0 = vcombine_u64(vget_low_u64(*a1), vget_low_u64(*a0));
  uint64x2_t b1 = vcombine_u64(vget_high_u64(*a1), vget_high_u64(*a0));
  *a0 = b0;
  *a1 = b1;
}

static INLINE void rotate270_u64_2x2(uint64x2_t* a0, uint64x2_t* a1) {
  uint64x2_t b0 = vcombine_u64(vget_low_u64(*a0), vget_low_u64(*a1));
  uint64x2_t b1 = vcombine_u64(vget_high_u64(*a0), vget_high_u64(*a1));
  *a0 = b1;
  *a1 = b0;
}

static INLINE void load_u8_8x8(const uint8_t* s, const int stride, uint8x8_t* s0, uint8x8_t* s1,
                               uint8x8_t* s2, uint8x8_t* s3, uint8x8_t* s4, uint8x8_t* s5,
                               uint8x8_t* s6, uint8x8_t* s7) {
  *s0 = vld1_u8(s);
  s += stride;
  *s1 = vld1_u8(s);
  s += stride;
  *s2 = vld1_u8(s);
  s += stride;
  *s3 = vld1_u8(s);
  s += stride;
  *s4 = vld1_u8(s);
  s += stride;
  *s5 = vld1_u8(s);
  s += stride;
  *s6 = vld1_u8(s);
  s += stride;
  *s7 = vld1_u8(s);
}

static INLINE void load_u16_8x8(const uint16_t* s, const int stride, uint16x8_t* s0, uint16x8_t* s1,
                                uint16x8_t* s2, uint16x8_t* s3, uint16x8_t* s4, uint16x8_t* s5,
                                uint16x8_t* s6, uint16x8_t* s7) {
  *s0 = vld1q_u16(s);
  s += stride;
  *s1 = vld1q_u16(s);
  s += stride;
  *s2 = vld1q_u16(s);
  s += stride;
  *s3 = vld1q_u16(s);
  s += stride;
  *s4 = vld1q_u16(s);
  s += stride;
  *s5 = vld1q_u16(s);
  s += stride;
  *s6 = vld1q_u16(s);
  s += stride;
  *s7 = vld1q_u16(s);
}

static INLINE void load_u32_4x4(const uint32_t* s, const int stride, uint32x4_t* s1, uint32x4_t* s2,
                                uint32x4_t* s3, uint32x4_t* s4) {
  *s1 = vld1q_u32(s);
  s += stride;
  *s2 = vld1q_u32(s);
  s += stride;
  *s3 = vld1q_u32(s);
  s += stride;
  *s4 = vld1q_u32(s);
}

static INLINE void load_u64_2x2(const uint64_t* s, const int stride, uint64x2_t* s1,
                                uint64x2_t* s2) {
  *s1 = vld1q_u64(s);
  s += stride;
  *s2 = vld1q_u64(s);
}

static INLINE void store_u8_8x8(uint8_t* s, int stride, uint8x8_t s0, uint8x8_t s1, uint8x8_t s2,
                                uint8x8_t s3, uint8x8_t s4, uint8x8_t s5, uint8x8_t s6,
                                uint8x8_t s7) {
  vst1_u8(s, s0);
  s += stride;
  vst1_u8(s, s1);
  s += stride;
  vst1_u8(s, s2);
  s += stride;
  vst1_u8(s, s3);
  s += stride;
  vst1_u8(s, s4);
  s += stride;
  vst1_u8(s, s5);
  s += stride;
  vst1_u8(s, s6);
  s += stride;
  vst1_u8(s, s7);
}

static INLINE void store_u16_8x8(uint16_t* s, int stride, uint16x8_t s0, uint16x8_t s1,
                                 uint16x8_t s2, uint16x8_t s3, uint16x8_t s4, uint16x8_t s5,
                                 uint16x8_t s6, uint16x8_t s7) {
  vst1q_u16(s, s0);
  s += stride;
  vst1q_u16(s, s1);
  s += stride;
  vst1q_u16(s, s2);
  s += stride;
  vst1q_u16(s, s3);
  s += stride;
  vst1q_u16(s, s4);
  s += stride;
  vst1q_u16(s, s5);
  s += stride;
  vst1q_u16(s, s6);
  s += stride;
  vst1q_u16(s, s7);
}

static INLINE void store_u32_4x4(uint32_t* s, int stride, uint32x4_t s1, uint32x4_t s2,
                                 uint32x4_t s3, uint32x4_t s4) {
  vst1q_u32(s, s1);
  s += stride;
  vst1q_u32(s, s2);
  s += stride;
  vst1q_u32(s, s3);
  s += stride;
  vst1q_u32(s, s4);
}

static INLINE void store_u64_2x2(uint64_t* s, int stride, uint64x2_t s1, uint64x2_t s2) {
  vst1q_u64(s, s1);
  s += stride;
  vst1q_u64(s, s2);
}

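// Clockwise 90-degree rotation: square blk_wd x blk_wd tiles are loaded,
// transposed and reversed in registers, then stored at the mirrored
// destination offset. Images smaller than one tile fall back to the scalar
// rotate_buffer_clockwise(), and right-edge columns that do not fill a tile
// are rotated with scalar loops.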
static void rotate_buffer_clockwise_90_neon_uint8_t(uint8_t* src_buffer, uint8_t* dst_buffer,
                                                    int src_w, int src_h, int src_stride,
                                                    int dst_stride) {
  const int blk_wd = 8;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 90);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint8x8_t s[blk_wd];
  int i = 0;

  while (1) {
    uint8_t* dst_blk = dst_buffer + src_h - i - blk_wd;
    uint8_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk += (blk_wd * dst_stride)) {
      load_u8_8x8(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      reverse_uint8x8_regs(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      store_u8_8x8(dst_blk, dst_stride, s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7]);
    }
    if (sub_img_w < src_w) {
      dst_blk += blk_wd - 1;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[l * dst_stride - k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_90_neon_uint16_t(uint16_t* src_buffer, uint16_t* dst_buffer,
                                                     int src_w, int src_h, int src_stride,
                                                     int dst_stride) {
  const int blk_wd = 8;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 90);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint16x8_t s[blk_wd];
  int i = 0;

  while (1) {
    uint16_t* dst_blk = dst_buffer + src_h - i - blk_wd;
    uint16_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk += (blk_wd * dst_stride)) {
      load_u16_8x8(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      transpose_u16_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      reverse_uint16x8_regs(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      store_u16_8x8(dst_blk, dst_stride, s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7]);
    }
    if (sub_img_w < src_w) {
      dst_blk += blk_wd - 1;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[l * dst_stride - k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_90_neon_uint32_t(uint32_t* src_buffer, uint32_t* dst_buffer,
                                                     int src_w, int src_h, int src_stride,
                                                     int dst_stride) {
  const int blk_wd = 4;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 90);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint32x4_t s[blk_wd];
  int i = 0;

  while (1) {
    uint32_t* dst_blk = dst_buffer + src_h - i - blk_wd;
    uint32_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk += (blk_wd * dst_stride)) {
      load_u32_4x4(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3]);
      transpose_u32_4x4(&s[0], &s[1], &s[2], &s[3]);
      reverse_uint32x4_regs(&s[0], &s[1], &s[2], &s[3]);
      store_u32_4x4(dst_blk, dst_stride, s[0], s[1], s[2], s[3]);
    }
    if (sub_img_w < src_w) {
      dst_blk += blk_wd - 1;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[l * dst_stride - k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_90_neon_uint64_t(uint64_t* src_buffer, uint64_t* dst_buffer,
                                                     int src_w, int src_h, int src_stride,
                                                     int dst_stride) {
  const int blk_wd = 2;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 90);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint64x2_t s[blk_wd];
  int i = 0;

  while (1) {
    uint64_t* dst_blk = dst_buffer + src_h - i - blk_wd;
    uint64_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk += (blk_wd * dst_stride)) {
      load_u64_2x2(src_blk, src_stride, &s[0], &s[1]);
      rotate90_u64_2x2(&s[0], &s[1]);
      store_u64_2x2(dst_blk, dst_stride, s[0], s[1]);
    }
    if (sub_img_w < src_w) {
      dst_blk += blk_wd - 1;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[l * dst_stride - k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

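// Clockwise 270-degree rotation: tiles are transposed and stored with their
// register order reversed, using the same tiling and scalar fallbacks as the
// 90-degree path above.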
static void rotate_buffer_clockwise_270_neon_uint8_t(uint8_t* src_buffer, uint8_t* dst_buffer,
                                                     int src_w, int src_h, int src_stride,
                                                     int dst_stride) {
  const int blk_wd = 8;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 270);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint8x8_t s[blk_wd];
  int i = 0;

  while (1) {
    uint8_t* dst_blk = dst_buffer + i + (src_w - blk_wd) * dst_stride;
    uint8_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk -= (blk_wd * dst_stride)) {
      load_u8_8x8(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      store_u8_8x8(dst_blk, dst_stride, s[7], s[6], s[5], s[4], s[3], s[2], s[1], s[0]);
    }
    if (sub_img_w < src_w) {
      dst_blk += (blk_wd - 1) * dst_stride;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[-l * dst_stride + k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_270_neon_uint16_t(uint16_t* src_buffer, uint16_t* dst_buffer,
                                                      int src_w, int src_h, int src_stride,
                                                      int dst_stride) {
  const int blk_wd = 8;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 270);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint16x8_t s[blk_wd];
  int i = 0;

  while (1) {
    uint16_t* dst_blk = dst_buffer + i + (src_w - blk_wd) * dst_stride;
    uint16_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk -= (blk_wd * dst_stride)) {
      load_u16_8x8(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      transpose_u16_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      store_u16_8x8(dst_blk, dst_stride, s[7], s[6], s[5], s[4], s[3], s[2], s[1], s[0]);
    }
    if (sub_img_w < src_w) {
      dst_blk += (blk_wd - 1) * dst_stride;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[-l * dst_stride + k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_270_neon_uint32_t(uint32_t* src_buffer, uint32_t* dst_buffer,
                                                      int src_w, int src_h, int src_stride,
                                                      int dst_stride) {
  const int blk_wd = 4;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 270);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint32x4_t s[blk_wd];
  int i = 0;

  while (1) {
    uint32_t* dst_blk = dst_buffer + i + (src_w - blk_wd) * dst_stride;
    uint32_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk -= (blk_wd * dst_stride)) {
      load_u32_4x4(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3]);
      transpose_u32_4x4(&s[0], &s[1], &s[2], &s[3]);
      store_u32_4x4(dst_blk, dst_stride, s[3], s[2], s[1], s[0]);
    }
    if (sub_img_w < src_w) {
      dst_blk += (blk_wd - 1) * dst_stride;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[-l * dst_stride + k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_270_neon_uint64_t(uint64_t* src_buffer, uint64_t* dst_buffer,
                                                      int src_w, int src_h, int src_stride,
                                                      int dst_stride) {
  const int blk_wd = 2;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 270);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint64x2_t s[blk_wd];
  int i = 0;

  while (1) {
    uint64_t* dst_blk = dst_buffer + i + (src_w - blk_wd) * dst_stride;
    uint64_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk -= (blk_wd * dst_stride)) {
      load_u64_2x2(src_blk, src_stride, &s[0], &s[1]);
      rotate270_u64_2x2(&s[0], &s[1]);
      store_u64_2x2(dst_blk, dst_stride, s[0], s[1]);
    }
    if (sub_img_w < src_w) {
      dst_blk += (blk_wd - 1) * dst_stride;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[-l * dst_stride + k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

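// Templated entry points: dispatch to the width-specific helpers above based
// on sizeof(T).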
template <typename T>
void mirror_buffer_neon(T* src_buffer, T* dst_buffer, int src_w, int src_h, int src_stride,
                        int dst_stride, uhdr_mirror_direction_t direction) {
  if (direction == UHDR_MIRROR_VERTICAL) {
    if constexpr (sizeof(T) == 1) {
      mirror_buffer_vertical_neon_uint8_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                          dst_stride);
    } else if constexpr (sizeof(T) == 2) {
      mirror_buffer_vertical_neon_uint16_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                           dst_stride);
    } else if constexpr (sizeof(T) == 4) {
      mirror_buffer_vertical_neon_uint32_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                           dst_stride);
    } else if constexpr (sizeof(T) == 8) {
      mirror_buffer_vertical_neon_uint64_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                           dst_stride);
    }

  } else if (direction == UHDR_MIRROR_HORIZONTAL) {
    if constexpr (sizeof(T) == 1) {
      mirror_buffer_horizontal_neon_uint8_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                            dst_stride);
    } else if constexpr (sizeof(T) == 2) {
      mirror_buffer_horizontal_neon_uint16_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
    } else if constexpr (sizeof(T) == 4) {
      mirror_buffer_horizontal_neon_uint32_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
    } else if constexpr (sizeof(T) == 8) {
      mirror_buffer_horizontal_neon_uint64_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
    }
  }
}

template <typename T>
void rotate_buffer_clockwise_180_neon(T* src_buffer, T* dst_buffer, int src_w, int src_h,
                                      int src_stride, int dst_stride) {
  if constexpr (sizeof(T) == 1) {
    mirror_buffer_horizontal_neon_uint8_t(src_buffer + (src_h - 1) * src_stride, dst_buffer, src_w,
                                          src_h, -src_stride, dst_stride);
  } else if constexpr (sizeof(T) == 2) {
    mirror_buffer_horizontal_neon_uint16_t(src_buffer + (src_h - 1) * src_stride, dst_buffer, src_w,
                                           src_h, -src_stride, dst_stride);
  } else if constexpr (sizeof(T) == 4) {
    mirror_buffer_horizontal_neon_uint32_t(src_buffer + (src_h - 1) * src_stride, dst_buffer, src_w,
                                           src_h, -src_stride, dst_stride);
  } else if constexpr (sizeof(T) == 8) {
    mirror_buffer_horizontal_neon_uint64_t(src_buffer + (src_h - 1) * src_stride, dst_buffer, src_w,
                                           src_h, -src_stride, dst_stride);
  }
}

template <typename T>
void rotate_buffer_clockwise_90_neon(T* src_buffer, T* dst_buffer, int src_w, int src_h,
                                     int src_stride, int dst_stride) {
  if constexpr (sizeof(T) == 1) {
    rotate_buffer_clockwise_90_neon_uint8_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                            dst_stride);
  } else if constexpr (sizeof(T) == 2) {
    rotate_buffer_clockwise_90_neon_uint16_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
  } else if constexpr (sizeof(T) == 4) {
    rotate_buffer_clockwise_90_neon_uint32_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
  } else if constexpr (sizeof(T) == 8) {
    rotate_buffer_clockwise_90_neon_uint64_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
  }
}

template <typename T>
void rotate_buffer_clockwise_270_neon(T* src_buffer, T* dst_buffer, int src_w, int src_h,
                                      int src_stride, int dst_stride) {
  if constexpr (sizeof(T) == 1) {
    rotate_buffer_clockwise_270_neon_uint8_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
  } else if constexpr (sizeof(T) == 2) {
    rotate_buffer_clockwise_270_neon_uint16_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                              dst_stride);
  } else if constexpr (sizeof(T) == 4) {
    rotate_buffer_clockwise_270_neon_uint32_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                              dst_stride);
  } else if constexpr (sizeof(T) == 8) {
    rotate_buffer_clockwise_270_neon_uint64_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                              dst_stride);
  }
}

template <typename T>
void rotate_buffer_clockwise_neon(T* src_buffer, T* dst_buffer, int src_w, int src_h,
                                  int src_stride, int dst_stride, int degrees) {
  if (degrees == 90) {
    rotate_buffer_clockwise_90_neon(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride);
  } else if (degrees == 180) {
    rotate_buffer_clockwise_180_neon(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride);
  } else if (degrees == 270) {
    rotate_buffer_clockwise_270_neon(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride);
  }
}

template void mirror_buffer_neon<uint8_t>(uint8_t*, uint8_t*, int, int, int, int,
                                          uhdr_mirror_direction_t);
template void mirror_buffer_neon<uint16_t>(uint16_t*, uint16_t*, int, int, int, int,
                                           uhdr_mirror_direction_t);
template void mirror_buffer_neon<uint32_t>(uint32_t*, uint32_t*, int, int, int, int,
                                           uhdr_mirror_direction_t);
template void mirror_buffer_neon<uint64_t>(uint64_t*, uint64_t*, int, int, int, int,
                                           uhdr_mirror_direction_t);

template void rotate_buffer_clockwise_neon<uint8_t>(uint8_t*, uint8_t*, int, int, int, int, int);
template void rotate_buffer_clockwise_neon<uint16_t>(uint16_t*, uint16_t*, int, int, int, int, int);
template void rotate_buffer_clockwise_neon<uint32_t>(uint32_t*, uint32_t*, int, int, int, int, int);
template void rotate_buffer_clockwise_neon<uint64_t>(uint64_t*, uint64_t*, int, int, int, int, int);

}  // namespace ultrahdr