/*
 * Copyright 2024 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <arm_neon.h>
#include <cstring>

#include "ultrahdr/editorhelper.h"

namespace ultrahdr {

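// Helper macros that reverse all lanes of a 128-bit vector: vrev64q_* reverses
// the lanes within each 64-bit half, and vextq_* then swaps the two halves.
// For 64-bit lanes a single vextq_u64 (half swap) is sufficient.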
#define vrev128q_u8(src, dst) \
  dst = vrev64q_u8(src);      \
  dst = vextq_u8(dst, dst, 8);

#define vrev128q_u16(src, dst) \
  dst = vrev64q_u16(src);      \
  dst = vextq_u16(dst, dst, 4);

#define vrev128q_u32(src, dst) \
  dst = vrev64q_u32(src);      \
  dst = vextq_u32(dst, dst, 2);

#define vrev128q_u64(a) a = vextq_u64(a, a, 1)

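// Horizontal mirror: for every row, blocks are read from the right end of the
// source row, lane-reversed in registers, and written from the left end of the
// destination row. The block size steps down (e.g. 64/32/16/8 elements for
// 8-bit data) and a scalar loop handles the remaining tail. Strides are
// expressed in elements, not bytes.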
static void mirror_buffer_horizontal_neon_uint8_t(uint8_t* src_buffer, uint8_t* dst_buffer,
                                                  int src_w, int src_h, int src_stride,
                                                  int dst_stride) {
  uint8_t* src_row = src_buffer;
  uint8_t* dst_row = dst_buffer;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row += dst_stride) {
    uint8_t* src_blk = src_row + src_w;
    uint8_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 64 <= src_w; src_blk -= 64, dst_blk += 64, j += 64) {
      uint8x16x4_t s0 = vld1q_u8_x4(src_blk - 64);
      uint8x16x4_t d0;
      vrev128q_u8(s0.val[0], d0.val[3]);
      vrev128q_u8(s0.val[1], d0.val[2]);
      vrev128q_u8(s0.val[2], d0.val[1]);
      vrev128q_u8(s0.val[3], d0.val[0]);
      vst1q_u8_x4(dst_blk, d0);
    }

    for (; j + 32 <= src_w; src_blk -= 32, dst_blk += 32, j += 32) {
      uint8x16x2_t s0 = vld1q_u8_x2(src_blk - 32);
      uint8x16x2_t d0;
      vrev128q_u8(s0.val[0], d0.val[1]);
      vrev128q_u8(s0.val[1], d0.val[0]);
      vst1q_u8_x2(dst_blk, d0);
    }

    for (; j + 16 <= src_w; src_blk -= 16, dst_blk += 16, j += 16) {
      uint8x16_t s0 = vld1q_u8(src_blk - 16);
      vrev128q_u8(s0, s0);
      vst1q_u8(dst_blk, s0);
    }

    for (; j + 8 <= src_w; src_blk -= 8, dst_blk += 8, j += 8) {
      uint8x8_t s0 = vld1_u8(src_blk - 8);
      s0 = vrev64_u8(s0);
      vst1_u8(dst_blk, s0);
    }

    for (int k = 0; k < src_w - j; k++) {
      dst_blk[k] = src_row[src_w - j - k - 1];
    }
  }
}

static void mirror_buffer_horizontal_neon_uint16_t(uint16_t* src_buffer, uint16_t* dst_buffer,
                                                   int src_w, int src_h, int src_stride,
                                                   int dst_stride) {
  uint16_t* src_row = src_buffer;
  uint16_t* dst_row = dst_buffer;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row += dst_stride) {
    uint16_t* src_blk = src_row + src_w;
    uint16_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 32 <= src_w; src_blk -= 32, dst_blk += 32, j += 32) {
      uint16x8x4_t s0 = vld1q_u16_x4(src_blk - 32);
      uint16x8x4_t d0;
      vrev128q_u16(s0.val[0], d0.val[3]);
      vrev128q_u16(s0.val[1], d0.val[2]);
      vrev128q_u16(s0.val[2], d0.val[1]);
      vrev128q_u16(s0.val[3], d0.val[0]);
      vst1q_u16_x4(dst_blk, d0);
    }

    for (; j + 16 <= src_w; src_blk -= 16, dst_blk += 16, j += 16) {
      uint16x8x2_t s0 = vld1q_u16_x2(src_blk - 16);
      uint16x8x2_t d0;
      vrev128q_u16(s0.val[0], d0.val[1]);
      vrev128q_u16(s0.val[1], d0.val[0]);
      vst1q_u16_x2(dst_blk, d0);
    }

    for (; j + 8 <= src_w; src_blk -= 8, dst_blk += 8, j += 8) {
      uint16x8_t s0 = vld1q_u16(src_blk - 8);
      vrev128q_u16(s0, s0);
      vst1q_u16(dst_blk, s0);
    }

    for (int k = 0; k < src_w - j; k++) {
      dst_blk[k] = src_row[src_w - j - k - 1];
    }
  }
}

static void mirror_buffer_horizontal_neon_uint32_t(uint32_t* src_buffer, uint32_t* dst_buffer,
                                                   int src_w, int src_h, int src_stride,
                                                   int dst_stride) {
  uint32_t* src_row = src_buffer;
  uint32_t* dst_row = dst_buffer;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row += dst_stride) {
    uint32_t* src_blk = src_row + src_w;
    uint32_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 16 <= src_w; src_blk -= 16, dst_blk += 16, j += 16) {
      uint32x4x4_t s0 = vld1q_u32_x4(src_blk - 16);
      uint32x4x4_t d0;
      vrev128q_u32(s0.val[0], d0.val[3]);
      vrev128q_u32(s0.val[1], d0.val[2]);
      vrev128q_u32(s0.val[2], d0.val[1]);
      vrev128q_u32(s0.val[3], d0.val[0]);
      vst1q_u32_x4(dst_blk, d0);
    }

    for (; j + 8 <= src_w; src_blk -= 8, dst_blk += 8, j += 8) {
      uint32x4x2_t s0 = vld1q_u32_x2(src_blk - 8);
      uint32x4x2_t d0;
      vrev128q_u32(s0.val[0], d0.val[1]);
      vrev128q_u32(s0.val[1], d0.val[0]);
      vst1q_u32_x2(dst_blk, d0);
    }

    for (; j + 4 <= src_w; src_blk -= 4, dst_blk += 4, j += 4) {
      uint32x4_t s0 = vld1q_u32(src_blk - 4);
      vrev128q_u32(s0, s0);
      vst1q_u32(dst_blk, s0);
    }

    for (int k = 0; k < src_w - j; k++) {
      dst_blk[k] = src_row[src_w - j - k - 1];
    }
  }
}

static void mirror_buffer_horizontal_neon_uint64_t(uint64_t* src_buffer, uint64_t* dst_buffer,
                                                   int src_w, int src_h, int src_stride,
                                                   int dst_stride) {
  uint64_t* src_row = src_buffer;
  uint64_t* dst_row = dst_buffer;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row += dst_stride) {
    uint64_t* src_blk = src_row + src_w;
    uint64_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 2 <= src_w; src_blk -= 2, dst_blk += 2, j += 2) {
      uint64x2_t s0 = vld1q_u64(src_blk - 2);
      vrev128q_u64(s0);
      vst1q_u64(dst_blk, s0);
    }
    for (int k = 0; k < src_w - j; k++) {
      dst_blk[k] = src_row[src_w - j - k - 1];
    }
  }
}

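// Vertical mirror: rows keep their left-to-right order, so row i of the source
// is simply copied to row (src_h - 1 - i) of the destination using wide NEON
// loads/stores, with a memcpy for the tail of each row.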
static void mirror_buffer_vertical_neon_uint8_t(uint8_t* src_buffer, uint8_t* dst_buffer, int src_w,
                                                int src_h, int src_stride, int dst_stride) {
  uint8_t* src_row = src_buffer;
  uint8_t* dst_row = dst_buffer + (src_h - 1) * dst_stride;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row -= dst_stride) {
    uint8_t* src_blk = src_row;
    uint8_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 64 <= src_w; src_blk += 64, dst_blk += 64, j += 64) {
      uint8x16x4_t s0 = vld1q_u8_x4(src_blk);
      vst1q_u8_x4(dst_blk, s0);
    }

    for (; j + 32 <= src_w; src_blk += 32, dst_blk += 32, j += 32) {
      uint8x16x2_t s0 = vld1q_u8_x2(src_blk);
      vst1q_u8_x2(dst_blk, s0);
    }

    for (; j + 16 <= src_w; src_blk += 16, dst_blk += 16, j += 16) {
      uint8x16_t s0 = vld1q_u8(src_blk);
      vst1q_u8(dst_blk, s0);
    }

    for (; j + 8 <= src_w; src_blk += 8, dst_blk += 8, j += 8) {
      uint8x8_t s0 = vld1_u8(src_blk);
      vst1_u8(dst_blk, s0);
    }

    if (j < src_w) memcpy(dst_blk, src_blk, src_w - j);
  }
}

static void mirror_buffer_vertical_neon_uint16_t(uint16_t* src_buffer, uint16_t* dst_buffer,
                                                 int src_w, int src_h, int src_stride,
                                                 int dst_stride) {
  uint16_t* src_row = src_buffer;
  uint16_t* dst_row = dst_buffer + (src_h - 1) * dst_stride;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row -= dst_stride) {
    uint16_t* src_blk = src_row;
    uint16_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 32 <= src_w; src_blk += 32, dst_blk += 32, j += 32) {
      uint16x8x4_t s0 = vld1q_u16_x4(src_blk);
      vst1q_u16_x4(dst_blk, s0);
    }

    for (; j + 16 <= src_w; src_blk += 16, dst_blk += 16, j += 16) {
      uint16x8x2_t s0 = vld1q_u16_x2(src_blk);
      vst1q_u16_x2(dst_blk, s0);
    }

    for (; j + 8 <= src_w; src_blk += 8, dst_blk += 8, j += 8) {
      uint16x8_t s0 = vld1q_u16(src_blk);
      vst1q_u16(dst_blk, s0);
    }

    if (j < src_w) memcpy(dst_blk, src_blk, (src_w - j) * sizeof(uint16_t));
  }
}

static void mirror_buffer_vertical_neon_uint32_t(uint32_t* src_buffer, uint32_t* dst_buffer,
                                                 int src_w, int src_h, int src_stride,
                                                 int dst_stride) {
  uint32_t* src_row = src_buffer;
  uint32_t* dst_row = dst_buffer + (src_h - 1) * dst_stride;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row -= dst_stride) {
    uint32_t* src_blk = src_row;
    uint32_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 16 <= src_w; src_blk += 16, dst_blk += 16, j += 16) {
      uint32x4x4_t s0 = vld1q_u32_x4(src_blk);
      vst1q_u32_x4(dst_blk, s0);
    }

    for (; j + 8 <= src_w; src_blk += 8, dst_blk += 8, j += 8) {
      uint32x4x2_t s0 = vld1q_u32_x2(src_blk);
      vst1q_u32_x2(dst_blk, s0);
    }

    for (; j + 4 <= src_w; src_blk += 4, dst_blk += 4, j += 4) {
      uint32x4_t s0 = vld1q_u32(src_blk);
      vst1q_u32(dst_blk, s0);
    }

    if (j < src_w) memcpy(dst_blk, src_blk, (src_w - j) * sizeof(uint32_t));
  }
}

static void mirror_buffer_vertical_neon_uint64_t(uint64_t* src_buffer, uint64_t* dst_buffer,
                                                 int src_w, int src_h, int src_stride,
                                                 int dst_stride) {
  uint64_t* src_row = src_buffer;
  uint64_t* dst_row = dst_buffer + (src_h - 1) * dst_stride;

  for (int i = 0; i < src_h; i++, src_row += src_stride, dst_row -= dst_stride) {
    uint64_t* src_blk = src_row;
    uint64_t* dst_blk = dst_row;
    int j = 0;

    for (; j + 2 <= src_w; src_blk += 2, dst_blk += 2, j += 2) {
      uint64x2_t s0 = vld1q_u64(src_blk);
      vst1q_u64(dst_blk, s0);
    }

    if (j < src_w) memcpy(dst_blk, src_blk, (src_w - j) * sizeof(uint64_t));
  }
}

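// The in-register transpose helpers below are the building blocks of the
// rotation routines: a clockwise 90 degree rotation of a tile is a transpose
// followed by reversing each output row, and a clockwise 270 degree rotation
// is a transpose followed by reversing the order of the output rows.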
static INLINE void transpose_u8_8x8(uint8x8_t* a0, uint8x8_t* a1, uint8x8_t* a2, uint8x8_t* a3,
                                    uint8x8_t* a4, uint8x8_t* a5, uint8x8_t* a6, uint8x8_t* a7) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
  // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
  // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
  // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77

  const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
  const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
  // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
  // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
  // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77

  const uint16x8x2_t c0 =
      vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]), vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 =
      vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]), vreinterpretq_u16_u8(b1.val[1]));

  // Unzip 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
  // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
  const uint32x4x2_t d0 =
      vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]), vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 =
      vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]), vreinterpretq_u32_u16(c1.val[1]));

  *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
  *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
  *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
  *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
  *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
  *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
  *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
  *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}

static INLINE void reverse_uint8x8_regs(uint8x8_t* a0, uint8x8_t* a1, uint8x8_t* a2, uint8x8_t* a3,
                                        uint8x8_t* a4, uint8x8_t* a5, uint8x8_t* a6,
                                        uint8x8_t* a7) {
  *a0 = vrev64_u8(*a0);
  *a1 = vrev64_u8(*a1);
  *a2 = vrev64_u8(*a2);
  *a3 = vrev64_u8(*a3);
  *a4 = vrev64_u8(*a4);
  *a5 = vrev64_u8(*a5);
  *a6 = vrev64_u8(*a6);
  *a7 = vrev64_u8(*a7);
}

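// Interleaves the 64-bit halves of two vectors (a 64-bit "transpose" step).
// AArch64 provides vtrn1q_u64/vtrn2q_u64 directly; on 32-bit Arm the same
// result is assembled from vget_low/vget_high and vcombine.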
static INLINE uint16x8x2_t vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
  uint16x8x2_t b0;

#if (defined(__arm64__) && defined(__APPLE__)) || defined(__aarch64__)
  b0.val[0] =
      vreinterpretq_u16_u64(vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
  b0.val[1] =
      vreinterpretq_u16_u64(vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
#else
  b0.val[0] =
      vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)), vreinterpret_u16_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                           vreinterpret_u16_u32(vget_high_u32(a1)));
#endif
  return b0;
}

static INLINE void transpose_u16_8x8(uint16x8_t* a0, uint16x8_t* a1, uint16x8_t* a2, uint16x8_t* a3,
                                     uint16x8_t* a4, uint16x8_t* a5, uint16x8_t* a6,
                                     uint16x8_t* a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77
  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
  const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
  const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77
  const uint32x4x2_t c0 =
      vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]), vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 =
      vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]), vreinterpretq_u32_u16(b1.val[1]));
  const uint32x4x2_t c2 =
      vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]), vreinterpretq_u32_u16(b3.val[0]));
  const uint32x4x2_t c3 =
      vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]), vreinterpretq_u32_u16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77
  const uint16x8x2_t d0 = vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
  const uint16x8x2_t d1 = vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
  const uint16x8x2_t d2 = vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
  const uint16x8x2_t d3 = vtrnq_u64_to_u16(c1.val[1], c3.val[1]);

  *a0 = d0.val[0];
  *a1 = d1.val[0];
  *a2 = d2.val[0];
  *a3 = d3.val[0];
  *a4 = d0.val[1];
  *a5 = d1.val[1];
  *a6 = d2.val[1];
  *a7 = d3.val[1];
}

static INLINE void reverse_uint16x8_regs(uint16x8_t* a0, uint16x8_t* a1, uint16x8_t* a2,
                                         uint16x8_t* a3, uint16x8_t* a4, uint16x8_t* a5,
                                         uint16x8_t* a6, uint16x8_t* a7) {
  vrev128q_u16(*a0, *a0);
  vrev128q_u16(*a1, *a1);
  vrev128q_u16(*a2, *a2);
  vrev128q_u16(*a3, *a3);
  vrev128q_u16(*a4, *a4);
  vrev128q_u16(*a5, *a5);
  vrev128q_u16(*a6, *a6);
  vrev128q_u16(*a7, *a7);
}

static INLINE uint32x4x2_t vtrnq_u64_to_u32(uint32x4_t a0, uint32x4_t a1) {
  uint32x4x2_t b0;
#if (defined(__arm64__) && defined(__APPLE__)) || defined(__aarch64__)
  b0.val[0] =
      vreinterpretq_u32_u64(vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
  b0.val[1] =
      vreinterpretq_u32_u64(vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
#else
  b0.val[0] = vcombine_u32(vget_low_u32(a0), vget_low_u32(a1));
  b0.val[1] = vcombine_u32(vget_high_u32(a0), vget_high_u32(a1));
#endif
  return b0;
}

static INLINE void transpose_u32_4x4(uint32x4_t* a0, uint32x4_t* a1, uint32x4_t* a2,
                                     uint32x4_t* a3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const uint32x4x2_t b0 = vtrnq_u32(*a0, *a1);
  const uint32x4x2_t b1 = vtrnq_u32(*a2, *a3);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const uint32x4x2_t c0 = vtrnq_u64_to_u32(b0.val[0], b1.val[0]);
  const uint32x4x2_t c1 = vtrnq_u64_to_u32(b0.val[1], b1.val[1]);

  *a0 = c0.val[0];
  *a1 = c1.val[0];
  *a2 = c0.val[1];
  *a3 = c1.val[1];
}

static INLINE void reverse_uint32x4_regs(uint32x4_t* a0, uint32x4_t* a1, uint32x4_t* a2,
                                         uint32x4_t* a3) {
  vrev128q_u32(*a0, *a0);
  vrev128q_u32(*a1, *a1);
  vrev128q_u32(*a2, *a2);
  vrev128q_u32(*a3, *a3);
}

static INLINE void rotate90_u64_2x2(uint64x2_t* a0, uint64x2_t* a1) {
  uint64x2_t b0 = vcombine_u64(vget_low_u64(*a1), vget_low_u64(*a0));
  uint64x2_t b1 = vcombine_u64(vget_high_u64(*a1), vget_high_u64(*a0));
  *a0 = b0;
  *a1 = b1;
}

static INLINE void rotate270_u64_2x2(uint64x2_t* a0, uint64x2_t* a1) {
  uint64x2_t b0 = vcombine_u64(vget_low_u64(*a0), vget_low_u64(*a1));
  uint64x2_t b1 = vcombine_u64(vget_high_u64(*a0), vget_high_u64(*a1));
  *a0 = b1;
  *a1 = b0;
}

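// Tile load/store helpers: read or write one square block of elements, one
// register per row, with the row-to-row distance given by `stride` (in
// elements).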
static INLINE void load_u8_8x8(const uint8_t* s, const int stride, uint8x8_t* s0, uint8x8_t* s1,
                               uint8x8_t* s2, uint8x8_t* s3, uint8x8_t* s4, uint8x8_t* s5,
                               uint8x8_t* s6, uint8x8_t* s7) {
  *s0 = vld1_u8(s);
  s += stride;
  *s1 = vld1_u8(s);
  s += stride;
  *s2 = vld1_u8(s);
  s += stride;
  *s3 = vld1_u8(s);
  s += stride;
  *s4 = vld1_u8(s);
  s += stride;
  *s5 = vld1_u8(s);
  s += stride;
  *s6 = vld1_u8(s);
  s += stride;
  *s7 = vld1_u8(s);
}

static INLINE void load_u16_8x8(const uint16_t* s, const int stride, uint16x8_t* s0, uint16x8_t* s1,
                                uint16x8_t* s2, uint16x8_t* s3, uint16x8_t* s4, uint16x8_t* s5,
                                uint16x8_t* s6, uint16x8_t* s7) {
  *s0 = vld1q_u16(s);
  s += stride;
  *s1 = vld1q_u16(s);
  s += stride;
  *s2 = vld1q_u16(s);
  s += stride;
  *s3 = vld1q_u16(s);
  s += stride;
  *s4 = vld1q_u16(s);
  s += stride;
  *s5 = vld1q_u16(s);
  s += stride;
  *s6 = vld1q_u16(s);
  s += stride;
  *s7 = vld1q_u16(s);
}

static INLINE void load_u32_4x4(const uint32_t* s, const int stride, uint32x4_t* s1, uint32x4_t* s2,
                                uint32x4_t* s3, uint32x4_t* s4) {
  *s1 = vld1q_u32(s);
  s += stride;
  *s2 = vld1q_u32(s);
  s += stride;
  *s3 = vld1q_u32(s);
  s += stride;
  *s4 = vld1q_u32(s);
}

static INLINE void load_u64_2x2(const uint64_t* s, const int stride, uint64x2_t* s1,
                                uint64x2_t* s2) {
  *s1 = vld1q_u64(s);
  s += stride;
  *s2 = vld1q_u64(s);
}

static INLINE void store_u8_8x8(uint8_t* s, int stride, uint8x8_t s0, uint8x8_t s1, uint8x8_t s2,
                                uint8x8_t s3, uint8x8_t s4, uint8x8_t s5, uint8x8_t s6,
                                uint8x8_t s7) {
  vst1_u8(s, s0);
  s += stride;
  vst1_u8(s, s1);
  s += stride;
  vst1_u8(s, s2);
  s += stride;
  vst1_u8(s, s3);
  s += stride;
  vst1_u8(s, s4);
  s += stride;
  vst1_u8(s, s5);
  s += stride;
  vst1_u8(s, s6);
  s += stride;
  vst1_u8(s, s7);
}

static INLINE void store_u16_8x8(uint16_t* s, int stride, uint16x8_t s0, uint16x8_t s1,
                                 uint16x8_t s2, uint16x8_t s3, uint16x8_t s4, uint16x8_t s5,
                                 uint16x8_t s6, uint16x8_t s7) {
  vst1q_u16(s, s0);
  s += stride;
  vst1q_u16(s, s1);
  s += stride;
  vst1q_u16(s, s2);
  s += stride;
  vst1q_u16(s, s3);
  s += stride;
  vst1q_u16(s, s4);
  s += stride;
  vst1q_u16(s, s5);
  s += stride;
  vst1q_u16(s, s6);
  s += stride;
  vst1q_u16(s, s7);
}

static INLINE void store_u32_4x4(uint32_t* s, int stride, uint32x4_t s1, uint32x4_t s2,
                                 uint32x4_t s3, uint32x4_t s4) {
  vst1q_u32(s, s1);
  s += stride;
  vst1q_u32(s, s2);
  s += stride;
  vst1q_u32(s, s3);
  s += stride;
  vst1q_u32(s, s4);
}

static INLINE void store_u64_2x2(uint64_t* s, int stride, uint64x2_t s1, uint64x2_t s2) {
  vst1q_u64(s, s1);
  s += stride;
  vst1q_u64(s, s2);
}

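// Clockwise 90 degree rotation, processed in square tiles (8x8 for 8/16-bit,
// 4x4 for 32-bit, 2x2 for 64-bit elements): each tile is loaded, transposed
// and lane-reversed in registers, then stored at its mirrored position in the
// destination (source row i becomes destination column src_h - 1 - i).
// Columns that do not fill a whole tile are handled with a scalar loop, and
// when src_h is not a multiple of the tile size the last band of rows is
// re-processed as an overlapping tile anchored at src_h - blk_wd. Images
// smaller than one tile fall back to the scalar rotate_buffer_clockwise.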
static void rotate_buffer_clockwise_90_neon_uint8_t(uint8_t* src_buffer, uint8_t* dst_buffer,
                                                    int src_w, int src_h, int src_stride,
                                                    int dst_stride) {
  const int blk_wd = 8;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 90);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint8x8_t s[blk_wd];
  int i = 0;

  while (1) {
    uint8_t* dst_blk = dst_buffer + src_h - i - blk_wd;
    uint8_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk += (blk_wd * dst_stride)) {
      load_u8_8x8(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      reverse_uint8x8_regs(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      store_u8_8x8(dst_blk, dst_stride, s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7]);
    }
    if (sub_img_w < src_w) {
      dst_blk += blk_wd - 1;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[l * dst_stride - k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_90_neon_uint16_t(uint16_t* src_buffer, uint16_t* dst_buffer,
                                                     int src_w, int src_h, int src_stride,
                                                     int dst_stride) {
  const int blk_wd = 8;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 90);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint16x8_t s[blk_wd];
  int i = 0;

  while (1) {
    uint16_t* dst_blk = dst_buffer + src_h - i - blk_wd;
    uint16_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk += (blk_wd * dst_stride)) {
      load_u16_8x8(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      transpose_u16_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      reverse_uint16x8_regs(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      store_u16_8x8(dst_blk, dst_stride, s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7]);
    }
    if (sub_img_w < src_w) {
      dst_blk += blk_wd - 1;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[l * dst_stride - k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_90_neon_uint32_t(uint32_t* src_buffer, uint32_t* dst_buffer,
                                                     int src_w, int src_h, int src_stride,
                                                     int dst_stride) {
  const int blk_wd = 4;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 90);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint32x4_t s[blk_wd];
  int i = 0;

  while (1) {
    uint32_t* dst_blk = dst_buffer + src_h - i - blk_wd;
    uint32_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk += (blk_wd * dst_stride)) {
      load_u32_4x4(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3]);
      transpose_u32_4x4(&s[0], &s[1], &s[2], &s[3]);
      reverse_uint32x4_regs(&s[0], &s[1], &s[2], &s[3]);
      store_u32_4x4(dst_blk, dst_stride, s[0], s[1], s[2], s[3]);
    }
    if (sub_img_w < src_w) {
      dst_blk += blk_wd - 1;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[l * dst_stride - k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_90_neon_uint64_t(uint64_t* src_buffer, uint64_t* dst_buffer,
                                                     int src_w, int src_h, int src_stride,
                                                     int dst_stride) {
  const int blk_wd = 2;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 90);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint64x2_t s[blk_wd];
  int i = 0;

  while (1) {
    uint64_t* dst_blk = dst_buffer + src_h - i - blk_wd;
    uint64_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk += (blk_wd * dst_stride)) {
      load_u64_2x2(src_blk, src_stride, &s[0], &s[1]);
      rotate90_u64_2x2(&s[0], &s[1]);
      store_u64_2x2(dst_blk, dst_stride, s[0], s[1]);
    }
    if (sub_img_w < src_w) {
      dst_blk += blk_wd - 1;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[l * dst_stride - k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

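// Clockwise 270 degree rotation follows the same tiling scheme as the 90
// degree case, but each transposed tile is stored with its rows in reverse
// order, and tiles walk the destination from the last column band backwards
// (source row i becomes destination column i, filled bottom-up).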
static void rotate_buffer_clockwise_270_neon_uint8_t(uint8_t* src_buffer, uint8_t* dst_buffer,
                                                     int src_w, int src_h, int src_stride,
                                                     int dst_stride) {
  const int blk_wd = 8;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 270);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint8x8_t s[blk_wd];
  int i = 0;

  while (1) {
    uint8_t* dst_blk = dst_buffer + i + (src_w - blk_wd) * dst_stride;
    uint8_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk -= (blk_wd * dst_stride)) {
      load_u8_8x8(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      store_u8_8x8(dst_blk, dst_stride, s[7], s[6], s[5], s[4], s[3], s[2], s[1], s[0]);
    }
    if (sub_img_w < src_w) {
      dst_blk += (blk_wd - 1) * dst_stride;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[-l * dst_stride + k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_270_neon_uint16_t(uint16_t* src_buffer, uint16_t* dst_buffer,
                                                      int src_w, int src_h, int src_stride,
                                                      int dst_stride) {
  const int blk_wd = 8;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 270);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint16x8_t s[blk_wd];
  int i = 0;

  while (1) {
    uint16_t* dst_blk = dst_buffer + i + (src_w - blk_wd) * dst_stride;
    uint16_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk -= (blk_wd * dst_stride)) {
      load_u16_8x8(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      transpose_u16_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
      store_u16_8x8(dst_blk, dst_stride, s[7], s[6], s[5], s[4], s[3], s[2], s[1], s[0]);
    }
    if (sub_img_w < src_w) {
      dst_blk += (blk_wd - 1) * dst_stride;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[-l * dst_stride + k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_270_neon_uint32_t(uint32_t* src_buffer, uint32_t* dst_buffer,
                                                      int src_w, int src_h, int src_stride,
                                                      int dst_stride) {
  const int blk_wd = 4;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 270);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint32x4_t s[blk_wd];
  int i = 0;

  while (1) {
    uint32_t* dst_blk = dst_buffer + i + (src_w - blk_wd) * dst_stride;
    uint32_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk -= (blk_wd * dst_stride)) {
      load_u32_4x4(src_blk, src_stride, &s[0], &s[1], &s[2], &s[3]);
      transpose_u32_4x4(&s[0], &s[1], &s[2], &s[3]);
      store_u32_4x4(dst_blk, dst_stride, s[3], s[2], s[1], s[0]);
    }
    if (sub_img_w < src_w) {
      dst_blk += (blk_wd - 1) * dst_stride;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[-l * dst_stride + k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

static void rotate_buffer_clockwise_270_neon_uint64_t(uint64_t* src_buffer, uint64_t* dst_buffer,
                                                      int src_w, int src_h, int src_stride,
                                                      int dst_stride) {
  const int blk_wd = 2;

  if (src_h < blk_wd || src_w < blk_wd) {
    rotate_buffer_clockwise(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride, 270);
    return;
  }

  int sub_img_w = (src_w / blk_wd) * blk_wd;
  uint64x2_t s[blk_wd];
  int i = 0;

  while (1) {
    uint64_t* dst_blk = dst_buffer + i + (src_w - blk_wd) * dst_stride;
    uint64_t* src_blk = src_buffer + (i * src_stride);
    int j;

    for (j = 0; j < sub_img_w; j += blk_wd, src_blk += blk_wd, dst_blk -= (blk_wd * dst_stride)) {
      load_u64_2x2(src_blk, src_stride, &s[0], &s[1]);
      rotate270_u64_2x2(&s[0], &s[1]);
      store_u64_2x2(dst_blk, dst_stride, s[0], s[1]);
    }
    if (sub_img_w < src_w) {
      dst_blk += (blk_wd - 1) * dst_stride;
      for (int k = 0; k < blk_wd; k++) {
        for (int l = 0; l < (src_w - sub_img_w); l++) {
          dst_blk[-l * dst_stride + k] = src_blk[k * src_stride + l];
        }
      }
    }
    i += blk_wd;
    if (i == src_h) break;
    if (i + blk_wd > src_h) i = src_h - blk_wd;
  }
}

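// Public entry points: the templates below dispatch on sizeof(T) so that each
// sample width (8/16/32/64-bit) reaches the matching fixed-width
// implementation above.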
template <typename T>
void mirror_buffer_neon(T* src_buffer, T* dst_buffer, int src_w, int src_h, int src_stride,
                        int dst_stride, uhdr_mirror_direction_t direction) {
  if (direction == UHDR_MIRROR_VERTICAL) {
    if constexpr (sizeof(T) == 1) {
      mirror_buffer_vertical_neon_uint8_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                          dst_stride);
    } else if constexpr (sizeof(T) == 2) {
      mirror_buffer_vertical_neon_uint16_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                           dst_stride);
    } else if constexpr (sizeof(T) == 4) {
      mirror_buffer_vertical_neon_uint32_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                           dst_stride);
    } else if constexpr (sizeof(T) == 8) {
      mirror_buffer_vertical_neon_uint64_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                           dst_stride);
    }
  } else if (direction == UHDR_MIRROR_HORIZONTAL) {
    if constexpr (sizeof(T) == 1) {
      mirror_buffer_horizontal_neon_uint8_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                            dst_stride);
    } else if constexpr (sizeof(T) == 2) {
      mirror_buffer_horizontal_neon_uint16_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
    } else if constexpr (sizeof(T) == 4) {
      mirror_buffer_horizontal_neon_uint32_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
    } else if constexpr (sizeof(T) == 8) {
      mirror_buffer_horizontal_neon_uint64_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
    }
  }
}

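// A 180 degree rotation is a horizontal mirror applied to the rows in reverse
// order, so it reuses the horizontal mirror kernels with the source pointer
// moved to the last row and a negated source stride.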
template <typename T>
void rotate_buffer_clockwise_180_neon(T* src_buffer, T* dst_buffer, int src_w, int src_h,
                                      int src_stride, int dst_stride) {
  if constexpr (sizeof(T) == 1) {
    mirror_buffer_horizontal_neon_uint8_t(src_buffer + (src_h - 1) * src_stride, dst_buffer, src_w,
                                          src_h, -src_stride, dst_stride);
  } else if constexpr (sizeof(T) == 2) {
    mirror_buffer_horizontal_neon_uint16_t(src_buffer + (src_h - 1) * src_stride, dst_buffer, src_w,
                                           src_h, -src_stride, dst_stride);
  } else if constexpr (sizeof(T) == 4) {
    mirror_buffer_horizontal_neon_uint32_t(src_buffer + (src_h - 1) * src_stride, dst_buffer, src_w,
                                           src_h, -src_stride, dst_stride);
  } else if constexpr (sizeof(T) == 8) {
    mirror_buffer_horizontal_neon_uint64_t(src_buffer + (src_h - 1) * src_stride, dst_buffer, src_w,
                                           src_h, -src_stride, dst_stride);
  }
}

template <typename T>
void rotate_buffer_clockwise_90_neon(T* src_buffer, T* dst_buffer, int src_w, int src_h,
                                     int src_stride, int dst_stride) {
  if constexpr (sizeof(T) == 1) {
    rotate_buffer_clockwise_90_neon_uint8_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                            dst_stride);
  } else if constexpr (sizeof(T) == 2) {
    rotate_buffer_clockwise_90_neon_uint16_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
  } else if constexpr (sizeof(T) == 4) {
    rotate_buffer_clockwise_90_neon_uint32_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
  } else if constexpr (sizeof(T) == 8) {
    rotate_buffer_clockwise_90_neon_uint64_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
  }
}

template <typename T>
void rotate_buffer_clockwise_270_neon(T* src_buffer, T* dst_buffer, int src_w, int src_h,
                                      int src_stride, int dst_stride) {
  if constexpr (sizeof(T) == 1) {
    rotate_buffer_clockwise_270_neon_uint8_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                             dst_stride);
  } else if constexpr (sizeof(T) == 2) {
    rotate_buffer_clockwise_270_neon_uint16_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                              dst_stride);
  } else if constexpr (sizeof(T) == 4) {
    rotate_buffer_clockwise_270_neon_uint32_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                              dst_stride);
  } else if constexpr (sizeof(T) == 8) {
    rotate_buffer_clockwise_270_neon_uint64_t(src_buffer, dst_buffer, src_w, src_h, src_stride,
                                              dst_stride);
  }
}

template <typename T>
void rotate_buffer_clockwise_neon(T* src_buffer, T* dst_buffer, int src_w, int src_h,
                                  int src_stride, int dst_stride, int degrees) {
  if (degrees == 90) {
    rotate_buffer_clockwise_90_neon(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride);
  } else if (degrees == 180) {
    rotate_buffer_clockwise_180_neon(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride);
  } else if (degrees == 270) {
    rotate_buffer_clockwise_270_neon(src_buffer, dst_buffer, src_w, src_h, src_stride, dst_stride);
  }
}

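// Illustrative usage (not part of the library API surface): rotating or
// mirroring a single 8-bit plane of size src_w x src_h whose rows are
// src_stride elements apart, into a caller-allocated destination. For 90/270
// degree rotations the destination is src_h x src_w, so dst_stride must cover
// at least src_h elements per row.
//
//   rotate_buffer_clockwise_neon<uint8_t>(src, dst, src_w, src_h, src_stride,
//                                         dst_stride, 90);
//   mirror_buffer_neon<uint8_t>(src, dst, src_w, src_h, src_stride, dst_stride,
//                               UHDR_MIRROR_HORIZONTAL);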
template void mirror_buffer_neon<uint8_t>(uint8_t*, uint8_t*, int, int, int, int,
                                          uhdr_mirror_direction_t);
template void mirror_buffer_neon<uint16_t>(uint16_t*, uint16_t*, int, int, int, int,
                                           uhdr_mirror_direction_t);
template void mirror_buffer_neon<uint32_t>(uint32_t*, uint32_t*, int, int, int, int,
                                           uhdr_mirror_direction_t);
template void mirror_buffer_neon<uint64_t>(uint64_t*, uint64_t*, int, int, int, int,
                                           uhdr_mirror_direction_t);

template void rotate_buffer_clockwise_neon<uint8_t>(uint8_t*, uint8_t*, int, int, int, int, int);
template void rotate_buffer_clockwise_neon<uint16_t>(uint16_t*, uint16_t*, int, int, int, int, int);
template void rotate_buffer_clockwise_neon<uint32_t>(uint32_t*, uint32_t*, int, int, int, int, int);
template void rotate_buffer_clockwise_neon<uint64_t>(uint64_t*, uint64_t*, int, int, int, int, int);

}  // namespace ultrahdr