/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "av1/common/cdef_block.h"

void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride,
                                        const uint8_t *src, int sstride,
                                        int width, int height) {
  do {
    const uint8_t *src_ptr = src;
    uint16_t *dst_ptr = dst;

    int w = 0;
    while (width - w >= 16) {
      uint8x16_t row = vld1q_u8(src_ptr + w);
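      // Widen from u8 to u16 by interleaving with a vector of zeros: on a
      // little-endian target, storing { row, 0 } with vst2q_u8 writes each
      // source byte as the low byte of a 16-bit lane.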
      uint8x16x2_t row_u16 = { { row, vdupq_n_u8(0) } };
      vst2q_u8((uint8_t *)(dst_ptr + w), row_u16);

      w += 16;
    }
    if (width - w >= 8) {
      uint8x8_t row = vld1_u8(src_ptr + w);
      vst1q_u16(dst_ptr + w, vmovl_u8(row));
      w += 8;
    }
    if (width - w == 4) {
      for (int i = w; i < w + 4; i++) {
        dst_ptr[i] = src_ptr[i];
      }
    }

    src += sstride;
    dst += dstride;
  } while (--height != 0);
}

void cdef_copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride,
                                         const uint16_t *src, int sstride,
                                         int width, int height) {
  do {
    const uint16_t *src_ptr = src;
    uint16_t *dst_ptr = dst;

    int w = 0;
    while (width - w >= 8) {
      uint16x8_t row = vld1q_u16(src_ptr + w);
      vst1q_u16(dst_ptr + w, row);

      w += 8;
    }
    if (width - w == 4) {
      uint16x4_t row = vld1_u16(src_ptr + w);
      vst1_u16(dst_ptr + w, row);
    }

    src += sstride;
    dst += dstride;
  } while (--height != 0);
}

// partial A is a 16-bit vector of the form:
// [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
// [0  y1 y2 y3 y4 y5 y6 y7].
// This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
// (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
// and const2.
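//
// A scalar sketch of the computation (illustrative only; x[] and y[] stand
// for the lanes of partial A and reversed partial B, and C[] for the
// constants packed into const1 and const2):
//
//   uint32_t cost = 0;
//   for (int i = 0; i < 8; i++) {
//     cost += (x[i] * x[i] + y[i] * y[i]) * C[i];
//   }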
static INLINE uint32x4_t fold_mul_and_sum_neon(int16x8_t partiala,
                                               int16x8_t partialb,
                                               uint32x4_t const1,
                                               uint32x4_t const2) {
  // Reverse partial B.
  // pattern = { 12 13 10 11 8 9 6 7 4 5 2 3 0 1 14 15 }.
  uint8x16_t pattern = vreinterpretq_u8_u64(
      vcombine_u64(vcreate_u64((uint64_t)0x07060908 << 32 | 0x0b0a0d0c),
                   vcreate_u64((uint64_t)0x0f0e0100 << 32 | 0x03020504)));

#if AOM_ARCH_AARCH64
  partialb =
      vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialb), pattern));
#else
  int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialb)),
                     vget_high_s8(vreinterpretq_s8_s16(partialb)) } };
  int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern)));
  int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern)));
  partialb = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi));
#endif

  // Square and add the corresponding x and y values.
  int32x4_t cost_lo = vmull_s16(vget_low_s16(partiala), vget_low_s16(partiala));
  cost_lo = vmlal_s16(cost_lo, vget_low_s16(partialb), vget_low_s16(partialb));
  int32x4_t cost_hi =
      vmull_s16(vget_high_s16(partiala), vget_high_s16(partiala));
  cost_hi =
      vmlal_s16(cost_hi, vget_high_s16(partialb), vget_high_s16(partialb));

  // Multiply by constant.
  uint32x4_t cost = vmulq_u32(vreinterpretq_u32_s32(cost_lo), const1);
  cost = vmlaq_u32(cost, vreinterpretq_u32_s32(cost_hi), const2);
  return cost;
}

// This function computes the cost along directions 4, 5, 6, 7. (4 is diagonal
// down-right, 6 is vertical).
//
// For each direction the lines are shifted so that we can perform a
// basic sum on each vector element. For example, direction 5 is "south by
// southeast", so we need to add the pixels along each line i below:
//
// 0 1 2 3 4 5 6 7
// 0 1 2 3 4 5 6 7
// 8 0 1 2 3 4 5 6
// 8 0 1 2 3 4 5 6
// 9 8 0 1 2 3 4 5
// 9 8 0 1 2 3 4 5
// 10 9 8 0 1 2 3 4
// 10 9 8 0 1 2 3 4
//
// For this to fit nicely in vectors, the lines need to be shifted like so:
//        0 1 2 3 4 5 6 7
//        0 1 2 3 4 5 6 7
//      8 0 1 2 3 4 5 6
//      8 0 1 2 3 4 5 6
//    9 8 0 1 2 3 4 5
//    9 8 0 1 2 3 4 5
// 10 9 8 0 1 2 3 4
// 10 9 8 0 1 2 3 4
//
// In this configuration we can now perform SIMD additions to get the cost
// along direction 5. Since this won't fit into a single 128-bit vector, we use
// two of them to compute each half of the new configuration, and pad the empty
// spaces with zeros. Similar shifting is done for other directions, except
// direction 6 which is straightforward as it's the vertical direction.
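//
// As a sketch of how this maps onto vextq_s16 (illustrative only): with each
// pair of lines k and k+1 pre-summed into tmp, direction 5 accumulates
//
//   partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 3 + k / 2));
//   partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 3 + k / 2));
//
// so each successive pair of lines is shifted by one more lane, matching the
// second diagram above.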
static INLINE uint32x4_t compute_vert_directions_neon(int16x8_t lines[8],
                                                      uint32_t cost[4]) {
  const int16x8_t zero = vdupq_n_s16(0);

  // Partial sums for lines 0 and 1.
  int16x8_t partial4a = vextq_s16(zero, lines[0], 1);
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[1], 2));
  int16x8_t partial4b = vextq_s16(lines[0], zero, 1);
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[1], zero, 2));
  int16x8_t tmp = vaddq_s16(lines[0], lines[1]);
  int16x8_t partial5a = vextq_s16(zero, tmp, 3);
  int16x8_t partial5b = vextq_s16(tmp, zero, 3);
  int16x8_t partial7a = vextq_s16(zero, tmp, 6);
  int16x8_t partial7b = vextq_s16(tmp, zero, 6);
  int16x8_t partial6 = tmp;

  // Partial sums for lines 2 and 3.
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[2], 3));
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[3], 4));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[2], zero, 3));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[3], zero, 4));
  tmp = vaddq_s16(lines[2], lines[3]);
  partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 4));
  partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 4));
  partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 5));
  partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 5));
  partial6 = vaddq_s16(partial6, tmp);

  // Partial sums for lines 4 and 5.
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[4], 5));
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[5], 6));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[4], zero, 5));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[5], zero, 6));
  tmp = vaddq_s16(lines[4], lines[5]);
  partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 5));
  partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 5));
  partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 4));
  partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 4));
  partial6 = vaddq_s16(partial6, tmp);

  // Partial sums for lines 6 and 7.
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[6], 7));
  partial4a = vaddq_s16(partial4a, lines[7]);
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[6], zero, 7));
  tmp = vaddq_s16(lines[6], lines[7]);
  partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 6));
  partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 6));
  partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 3));
  partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 3));
  partial6 = vaddq_s16(partial6, tmp);

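  // The constants below are the weights 840 / n for n = 1, ..., 8 (840 being
  // the least common multiple of 1 through 8), where n is the number of
  // pixels summed into the corresponding partial-sum lane: 840, 420, 280,
  // 210, 168, 140, 120 and 105.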
  uint32x4_t const0 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840),
                   vcreate_u64((uint64_t)210 << 32 | 280)));
  uint32x4_t const1 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168),
                   vcreate_u64((uint64_t)105 << 32 | 120)));
  uint32x4_t const2 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64(0), vcreate_u64((uint64_t)210 << 32 | 420)));
  uint32x4_t const3 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)105 << 32 | 140),
                   vcreate_u64((uint64_t)105 << 32 | 105)));

  // Compute costs in terms of partial sums.
  int32x4_t partial6_s32 =
      vmull_s16(vget_low_s16(partial6), vget_low_s16(partial6));
  partial6_s32 =
      vmlal_s16(partial6_s32, vget_high_s16(partial6), vget_high_s16(partial6));

  uint32x4_t costs[4];
  costs[0] = fold_mul_and_sum_neon(partial4a, partial4b, const0, const1);
  costs[1] = fold_mul_and_sum_neon(partial5a, partial5b, const2, const3);
  costs[2] = vmulq_n_u32(vreinterpretq_u32_s32(partial6_s32), 105);
  costs[3] = fold_mul_and_sum_neon(partial7a, partial7b, const2, const3);

  costs[0] = horizontal_add_4d_u32x4(costs);
  vst1q_u32(cost, costs[0]);
  return costs[0];
}

static INLINE uint32x4_t fold_mul_and_sum_pairwise_neon(int16x8_t partiala,
                                                        int16x8_t partialb,
                                                        int16x8_t partialc,
                                                        uint32x4_t const0) {
  // Reverse partial c.
  // pattern = { 10 11 8 9 6 7 4 5 2 3 0 1 12 13 14 15 }.
  uint8x16_t pattern = vreinterpretq_u8_u64(
      vcombine_u64(vcreate_u64((uint64_t)0x05040706 << 32 | 0x09080b0a),
                   vcreate_u64((uint64_t)0x0f0e0d0c << 32 | 0x01000302)));

#if AOM_ARCH_AARCH64
  partialc =
      vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialc), pattern));
#else
  int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialc)),
                     vget_high_s8(vreinterpretq_s8_s16(partialc)) } };
  int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern)));
  int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern)));
  partialc = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi));
#endif

  int32x4_t partiala_s32 = vpaddlq_s16(partiala);
  int32x4_t partialb_s32 = vpaddlq_s16(partialb);
  int32x4_t partialc_s32 = vpaddlq_s16(partialc);

  partiala_s32 = vmulq_s32(partiala_s32, partiala_s32);
  partialb_s32 = vmulq_s32(partialb_s32, partialb_s32);
  partialc_s32 = vmulq_s32(partialc_s32, partialc_s32);

  partiala_s32 = vaddq_s32(partiala_s32, partialc_s32);

  uint32x4_t cost = vmulq_n_u32(vreinterpretq_u32_s32(partialb_s32), 105);
  cost = vmlaq_u32(cost, vreinterpretq_u32_s32(partiala_s32), const0);
  return cost;
}

// This function computes the cost along directions 0, 1, 2, 3. (0 means
// 45-degree up-right, 2 is horizontal).
//
// For direction 1 and 3 ("east northeast" and "east southeast") the shifted
// lines need three vectors instead of two. For direction 1 for example, we need
// to compute the sums along the line i below:
// 0 0 1 1 2 2 3 3
// 1 1 2 2 3 3 4 4
// 2 2 3 3 4 4 5 5
// 3 3 4 4 5 5 6 6
// 4 4 5 5 6 6 7 7
// 5 5 6 6 7 7 8 8
// 6 6 7 7 8 8 9 9
// 7 7 8 8 9 9 10 10
//
// Which means we need the following configuration:
// 0 0 1 1 2 2 3 3
//     1 1 2 2 3 3 4 4
//         2 2 3 3 4 4 5 5
//             3 3 4 4 5 5 6 6
//                 4 4 5 5 6 6 7 7
//                     5 5 6 6 7 7 8 8
//                         6 6 7 7 8 8 9 9
//                             7 7 8 8 9 9 10 10
//
// Three vectors are needed to compute this, as well as some extra pairwise
// additions.
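//
// Because each row contributes its pixels in equal horizontal pairs, the
// three vectors are first reduced with pairwise adds (vpaddlq_s16) and then
// squared and weighted, as in this scalar sketch (illustrative only; a[],
// b[] and c[] are the pairwise sums of the three vectors, with c reversed,
// and C[] the weights in const0):
//
//   uint32_t cost = 0;
//   for (int i = 0; i < 4; i++) {
//     cost += (a[i] * a[i] + c[i] * c[i]) * C[i] + b[i] * b[i] * 105;
//   }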
static uint32x4_t compute_horiz_directions_neon(int16x8_t lines[8],
                                                uint32_t cost[4]) {
  const int16x8_t zero = vdupq_n_s16(0);

  // Compute diagonal directions (1, 2, 3).
  // Partial sums for lines 0 and 1.
  int16x8_t partial0a = lines[0];
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[1], 7));
  int16x8_t partial0b = vextq_s16(lines[1], zero, 7);
  int16x8_t partial1a = vaddq_s16(lines[0], vextq_s16(zero, lines[1], 6));
  int16x8_t partial1b = vextq_s16(lines[1], zero, 6);
  int16x8_t partial3a = vextq_s16(lines[0], zero, 2);
  partial3a = vaddq_s16(partial3a, vextq_s16(lines[1], zero, 4));
  int16x8_t partial3b = vextq_s16(zero, lines[0], 2);
  partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[1], 4));

  // Partial sums for lines 2 and 3.
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[2], 6));
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[3], 5));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[2], zero, 6));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[3], zero, 5));
  partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[2], 4));
  partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[3], 2));
  partial1b = vaddq_s16(partial1b, vextq_s16(lines[2], zero, 4));
  partial1b = vaddq_s16(partial1b, vextq_s16(lines[3], zero, 2));
  partial3a = vaddq_s16(partial3a, vextq_s16(lines[2], zero, 6));
  partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[2], 6));
  partial3b = vaddq_s16(partial3b, lines[3]);

  // Partial sums for lines 4 and 5.
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[4], 4));
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[5], 3));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[4], zero, 4));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[5], zero, 3));
  partial1b = vaddq_s16(partial1b, lines[4]);
  partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[5], 6));
  int16x8_t partial1c = vextq_s16(lines[5], zero, 6);
  partial3b = vaddq_s16(partial3b, vextq_s16(lines[4], zero, 2));
  partial3b = vaddq_s16(partial3b, vextq_s16(lines[5], zero, 4));
  int16x8_t partial3c = vextq_s16(zero, lines[4], 2);
  partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[5], 4));

  // Partial sums for lines 6 and 7.
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[6], 2));
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[7], 1));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[6], zero, 2));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[7], zero, 1));
  partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[6], 4));
  partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[7], 2));
  partial1c = vaddq_s16(partial1c, vextq_s16(lines[6], zero, 4));
  partial1c = vaddq_s16(partial1c, vextq_s16(lines[7], zero, 2));
  partial3b = vaddq_s16(partial3b, vextq_s16(lines[6], zero, 6));
  partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[6], 6));
  partial3c = vaddq_s16(partial3c, lines[7]);

  // Special case for direction 2 as it's just a sum along each line.
  int16x8_t lines03[4] = { lines[0], lines[1], lines[2], lines[3] };
  int16x8_t lines47[4] = { lines[4], lines[5], lines[6], lines[7] };
  int32x4_t partial2a = horizontal_add_4d_s16x8(lines03);
  int32x4_t partial2b = horizontal_add_4d_s16x8(lines47);

  uint32x4_t partial2a_u32 =
      vreinterpretq_u32_s32(vmulq_s32(partial2a, partial2a));
  uint32x4_t partial2b_u32 =
      vreinterpretq_u32_s32(vmulq_s32(partial2b, partial2b));

  uint32x4_t const0 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840),
                   vcreate_u64((uint64_t)210 << 32 | 280)));
  uint32x4_t const1 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168),
                   vcreate_u64((uint64_t)105 << 32 | 120)));
  uint32x4_t const2 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)210 << 32 | 420),
                   vcreate_u64((uint64_t)105 << 32 | 140)));

  uint32x4_t costs[4];
  costs[0] = fold_mul_and_sum_neon(partial0a, partial0b, const0, const1);
  costs[1] =
      fold_mul_and_sum_pairwise_neon(partial1a, partial1b, partial1c, const2);
  costs[2] = vaddq_u32(partial2a_u32, partial2b_u32);
  costs[2] = vmulq_n_u32(costs[2], 105);
  costs[3] =
      fold_mul_and_sum_pairwise_neon(partial3c, partial3b, partial3a, const2);

  costs[0] = horizontal_add_4d_u32x4(costs);
  vst1q_u32(cost, costs[0]);
  return costs[0];
}

int cdef_find_dir_neon(const uint16_t *img, int stride, int32_t *var,
                       int coeff_shift) {
  uint32_t cost[8];
  uint32_t best_cost = 0;
  int best_dir = 0;
  int16x8_t lines[8];
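  // Load each of the 8 rows, downshift by coeff_shift and subtract 128 so
  // that the partial sums along any direction fit comfortably in int16.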
  for (int i = 0; i < 8; i++) {
    uint16x8_t s = vld1q_u16(&img[i * stride]);
    lines[i] = vreinterpretq_s16_u16(
        vsubq_u16(vshlq_u16(s, vdupq_n_s16(-coeff_shift)), vdupq_n_u16(128)));
  }

  // Compute "mostly vertical" directions.
  uint32x4_t cost47 = compute_vert_directions_neon(lines, cost + 4);

  // Compute "mostly horizontal" directions.
  uint32x4_t cost03 = compute_horiz_directions_neon(lines, cost);

  // Find max cost as well as its index to get best_dir.
  // The max cost needs to be propagated in the whole vector to find its
  // position in the original cost vectors cost03 and cost47.
  uint32x4_t cost07 = vmaxq_u32(cost03, cost47);
#if AOM_ARCH_AARCH64
  best_cost = vmaxvq_u32(cost07);
  uint32x4_t max_cost = vdupq_n_u32(best_cost);
  uint8x16x2_t costs = { { vreinterpretq_u8_u32(vceqq_u32(max_cost, cost03)),
                           vreinterpretq_u8_u32(
                               vceqq_u32(max_cost, cost47)) } };
  // idx = { 28, 24, 20, 16, 12, 8, 4, 0 };
  uint8x8_t idx = vreinterpret_u8_u64(vcreate_u64(0x0004080c1014181cULL));
  // Get the lowest 8 bits of each 32-bit element and reverse them.
  uint8x8_t tbl = vqtbl2_u8(costs, idx);
  uint64_t a = vget_lane_u64(vreinterpret_u64_u8(tbl), 0);
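  // Each byte of 'a' is 0xff where the corresponding direction's cost equals
  // the maximum, ordered so that direction 0 lands in the most significant
  // byte; counting leading zero bits then yields 8 * best_dir.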
  best_dir = aom_clzll(a) >> 3;
#else
  uint32x2_t cost64 = vpmax_u32(vget_low_u32(cost07), vget_high_u32(cost07));
  cost64 = vpmax_u32(cost64, cost64);
  uint32x4_t max_cost = vcombine_u32(cost64, cost64);
  best_cost = vget_lane_u32(cost64, 0);
  uint16x8_t costs = vcombine_u16(vmovn_u32(vceqq_u32(max_cost, cost03)),
                                  vmovn_u32(vceqq_u32(max_cost, cost47)));
  uint8x8_t idx =
      vand_u8(vmovn_u16(costs),
              vreinterpret_u8_u64(vcreate_u64(0x8040201008040201ULL)));
  int sum = horizontal_add_u8x8(idx);
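  // 'sum' has bit i set for every direction i whose cost equals the maximum;
  // sum ^ (sum - 1) keeps only the bits up to and including the lowest set
  // bit, so get_msb() below returns the smallest such direction index.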
  best_dir = get_msb(sum ^ (sum - 1));
#endif

  // Difference between the optimal variance and the variance along the
  // orthogonal direction. Again, the sum(x^2) terms cancel out.
  *var = best_cost - cost[(best_dir + 4) & 7];
  // We'd normally divide by 840, but dividing by 1024 is close enough
  // for what we're going to do with this.
  *var >>= 10;
  return best_dir;
}

void cdef_find_dir_dual_neon(const uint16_t *img1, const uint16_t *img2,
                             int stride, int32_t *var_out_1st,
                             int32_t *var_out_2nd, int coeff_shift,
                             int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
  // Process first 8x8.
  *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift);

  // Process second 8x8.
  *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift);
}

// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
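//
// A scalar sketch (illustrative only):
//
//   int constrain(int a, int b, int threshold, int adjdamp) {
//     int diff = a - b;
//     int adiff = abs(diff);
//     int clipped = AOMMIN(adiff, AOMMAX(0, threshold - (adiff >> adjdamp)));
//     return diff < 0 ? -clipped : clipped;
//   }
//
// In the vector version below, the saturating subtract vqsubq_u16 implements
// the max(0, ...) clamp.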
static INLINE int16x8_t constrain16(uint16x8_t a, uint16x8_t b,
                                    unsigned int threshold, int adjdamp) {
  uint16x8_t diff = vabdq_u16(a, b);
  const uint16x8_t a_gt_b = vcgtq_u16(a, b);
  const uint16x8_t s = vqsubq_u16(vdupq_n_u16(threshold),
                                  vshlq_u16(diff, vdupq_n_s16(-adjdamp)));
  const int16x8_t clip = vreinterpretq_s16_u16(vminq_u16(diff, s));
  return vbslq_s16(a_gt_b, clip, vnegq_s16(clip));
}

static INLINE void primary_filter(uint16x8_t s, uint16x8_t tap[4],
                                  const int *pri_taps, int pri_strength,
                                  int pri_damping, int16x8_t *sum) {
  // Near taps
  int16x8_t n0 = constrain16(tap[0], s, pri_strength, pri_damping);
  int16x8_t n1 = constrain16(tap[1], s, pri_strength, pri_damping);
  // sum += pri_taps[0] * (n0 + n1)
  n0 = vaddq_s16(n0, n1);
  *sum = vmlaq_n_s16(*sum, n0, pri_taps[0]);

  // Far taps
  int16x8_t f0 = constrain16(tap[2], s, pri_strength, pri_damping);
  int16x8_t f1 = constrain16(tap[3], s, pri_strength, pri_damping);
  // sum += pri_taps[1] * (f0 + f1)
  f0 = vaddq_s16(f0, f1);
  *sum = vmlaq_n_s16(*sum, f0, pri_taps[1]);
}

static INLINE void secondary_filter(uint16x8_t s, uint16x8_t tap[8],
                                    const int *sec_taps, int sec_strength,
                                    int sec_damping, int16x8_t *sum) {
  // Near taps
  int16x8_t s0 = constrain16(tap[0], s, sec_strength, sec_damping);
  int16x8_t s1 = constrain16(tap[1], s, sec_strength, sec_damping);
  int16x8_t s2 = constrain16(tap[2], s, sec_strength, sec_damping);
  int16x8_t s3 = constrain16(tap[3], s, sec_strength, sec_damping);

  // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
  s0 = vaddq_s16(s0, s1);
  s2 = vaddq_s16(s2, s3);
  s0 = vaddq_s16(s0, s2);
  *sum = vmlaq_n_s16(*sum, s0, sec_taps[0]);

  // Far taps
  s0 = constrain16(tap[4], s, sec_strength, sec_damping);
  s1 = constrain16(tap[5], s, sec_strength, sec_damping);
  s2 = constrain16(tap[6], s, sec_strength, sec_damping);
  s3 = constrain16(tap[7], s, sec_strength, sec_damping);

  // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
  s0 = vaddq_s16(s0, s1);
  s2 = vaddq_s16(s2, s3);
  s0 = vaddq_s16(s0, s2);
  *sum = vmlaq_n_s16(*sum, s0, sec_taps[1]);
}

void cdef_filter_8_0_neon(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  uint16x8_t max, min;
  const uint16x8_t cdef_large_value_mask =
      vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE));
  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
  const int *sec_taps = cdef_sec_taps;

  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }
  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);
      max = min = s;

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = vld1q_u16(in + po1);
      pri_src[1] = vld1q_u16(in - po1);

      // Primary far taps
      pri_src[2] = vld1q_u16(in + po2);
      pri_src[3] = vld1q_u16(in - po2);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      // The source is 16 bits, however, we only really care about the lower
      // 8 bits. The upper 8 bits contain the "large" flag. After the final
      // primary max has been calculated, zero out the upper 8 bits. Use this
      // to find the "16 bit" max.
      uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]),
                                     vreinterpretq_u8_u16(pri_src[1]));
      uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]),
                                     vreinterpretq_u8_u16(pri_src[3]));
      pri_max0 = vmaxq_u8(pri_max0, pri_max1);
      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0),
                                     cdef_large_value_mask));

      uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]);
      pri_min0 = vminq_u16(pri_min0, pri_min1);
      min = vminq_u16(min, pri_min0);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = vld1q_u16(in + s1o1);
      sec_src[1] = vld1q_u16(in - s1o1);
      sec_src[2] = vld1q_u16(in + s2o1);
      sec_src[3] = vld1q_u16(in - s2o1);

      // Secondary far taps
      sec_src[4] = vld1q_u16(in + s1o2);
      sec_src[5] = vld1q_u16(in - s1o2);
      sec_src[6] = vld1q_u16(in + s2o2);
      sec_src[7] = vld1q_u16(in - s2o2);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // The source is 16 bits, however, we only really care about the lower
      // 8 bits. The upper 8 bits contain the "large" flag. After the final
      // secondary max has been calculated, zero out the upper 8 bits. Use
      // this to find the "16 bit" max.
      uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]),
                                     vreinterpretq_u8_u16(sec_src[1]));
      uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]),
                                     vreinterpretq_u8_u16(sec_src[3]));
      uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]),
                                     vreinterpretq_u8_u16(sec_src[5]));
      uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]),
                                     vreinterpretq_u8_u16(sec_src[7]));
      sec_max0 = vmaxq_u8(sec_max0, sec_max1);
      sec_max2 = vmaxq_u8(sec_max2, sec_max3);
      sec_max0 = vmaxq_u8(sec_max0, sec_max2);
      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0),
                                     cdef_large_value_mask));

      uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
      sec_min0 = vminq_u16(sec_min0, sec_min1);
      sec_min2 = vminq_u16(sec_min2, sec_min3);
      sec_min0 = vminq_u16(sec_min0, sec_min2);
      min = vminq_u16(min, sec_min0);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
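      // vcltq_s16 sets lanes to all-ones (-1) where sum < 0, so the add below
      // applies the "- (sum < 0)" correction; vrsraq_n_s16 then computes
      // s + ((sum + 8) >> 4), with the +8 rounding built into the shift.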
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      res_s16 = vminq_s16(vmaxq_s16(res_s16, vreinterpretq_s16_u16(min)),
                          vreinterpretq_s16_u16(max));

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      vst1_u8(dst8, res_u8);

      in += CDEF_BSTRIDE;
      dst8 += dstride;
    } while (--h != 0);
  } else {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
      max = min = s;

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);

      // Primary far taps
      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      // The source is 16 bits, however, we only really care about the lower
      // 8 bits. The upper 8 bits contain the "large" flag. After the final
      // primary max has been calculated, zero out the upper 8 bits. Use this
      // to find the "16 bit" max.
      uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]),
                                     vreinterpretq_u8_u16(pri_src[1]));
      uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]),
                                     vreinterpretq_u8_u16(pri_src[3]));
      pri_max0 = vmaxq_u8(pri_max0, pri_max1);
      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0),
                                     cdef_large_value_mask));

      uint16x8_t pri_min1 = vminq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_min2 = vminq_u16(pri_src[2], pri_src[3]);
      pri_min1 = vminq_u16(pri_min1, pri_min2);
      min = vminq_u16(min, pri_min1);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);

      // Secondary far taps
      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // The source is 16 bits, however, we only really care about the lower
      // 8 bits. The upper 8 bits contain the "large" flag. After the final
      // secondary max has been calculated, zero out the upper 8 bits. Use
      // this to find the "16 bit" max.
      uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]),
                                     vreinterpretq_u8_u16(sec_src[1]));
      uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]),
                                     vreinterpretq_u8_u16(sec_src[3]));
      uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]),
                                     vreinterpretq_u8_u16(sec_src[5]));
      uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]),
                                     vreinterpretq_u8_u16(sec_src[7]));
      sec_max0 = vmaxq_u8(sec_max0, sec_max1);
      sec_max2 = vmaxq_u8(sec_max2, sec_max3);
      sec_max0 = vmaxq_u8(sec_max0, sec_max2);
      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0),
                                     cdef_large_value_mask));

      uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
      sec_min0 = vminq_u16(sec_min0, sec_min1);
      sec_min2 = vminq_u16(sec_min2, sec_min3);
      sec_min0 = vminq_u16(sec_min0, sec_min2);
      min = vminq_u16(min, sec_min0);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      res_s16 = vminq_s16(vmaxq_s16(res_s16, vreinterpretq_s16_u16(min)),
                          vreinterpretq_s16_u16(max));

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      store_u8x4_strided_x2(dst8, dstride, res_u8);

      in += 2 * CDEF_BSTRIDE;
      dst8 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_8_1_neon(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  (void)sec_strength;
  (void)sec_damping;

  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];

  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }

  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);

      uint16x8_t tap[4];

      // Primary near taps
      tap[0] = vld1q_u16(in + po1);
      tap[1] = vld1q_u16(in - po1);

      // Primary far taps
      tap[2] = vld1q_u16(in + po2);
      tap[3] = vld1q_u16(in - po2);

      primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      vst1_u8(dst8, res_u8);

      in += CDEF_BSTRIDE;
      dst8 += dstride;
    } while (--h != 0);

  } else {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);

      // Primary far taps
      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      store_u8x4_strided_x2(dst8, dstride, res_u8);

      in += 2 * CDEF_BSTRIDE;
      dst8 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_8_2_neon(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  (void)pri_strength;
  (void)pri_damping;
  (void)coeff_shift;

  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  const int *sec_taps = cdef_sec_taps;

  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = vld1q_u16(in + s1o1);
      sec_src[1] = vld1q_u16(in - s1o1);
      sec_src[2] = vld1q_u16(in + s2o1);
      sec_src[3] = vld1q_u16(in - s2o1);

      // Secondary far taps
      sec_src[4] = vld1q_u16(in + s1o2);
      sec_src[5] = vld1q_u16(in - s1o2);
      sec_src[6] = vld1q_u16(in + s2o2);
      sec_src[7] = vld1q_u16(in - s2o2);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      vst1_u8(dst8, res_u8);

      in += CDEF_BSTRIDE;
      dst8 += dstride;
    } while (--h != 0);
  } else {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);

      // Secondary far taps
      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      store_u8x4_strided_x2(dst8, dstride, res_u8);

      in += 2 * CDEF_BSTRIDE;
      dst8 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_8_3_neon(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  (void)pri_strength;
  (void)sec_strength;
  (void)dir;
  (void)pri_damping;
  (void)sec_damping;
  (void)coeff_shift;
  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      const uint16x8_t s = vld1q_u16(in);
      const uint8x8_t res = vqmovn_u16(s);
      vst1_u8(dst8, res);

      in += CDEF_BSTRIDE;
      dst8 += dstride;
    } while (--h != 0);
  } else {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
      const uint8x8_t res = vqmovn_u16(s);
      store_u8x4_strided_x2(dst8, dstride, res);

      in += 2 * CDEF_BSTRIDE;
      dst8 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_16_0_neon(void *dest, int dstride, const uint16_t *in,
                           int pri_strength, int sec_strength, int dir,
                           int pri_damping, int sec_damping, int coeff_shift,
                           int block_width, int block_height) {
  uint16x8_t max, min;
  const uint16x8_t cdef_large_value_mask =
      vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE));
  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
  const int *sec_taps = cdef_sec_taps;

  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }
  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);
      max = min = s;

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = vld1q_u16(in + po1);
      pri_src[1] = vld1q_u16(in - po1);

      // Primary far taps
      pri_src[2] = vld1q_u16(in + po2);
      pri_src[3] = vld1q_u16(in - po2);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]);
      pri_min0 = vminq_u16(pri_min0, pri_min1);
      min = vminq_u16(min, pri_min0);

      /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
      pri_src[0] = vandq_u16(pri_src[0], cdef_large_value_mask);
      pri_src[1] = vandq_u16(pri_src[1], cdef_large_value_mask);
      pri_src[2] = vandq_u16(pri_src[2], cdef_large_value_mask);
      pri_src[3] = vandq_u16(pri_src[3], cdef_large_value_mask);

      uint16x8_t pri_max0 = vmaxq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_max1 = vmaxq_u16(pri_src[2], pri_src[3]);
      pri_max0 = vmaxq_u16(pri_max0, pri_max1);
      max = vmaxq_u16(max, pri_max0);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = vld1q_u16(in + s1o1);
      sec_src[1] = vld1q_u16(in - s1o1);
      sec_src[2] = vld1q_u16(in + s2o1);
      sec_src[3] = vld1q_u16(in - s2o1);

      // Secondary far taps
      sec_src[4] = vld1q_u16(in + s1o2);
      sec_src[5] = vld1q_u16(in - s1o2);
      sec_src[6] = vld1q_u16(in + s2o2);
      sec_src[7] = vld1q_u16(in - s2o2);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
      sec_min0 = vminq_u16(sec_min0, sec_min1);
      sec_min2 = vminq_u16(sec_min2, sec_min3);
      sec_min0 = vminq_u16(sec_min0, sec_min2);
      min = vminq_u16(min, sec_min0);

      /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
      sec_src[0] = vandq_u16(sec_src[0], cdef_large_value_mask);
      sec_src[1] = vandq_u16(sec_src[1], cdef_large_value_mask);
      sec_src[2] = vandq_u16(sec_src[2], cdef_large_value_mask);
      sec_src[3] = vandq_u16(sec_src[3], cdef_large_value_mask);
      sec_src[4] = vandq_u16(sec_src[4], cdef_large_value_mask);
      sec_src[5] = vandq_u16(sec_src[5], cdef_large_value_mask);
      sec_src[6] = vandq_u16(sec_src[6], cdef_large_value_mask);
      sec_src[7] = vandq_u16(sec_src[7], cdef_large_value_mask);

      uint16x8_t sec_max0 = vmaxq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_max1 = vmaxq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_max2 = vmaxq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_max3 = vmaxq_u16(sec_src[6], sec_src[7]);
      sec_max0 = vmaxq_u16(sec_max0, sec_max1);
      sec_max2 = vmaxq_u16(sec_max2, sec_max3);
      sec_max0 = vmaxq_u16(sec_max0, sec_max2);
      max = vmaxq_u16(max, sec_max0);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)),
                      vreinterpretq_s16_u16(max));

      vst1q_u16(dst16, vreinterpretq_u16_s16(res));

      in += CDEF_BSTRIDE;
      dst16 += dstride;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
      max = min = s;

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);

      // Primary far taps
      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      uint16x8_t pri_min1 = vminq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_min2 = vminq_u16(pri_src[2], pri_src[3]);
      pri_min1 = vminq_u16(pri_min1, pri_min2);
      min = vminq_u16(min, pri_min1);

      /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
      pri_src[0] = vandq_u16(pri_src[0], cdef_large_value_mask);
      pri_src[1] = vandq_u16(pri_src[1], cdef_large_value_mask);
      pri_src[2] = vandq_u16(pri_src[2], cdef_large_value_mask);
      pri_src[3] = vandq_u16(pri_src[3], cdef_large_value_mask);
      uint16x8_t pri_max0 = vmaxq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_max1 = vmaxq_u16(pri_src[2], pri_src[3]);
      pri_max0 = vmaxq_u16(pri_max0, pri_max1);
      max = vmaxq_u16(max, pri_max0);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);

      // Secondary far taps
      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
      sec_min0 = vminq_u16(sec_min0, sec_min1);
      sec_min2 = vminq_u16(sec_min2, sec_min3);
      sec_min0 = vminq_u16(sec_min0, sec_min2);
      min = vminq_u16(min, sec_min0);

      /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
      sec_src[0] = vandq_u16(sec_src[0], cdef_large_value_mask);
      sec_src[1] = vandq_u16(sec_src[1], cdef_large_value_mask);
      sec_src[2] = vandq_u16(sec_src[2], cdef_large_value_mask);
      sec_src[3] = vandq_u16(sec_src[3], cdef_large_value_mask);
      sec_src[4] = vandq_u16(sec_src[4], cdef_large_value_mask);
      sec_src[5] = vandq_u16(sec_src[5], cdef_large_value_mask);
      sec_src[6] = vandq_u16(sec_src[6], cdef_large_value_mask);
      sec_src[7] = vandq_u16(sec_src[7], cdef_large_value_mask);

      uint16x8_t sec_max0 = vmaxq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_max1 = vmaxq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_max2 = vmaxq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_max3 = vmaxq_u16(sec_src[6], sec_src[7]);
      sec_max0 = vmaxq_u16(sec_max0, sec_max1);
      sec_max2 = vmaxq_u16(sec_max2, sec_max3);
      sec_max0 = vmaxq_u16(sec_max0, sec_max2);
      max = vmaxq_u16(max, sec_max0);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)),
                      vreinterpretq_s16_u16(max));

      store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));

      in += 2 * CDEF_BSTRIDE;
      dst16 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_16_1_neon(void *dest, int dstride, const uint16_t *in,
                           int pri_strength, int sec_strength, int dir,
                           int pri_damping, int sec_damping, int coeff_shift,
                           int block_width, int block_height) {
  (void)sec_strength;
  (void)sec_damping;

  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];

  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }

  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);

      uint16x8_t tap[4];

      // Primary near taps
      tap[0] = vld1q_u16(in + po1);
      tap[1] = vld1q_u16(in - po1);

      // Primary far taps
      tap[2] = vld1q_u16(in + po2);
      tap[3] = vld1q_u16(in - po2);

      primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      vst1q_u16(dst16, vreinterpretq_u16_s16(res));

      in += CDEF_BSTRIDE;
      dst16 += dstride;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);

      // Primary far taps
      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));

      in += 2 * CDEF_BSTRIDE;
      dst16 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_16_2_neon(void *dest, int dstride, const uint16_t *in,
                           int pri_strength, int sec_strength, int dir,
                           int pri_damping, int sec_damping, int coeff_shift,
                           int block_width, int block_height) {
  (void)pri_strength;
  (void)pri_damping;
  (void)coeff_shift;

  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  const int *sec_taps = cdef_sec_taps;

  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = vld1q_u16(in + s1o1);
      sec_src[1] = vld1q_u16(in - s1o1);
      sec_src[2] = vld1q_u16(in + s2o1);
      sec_src[3] = vld1q_u16(in - s2o1);

      // Secondary far taps
      sec_src[4] = vld1q_u16(in + s1o2);
      sec_src[5] = vld1q_u16(in - s1o2);
      sec_src[6] = vld1q_u16(in + s2o2);
      sec_src[7] = vld1q_u16(in - s2o2);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      vst1q_u16(dst16, vreinterpretq_u16_s16(res));

      in += CDEF_BSTRIDE;
      dst16 += dstride;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);

      // Secondary far taps
      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));

      in += 2 * CDEF_BSTRIDE;
      dst16 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_16_3_neon(void *dest, int dstride, const uint16_t *in,
                           int pri_strength, int sec_strength, int dir,
                           int pri_damping, int sec_damping, int coeff_shift,
                           int block_width, int block_height) {
  (void)pri_strength;
  (void)sec_strength;
  (void)dir;
  (void)pri_damping;
  (void)sec_damping;
  (void)coeff_shift;
  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      const uint16x8_t s = vld1q_u16(in);
      vst1q_u16(dst16, s);

      in += CDEF_BSTRIDE;
      dst16 += dstride;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
      store_u16x4_strided_x2(dst16, dstride, s);

      in += 2 * CDEF_BSTRIDE;
      dst16 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}