/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "av1/common/cdef_block.h"

void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride,
                                        const uint8_t *src, int sstride,
                                        int width, int height) {
  do {
    const uint8_t *src_ptr = src;
    uint16_t *dst_ptr = dst;

    int w = 0;
    while (width - w >= 16) {
      uint8x16_t row = vld1q_u8(src_ptr + w);
      // Interleaving the row with a vector of zeros zero-extends each byte
      // to 16 bits (little-endian lane order).
      uint8x16x2_t row_u16 = { { row, vdupq_n_u8(0) } };
      vst2q_u8((uint8_t *)(dst_ptr + w), row_u16);

      w += 16;
    }
    if (width - w >= 8) {
      uint8x8_t row = vld1_u8(src_ptr + w);
      vst1q_u16(dst_ptr + w, vmovl_u8(row));
      w += 8;
    }
    if (width - w == 4) {
      for (int i = w; i < w + 4; i++) {
        dst_ptr[i] = src_ptr[i];
      }
    }

    src += sstride;
    dst += dstride;
  } while (--height != 0);
}

void cdef_copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride,
                                         const uint16_t *src, int sstride,
                                         int width, int height) {
  do {
    const uint16_t *src_ptr = src;
    uint16_t *dst_ptr = dst;

    int w = 0;
    while (width - w >= 8) {
      uint16x8_t row = vld1q_u16(src_ptr + w);
      vst1q_u16(dst_ptr + w, row);

      w += 8;
    }
    if (width - w == 4) {
      uint16x4_t row = vld1_u16(src_ptr + w);
      vst1_u16(dst_ptr + w, row);
    }

    src += sstride;
    dst += dstride;
  } while (--height != 0);
}

// partial A is a 16-bit vector of the form:
// [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
// [0  y1 y2 y3 y4 y5 y6 y7].
// This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
// (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
// and const2.
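// Reversing partial B first puts each y_i in the lane holding the matching
// x_i, so the squared terms can then be accumulated element-wise with plain
// vector multiply-adds.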
static INLINE uint32x4_t fold_mul_and_sum_neon(int16x8_t partiala,
                                               int16x8_t partialb,
                                               uint32x4_t const1,
                                               uint32x4_t const2) {
  // Reverse partial B.
  // pattern = { 12 13 10 11 8 9 6 7 4 5 2 3 0 1 14 15 }.
  uint8x16_t pattern = vreinterpretq_u8_u64(
      vcombine_u64(vcreate_u64((uint64_t)0x07060908 << 32 | 0x0b0a0d0c),
                   vcreate_u64((uint64_t)0x0f0e0100 << 32 | 0x03020504)));

#if AOM_ARCH_AARCH64
  partialb =
      vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialb), pattern));
#else
  int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialb)),
                     vget_high_s8(vreinterpretq_s8_s16(partialb)) } };
  int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern)));
  int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern)));
  partialb = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi));
#endif

  // Square and add the corresponding x and y values.
  int32x4_t cost_lo = vmull_s16(vget_low_s16(partiala), vget_low_s16(partiala));
  cost_lo = vmlal_s16(cost_lo, vget_low_s16(partialb), vget_low_s16(partialb));
  int32x4_t cost_hi =
      vmull_s16(vget_high_s16(partiala), vget_high_s16(partiala));
  cost_hi =
      vmlal_s16(cost_hi, vget_high_s16(partialb), vget_high_s16(partialb));

  // Multiply by constant.
  uint32x4_t cost = vmulq_u32(vreinterpretq_u32_s32(cost_lo), const1);
  cost = vmlaq_u32(cost, vreinterpretq_u32_s32(cost_hi), const2);
  return cost;
}

// This function computes the cost along directions 4, 5, 6, 7. (4 is diagonal
// down-right, 6 is vertical).
//
// For each direction the lines are shifted so that we can perform a
// basic sum on each vector element. For example, direction 5 is "south by
// southeast", so we need to add the pixels along each line i below:
//
// 0  1 2 3 4 5 6 7
// 0  1 2 3 4 5 6 7
// 8  0 1 2 3 4 5 6
// 8  0 1 2 3 4 5 6
// 9  8 0 1 2 3 4 5
// 9  8 0 1 2 3 4 5
// 10 9 8 0 1 2 3 4
// 10 9 8 0 1 2 3 4
//
// For this to fit nicely in vectors, the lines need to be shifted like so:
//        0 1 2 3 4 5 6 7
//        0 1 2 3 4 5 6 7
//      8 0 1 2 3 4 5 6
//      8 0 1 2 3 4 5 6
//    9 8 0 1 2 3 4 5
//    9 8 0 1 2 3 4 5
// 10 9 8 0 1 2 3 4
// 10 9 8 0 1 2 3 4
//
// In this configuration we can now perform SIMD additions to get the cost
// along direction 5. Since this won't fit into a single 128-bit vector, we use
// two of them to compute each half of the new configuration, and pad the empty
// spaces with zeros. Similar shifting is done for other directions, except
// direction 6 which is straightforward as it's the vertical direction.
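// In the code below, partialXa and partialXb hold the two zero-padded halves
// of the shifted configuration for direction X; fold_mul_and_sum_neon later
// squares and merges the two halves into a single cost vector.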
static INLINE uint32x4_t compute_vert_directions_neon(int16x8_t lines[8],
                                                      uint32_t cost[4]) {
  const int16x8_t zero = vdupq_n_s16(0);

  // Partial sums for lines 0 and 1.
  int16x8_t partial4a = vextq_s16(zero, lines[0], 1);
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[1], 2));
  int16x8_t partial4b = vextq_s16(lines[0], zero, 1);
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[1], zero, 2));
  int16x8_t tmp = vaddq_s16(lines[0], lines[1]);
  int16x8_t partial5a = vextq_s16(zero, tmp, 3);
  int16x8_t partial5b = vextq_s16(tmp, zero, 3);
  int16x8_t partial7a = vextq_s16(zero, tmp, 6);
  int16x8_t partial7b = vextq_s16(tmp, zero, 6);
  int16x8_t partial6 = tmp;

  // Partial sums for lines 2 and 3.
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[2], 3));
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[3], 4));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[2], zero, 3));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[3], zero, 4));
  tmp = vaddq_s16(lines[2], lines[3]);
  partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 4));
  partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 4));
  partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 5));
  partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 5));
  partial6 = vaddq_s16(partial6, tmp);

  // Partial sums for lines 4 and 5.
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[4], 5));
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[5], 6));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[4], zero, 5));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[5], zero, 6));
  tmp = vaddq_s16(lines[4], lines[5]);
  partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 5));
  partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 5));
  partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 4));
  partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 4));
  partial6 = vaddq_s16(partial6, tmp);

  // Partial sums for lines 6 and 7.
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[6], 7));
  partial4a = vaddq_s16(partial4a, lines[7]);
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[6], zero, 7));
  tmp = vaddq_s16(lines[6], lines[7]);
  partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 6));
  partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 6));
  partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 3));
  partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 3));
  partial6 = vaddq_s16(partial6, tmp);

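  // The constants used here and in fold_mul_and_sum_neon are 840 / n for
  // n = 1..8 (840, 420, 280, 210, 168, 140, 120, 105), i.e. the reciprocal
  // of each line length scaled by the least common multiple of 1..8, so each
  // squared partial sum is normalized by the number of pixels it covers.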
  uint32x4_t const0 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840),
                   vcreate_u64((uint64_t)210 << 32 | 280)));
  uint32x4_t const1 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168),
                   vcreate_u64((uint64_t)105 << 32 | 120)));
  uint32x4_t const2 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64(0), vcreate_u64((uint64_t)210 << 32 | 420)));
  uint32x4_t const3 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)105 << 32 | 140),
                   vcreate_u64((uint64_t)105 << 32 | 105)));

  // Compute costs in terms of partial sums.
  int32x4_t partial6_s32 =
      vmull_s16(vget_low_s16(partial6), vget_low_s16(partial6));
  partial6_s32 =
      vmlal_s16(partial6_s32, vget_high_s16(partial6), vget_high_s16(partial6));

  uint32x4_t costs[4];
  costs[0] = fold_mul_and_sum_neon(partial4a, partial4b, const0, const1);
  costs[1] = fold_mul_and_sum_neon(partial5a, partial5b, const2, const3);
  costs[2] = vmulq_n_u32(vreinterpretq_u32_s32(partial6_s32), 105);
  costs[3] = fold_mul_and_sum_neon(partial7a, partial7b, const2, const3);

  costs[0] = horizontal_add_4d_u32x4(costs);
  vst1q_u32(cost, costs[0]);
  return costs[0];
}

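// Computes the cost for direction 1 or 3, where each tap appears twice on a
// line: vpaddlq_s16 folds the duplicated pairs, partial C is reversed to line
// up with partial A, and the squared sums are weighted by const0. The middle
// vector, partial B, covers full-length lines and so is scaled by 105
// (840 / 8).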
static INLINE uint32x4_t fold_mul_and_sum_pairwise_neon(int16x8_t partiala,
                                                        int16x8_t partialb,
                                                        int16x8_t partialc,
                                                        uint32x4_t const0) {
  // Reverse partial c.
  // pattern = { 10 11 8 9 6 7 4 5 2 3 0 1 12 13 14 15 }.
  uint8x16_t pattern = vreinterpretq_u8_u64(
      vcombine_u64(vcreate_u64((uint64_t)0x05040706 << 32 | 0x09080b0a),
                   vcreate_u64((uint64_t)0x0f0e0d0c << 32 | 0x01000302)));

#if AOM_ARCH_AARCH64
  partialc =
      vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialc), pattern));
#else
  int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialc)),
                     vget_high_s8(vreinterpretq_s8_s16(partialc)) } };
  int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern)));
  int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern)));
  partialc = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi));
#endif

  int32x4_t partiala_s32 = vpaddlq_s16(partiala);
  int32x4_t partialb_s32 = vpaddlq_s16(partialb);
  int32x4_t partialc_s32 = vpaddlq_s16(partialc);

  partiala_s32 = vmulq_s32(partiala_s32, partiala_s32);
  partialb_s32 = vmulq_s32(partialb_s32, partialb_s32);
  partialc_s32 = vmulq_s32(partialc_s32, partialc_s32);

  partiala_s32 = vaddq_s32(partiala_s32, partialc_s32);

  uint32x4_t cost = vmulq_n_u32(vreinterpretq_u32_s32(partialb_s32), 105);
  cost = vmlaq_u32(cost, vreinterpretq_u32_s32(partiala_s32), const0);
  return cost;
}

// This function computes the cost along directions 0, 1, 2, 3. (0 means
// 45-degree up-right, 2 is horizontal).
//
// For directions 1 and 3 ("east northeast" and "east southeast") the shifted
// lines need three vectors instead of two. For direction 1, for example, we
// need to compute the sums along the line i below:
// 0 0 1 1 2 2 3  3
// 1 1 2 2 3 3 4  4
// 2 2 3 3 4 4 5  5
// 3 3 4 4 5 5 6  6
// 4 4 5 5 6 6 7  7
// 5 5 6 6 7 7 8  8
// 6 6 7 7 8 8 9  9
// 7 7 8 8 9 9 10 10
//
// Which means we need the following configuration:
// 0 0 1 1 2 2 3 3
//     1 1 2 2 3 3 4 4
//         2 2 3 3 4 4 5 5
//             3 3 4 4 5 5 6 6
//                 4 4 5 5 6 6 7 7
//                     5 5 6 6 7 7 8 8
//                         6 6 7 7 8 8 9 9
//                             7 7 8 8 9 9 10 10
//
// Three vectors are needed to compute this, as well as some extra pairwise
// additions.
static uint32x4_t compute_horiz_directions_neon(int16x8_t lines[8],
                                                uint32_t cost[4]) {
  const int16x8_t zero = vdupq_n_s16(0);

  // Compute diagonal directions (0, 1, 3).
  // Partial sums for lines 0 and 1.
  int16x8_t partial0a = lines[0];
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[1], 7));
  int16x8_t partial0b = vextq_s16(lines[1], zero, 7);
  int16x8_t partial1a = vaddq_s16(lines[0], vextq_s16(zero, lines[1], 6));
  int16x8_t partial1b = vextq_s16(lines[1], zero, 6);
  int16x8_t partial3a = vextq_s16(lines[0], zero, 2);
  partial3a = vaddq_s16(partial3a, vextq_s16(lines[1], zero, 4));
  int16x8_t partial3b = vextq_s16(zero, lines[0], 2);
  partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[1], 4));

  // Partial sums for lines 2 and 3.
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[2], 6));
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[3], 5));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[2], zero, 6));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[3], zero, 5));
  partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[2], 4));
  partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[3], 2));
  partial1b = vaddq_s16(partial1b, vextq_s16(lines[2], zero, 4));
  partial1b = vaddq_s16(partial1b, vextq_s16(lines[3], zero, 2));
  partial3a = vaddq_s16(partial3a, vextq_s16(lines[2], zero, 6));
  partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[2], 6));
  partial3b = vaddq_s16(partial3b, lines[3]);

  // Partial sums for lines 4 and 5.
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[4], 4));
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[5], 3));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[4], zero, 4));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[5], zero, 3));
  partial1b = vaddq_s16(partial1b, lines[4]);
  partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[5], 6));
  int16x8_t partial1c = vextq_s16(lines[5], zero, 6);
  partial3b = vaddq_s16(partial3b, vextq_s16(lines[4], zero, 2));
  partial3b = vaddq_s16(partial3b, vextq_s16(lines[5], zero, 4));
  int16x8_t partial3c = vextq_s16(zero, lines[4], 2);
  partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[5], 4));

  // Partial sums for lines 6 and 7.
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[6], 2));
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[7], 1));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[6], zero, 2));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[7], zero, 1));
  partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[6], 4));
  partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[7], 2));
  partial1c = vaddq_s16(partial1c, vextq_s16(lines[6], zero, 4));
  partial1c = vaddq_s16(partial1c, vextq_s16(lines[7], zero, 2));
  partial3b = vaddq_s16(partial3b, vextq_s16(lines[6], zero, 6));
  partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[6], 6));
  partial3c = vaddq_s16(partial3c, lines[7]);

  // Special case for direction 2 as it's just a sum along each line.
  int16x8_t lines03[4] = { lines[0], lines[1], lines[2], lines[3] };
  int16x8_t lines47[4] = { lines[4], lines[5], lines[6], lines[7] };
  int32x4_t partial2a = horizontal_add_4d_s16x8(lines03);
  int32x4_t partial2b = horizontal_add_4d_s16x8(lines47);

  uint32x4_t partial2a_u32 =
      vreinterpretq_u32_s32(vmulq_s32(partial2a, partial2a));
  uint32x4_t partial2b_u32 =
      vreinterpretq_u32_s32(vmulq_s32(partial2b, partial2b));

  uint32x4_t const0 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840),
                   vcreate_u64((uint64_t)210 << 32 | 280)));
  uint32x4_t const1 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168),
                   vcreate_u64((uint64_t)105 << 32 | 120)));
  uint32x4_t const2 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)210 << 32 | 420),
                   vcreate_u64((uint64_t)105 << 32 | 140)));

  uint32x4_t costs[4];
  costs[0] = fold_mul_and_sum_neon(partial0a, partial0b, const0, const1);
  costs[1] =
      fold_mul_and_sum_pairwise_neon(partial1a, partial1b, partial1c, const2);
  costs[2] = vaddq_u32(partial2a_u32, partial2b_u32);
  costs[2] = vmulq_n_u32(costs[2], 105);
  costs[3] =
      fold_mul_and_sum_pairwise_neon(partial3c, partial3b, partial3a, const2);

  costs[0] = horizontal_add_4d_u32x4(costs);
  vst1q_u32(cost, costs[0]);
  return costs[0];
}

int cdef_find_dir_neon(const uint16_t *img, int stride, int32_t *var,
                       int coeff_shift) {
  uint32_t cost[8];
  uint32_t best_cost = 0;
  int best_dir = 0;
  int16x8_t lines[8];
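  // Load the 8x8 block, shift the samples down by coeff_shift and subtract
  // 128 to centre them on zero before computing the direction sums.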
  for (int i = 0; i < 8; i++) {
    uint16x8_t s = vld1q_u16(&img[i * stride]);
    lines[i] = vreinterpretq_s16_u16(
        vsubq_u16(vshlq_u16(s, vdupq_n_s16(-coeff_shift)), vdupq_n_u16(128)));
  }

  // Compute "mostly vertical" directions.
  uint32x4_t cost47 = compute_vert_directions_neon(lines, cost + 4);

  // Compute "mostly horizontal" directions.
  uint32x4_t cost03 = compute_horiz_directions_neon(lines, cost);

  // Find max cost as well as its index to get best_dir.
  // The max cost needs to be propagated in the whole vector to find its
  // position in the original cost vectors cost03 and cost47.
  uint32x4_t cost07 = vmaxq_u32(cost03, cost47);
#if AOM_ARCH_AARCH64
  best_cost = vmaxvq_u32(cost07);
  uint32x4_t max_cost = vdupq_n_u32(best_cost);
  uint8x16x2_t costs = { { vreinterpretq_u8_u32(vceqq_u32(max_cost, cost03)),
                           vreinterpretq_u8_u32(
                               vceqq_u32(max_cost, cost47)) } };
  // idx = { 28, 24, 20, 16, 12, 8, 4, 0 };
  uint8x8_t idx = vreinterpret_u8_u64(vcreate_u64(0x0004080c1014181cULL));
  // Get the lowest 8 bits of each 32-bit element and reverse them.
  uint8x8_t tbl = vqtbl2_u8(costs, idx);
  uint64_t a = vget_lane_u64(vreinterpret_u64_u8(tbl), 0);
  best_dir = aom_clzll(a) >> 3;
#else
  uint32x2_t cost64 = vpmax_u32(vget_low_u32(cost07), vget_high_u32(cost07));
  cost64 = vpmax_u32(cost64, cost64);
  uint32x4_t max_cost = vcombine_u32(cost64, cost64);
  best_cost = vget_lane_u32(cost64, 0);
  uint16x8_t costs = vcombine_u16(vmovn_u32(vceqq_u32(max_cost, cost03)),
                                  vmovn_u32(vceqq_u32(max_cost, cost47)));
  uint8x8_t idx =
      vand_u8(vmovn_u16(costs),
              vreinterpret_u8_u64(vcreate_u64(0x8040201008040201ULL)));
  int sum = horizontal_add_u8x8(idx);
  best_dir = get_msb(sum ^ (sum - 1));
#endif

  // Difference between the optimal variance and the variance along the
  // orthogonal direction. Again, the sum(x^2) terms cancel out.
  *var = best_cost - cost[(best_dir + 4) & 7];
  // We'd normally divide by 840, but dividing by 1024 is close enough
  // for what we're going to do with this.
  *var >>= 10;
  return best_dir;
}

void cdef_find_dir_dual_neon(const uint16_t *img1, const uint16_t *img2,
                             int stride, int32_t *var_out_1st,
                             int32_t *var_out_2nd, int coeff_shift,
                             int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
  // Process first 8x8.
  *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift);

  // Process second 8x8.
  *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift);
}

// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
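// vabdq_u16 gives abs(a-b); the saturating subtract vqsubq_u16 clamps at
// zero and so implements the max(0, ...) term; vbslq_s16 then selects the
// clipped value or its negation based on the sign of a-b.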
static INLINE int16x8_t constrain16(uint16x8_t a, uint16x8_t b,
                                    unsigned int threshold, int adjdamp) {
  uint16x8_t diff = vabdq_u16(a, b);
  const uint16x8_t a_gt_b = vcgtq_u16(a, b);
  const uint16x8_t s = vqsubq_u16(vdupq_n_u16(threshold),
                                  vshlq_u16(diff, vdupq_n_s16(-adjdamp)));
  const int16x8_t clip = vreinterpretq_s16_u16(vminq_u16(diff, s));
  return vbslq_s16(a_gt_b, clip, vnegq_s16(clip));
}

static INLINE void primary_filter(uint16x8_t s, uint16x8_t tap[4],
                                  const int *pri_taps, int pri_strength,
                                  int pri_damping, int16x8_t *sum) {
  // Near taps
  int16x8_t n0 = constrain16(tap[0], s, pri_strength, pri_damping);
  int16x8_t n1 = constrain16(tap[1], s, pri_strength, pri_damping);
  // sum += pri_taps[0] * (n0 + n1)
  n0 = vaddq_s16(n0, n1);
  *sum = vmlaq_n_s16(*sum, n0, pri_taps[0]);

  // Far taps
  int16x8_t f0 = constrain16(tap[2], s, pri_strength, pri_damping);
  int16x8_t f1 = constrain16(tap[3], s, pri_strength, pri_damping);
  // sum += pri_taps[1] * (f0 + f1)
  f0 = vaddq_s16(f0, f1);
  *sum = vmlaq_n_s16(*sum, f0, pri_taps[1]);
}

static INLINE void secondary_filter(uint16x8_t s, uint16x8_t tap[8],
                                    const int *sec_taps, int sec_strength,
                                    int sec_damping, int16x8_t *sum) {
  // Near taps
  int16x8_t s0 = constrain16(tap[0], s, sec_strength, sec_damping);
  int16x8_t s1 = constrain16(tap[1], s, sec_strength, sec_damping);
  int16x8_t s2 = constrain16(tap[2], s, sec_strength, sec_damping);
  int16x8_t s3 = constrain16(tap[3], s, sec_strength, sec_damping);

  // sum += sec_taps[0] * (s0 + s1 + s2 + s3)
  s0 = vaddq_s16(s0, s1);
  s2 = vaddq_s16(s2, s3);
  s0 = vaddq_s16(s0, s2);
  *sum = vmlaq_n_s16(*sum, s0, sec_taps[0]);

  // Far taps
  s0 = constrain16(tap[4], s, sec_strength, sec_damping);
  s1 = constrain16(tap[5], s, sec_strength, sec_damping);
  s2 = constrain16(tap[6], s, sec_strength, sec_damping);
  s3 = constrain16(tap[7], s, sec_strength, sec_damping);

  // sum += sec_taps[1] * (s0 + s1 + s2 + s3)
  s0 = vaddq_s16(s0, s1);
  s2 = vaddq_s16(s2, s3);
  s0 = vaddq_s16(s0, s2);
  *sum = vmlaq_n_s16(*sum, s0, sec_taps[1]);
}

void cdef_filter_8_0_neon(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  uint16x8_t max, min;
  const uint16x8_t cdef_large_value_mask =
      vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE));
  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
  const int *sec_taps = cdef_sec_taps;

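  // Reduce the damping by log2(strength): constrain16 computes
  // max(0, strength - (diff >> damping)), so this keeps the taper of the
  // constraint proportional to the filter strength.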
  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }
  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);
      max = min = s;

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = vld1q_u16(in + po1);
      pri_src[1] = vld1q_u16(in - po1);

      // Primary far taps
      pri_src[2] = vld1q_u16(in + po2);
      pri_src[3] = vld1q_u16(in - po2);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      // The source is 16 bits, but we only really care about the lower
      // 8 bits. The upper 8 bits contain the "large" flag. After the final
      // primary max has been calculated, zero out the upper 8 bits and use
      // the result to find the "16 bit" max.
      uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]),
                                     vreinterpretq_u8_u16(pri_src[1]));
      uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]),
                                     vreinterpretq_u8_u16(pri_src[3]));
      pri_max0 = vmaxq_u8(pri_max0, pri_max1);
      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0),
                                     cdef_large_value_mask));

      uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]);
      pri_min0 = vminq_u16(pri_min0, pri_min1);
      min = vminq_u16(min, pri_min0);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = vld1q_u16(in + s1o1);
      sec_src[1] = vld1q_u16(in - s1o1);
      sec_src[2] = vld1q_u16(in + s2o1);
      sec_src[3] = vld1q_u16(in - s2o1);

      // Secondary far taps
      sec_src[4] = vld1q_u16(in + s1o2);
      sec_src[5] = vld1q_u16(in - s1o2);
      sec_src[6] = vld1q_u16(in + s2o2);
      sec_src[7] = vld1q_u16(in - s2o2);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // The source is 16 bits, but we only really care about the lower
      // 8 bits. The upper 8 bits contain the "large" flag. After the final
      // secondary max has been calculated, zero out the upper 8 bits and use
      // the result to find the "16 bit" max.
      uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]),
                                     vreinterpretq_u8_u16(sec_src[1]));
      uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]),
                                     vreinterpretq_u8_u16(sec_src[3]));
      uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]),
                                     vreinterpretq_u8_u16(sec_src[5]));
      uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]),
                                     vreinterpretq_u8_u16(sec_src[7]));
      sec_max0 = vmaxq_u8(sec_max0, sec_max1);
      sec_max2 = vmaxq_u8(sec_max2, sec_max3);
      sec_max0 = vmaxq_u8(sec_max0, sec_max2);
      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0),
                                     cdef_large_value_mask));

      uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
      sec_min0 = vminq_u16(sec_min0, sec_min1);
      sec_min2 = vminq_u16(sec_min2, sec_min3);
      sec_min0 = vminq_u16(sec_min0, sec_min2);
      min = vminq_u16(min, sec_min0);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
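      // vcltq_s16 yields all-ones (-1) in negative lanes, so the vaddq_s16
      // below subtracts 1 from negative sums; vrsraq_n_s16 then adds the
      // rounded (sum + 8) >> 4 to the source pixels.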
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      res_s16 = vminq_s16(vmaxq_s16(res_s16, vreinterpretq_s16_u16(min)),
                          vreinterpretq_s16_u16(max));

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      vst1_u8(dst8, res_u8);

      in += CDEF_BSTRIDE;
      dst8 += dstride;
    } while (--h != 0);
  } else {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
      max = min = s;

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);

      // Primary far taps
      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      // The source is 16 bits, but we only really care about the lower
      // 8 bits. The upper 8 bits contain the "large" flag. After the final
      // primary max has been calculated, zero out the upper 8 bits and use
      // the result to find the "16 bit" max.
      uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]),
                                     vreinterpretq_u8_u16(pri_src[1]));
      uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]),
                                     vreinterpretq_u8_u16(pri_src[3]));
      pri_max0 = vmaxq_u8(pri_max0, pri_max1);
      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0),
                                     cdef_large_value_mask));

      uint16x8_t pri_min1 = vminq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_min2 = vminq_u16(pri_src[2], pri_src[3]);
      pri_min1 = vminq_u16(pri_min1, pri_min2);
      min = vminq_u16(min, pri_min1);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);

      // Secondary far taps
      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // The source is 16 bits, but we only really care about the lower
      // 8 bits. The upper 8 bits contain the "large" flag. After the final
      // secondary max has been calculated, zero out the upper 8 bits and use
      // the result to find the "16 bit" max.
      uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]),
                                     vreinterpretq_u8_u16(sec_src[1]));
      uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]),
                                     vreinterpretq_u8_u16(sec_src[3]));
      uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]),
                                     vreinterpretq_u8_u16(sec_src[5]));
      uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]),
                                     vreinterpretq_u8_u16(sec_src[7]));
      sec_max0 = vmaxq_u8(sec_max0, sec_max1);
      sec_max2 = vmaxq_u8(sec_max2, sec_max3);
      sec_max0 = vmaxq_u8(sec_max0, sec_max2);
      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0),
                                     cdef_large_value_mask));

      uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
      sec_min0 = vminq_u16(sec_min0, sec_min1);
      sec_min2 = vminq_u16(sec_min2, sec_min3);
      sec_min0 = vminq_u16(sec_min0, sec_min2);
      min = vminq_u16(min, sec_min0);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      res_s16 = vminq_s16(vmaxq_s16(res_s16, vreinterpretq_s16_u16(min)),
                          vreinterpretq_s16_u16(max));

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      store_u8x4_strided_x2(dst8, dstride, res_u8);

      in += 2 * CDEF_BSTRIDE;
      dst8 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_8_1_neon(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  (void)sec_strength;
  (void)sec_damping;

  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];

  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }

  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);

      uint16x8_t tap[4];

      // Primary near taps
      tap[0] = vld1q_u16(in + po1);
      tap[1] = vld1q_u16(in - po1);

      // Primary far taps
      tap[2] = vld1q_u16(in + po2);
      tap[3] = vld1q_u16(in - po2);

      primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      vst1_u8(dst8, res_u8);

      in += CDEF_BSTRIDE;
      dst8 += dstride;
    } while (--h != 0);

  } else {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);

      // Primary far taps
      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      store_u8x4_strided_x2(dst8, dstride, res_u8);

      in += 2 * CDEF_BSTRIDE;
      dst8 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_8_2_neon(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  (void)pri_strength;
  (void)pri_damping;
  (void)coeff_shift;

  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  const int *sec_taps = cdef_sec_taps;

  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = vld1q_u16(in + s1o1);
      sec_src[1] = vld1q_u16(in - s1o1);
      sec_src[2] = vld1q_u16(in + s2o1);
      sec_src[3] = vld1q_u16(in - s2o1);

      // Secondary far taps
      sec_src[4] = vld1q_u16(in + s1o2);
      sec_src[5] = vld1q_u16(in - s1o2);
      sec_src[6] = vld1q_u16(in + s2o2);
      sec_src[7] = vld1q_u16(in - s2o2);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      vst1_u8(dst8, res_u8);

      in += CDEF_BSTRIDE;
      dst8 += dstride;
    } while (--h != 0);
  } else {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);

      // Secondary far taps
      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      store_u8x4_strided_x2(dst8, dstride, res_u8);

      in += 2 * CDEF_BSTRIDE;
      dst8 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_8_3_neon(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  (void)pri_strength;
  (void)sec_strength;
  (void)dir;
  (void)pri_damping;
  (void)sec_damping;
  (void)coeff_shift;
  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      const uint16x8_t s = vld1q_u16(in);
      const uint8x8_t res = vqmovn_u16(s);
      vst1_u8(dst8, res);

      in += CDEF_BSTRIDE;
      dst8 += dstride;
    } while (--h != 0);
  } else {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
      const uint8x8_t res = vqmovn_u16(s);
      store_u8x4_strided_x2(dst8, dstride, res);

      in += 2 * CDEF_BSTRIDE;
      dst8 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_16_0_neon(void *dest, int dstride, const uint16_t *in,
                           int pri_strength, int sec_strength, int dir,
                           int pri_damping, int sec_damping, int coeff_shift,
                           int block_width, int block_height) {
  uint16x8_t max, min;
  const uint16x8_t cdef_large_value_mask =
      vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE));
  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
  const int *sec_taps = cdef_sec_taps;

  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }
  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);
      max = min = s;

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = vld1q_u16(in + po1);
      pri_src[1] = vld1q_u16(in - po1);

      // Primary far taps
      pri_src[2] = vld1q_u16(in + po2);
      pri_src[3] = vld1q_u16(in - po2);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]);
      pri_min0 = vminq_u16(pri_min0, pri_min1);
      min = vminq_u16(min, pri_min0);

      /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
      pri_src[0] = vandq_u16(pri_src[0], cdef_large_value_mask);
      pri_src[1] = vandq_u16(pri_src[1], cdef_large_value_mask);
      pri_src[2] = vandq_u16(pri_src[2], cdef_large_value_mask);
      pri_src[3] = vandq_u16(pri_src[3], cdef_large_value_mask);

      uint16x8_t pri_max0 = vmaxq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_max1 = vmaxq_u16(pri_src[2], pri_src[3]);
      pri_max0 = vmaxq_u16(pri_max0, pri_max1);
      max = vmaxq_u16(max, pri_max0);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = vld1q_u16(in + s1o1);
      sec_src[1] = vld1q_u16(in - s1o1);
      sec_src[2] = vld1q_u16(in + s2o1);
      sec_src[3] = vld1q_u16(in - s2o1);

      // Secondary far taps
      sec_src[4] = vld1q_u16(in + s1o2);
      sec_src[5] = vld1q_u16(in - s1o2);
      sec_src[6] = vld1q_u16(in + s2o2);
      sec_src[7] = vld1q_u16(in - s2o2);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
      sec_min0 = vminq_u16(sec_min0, sec_min1);
      sec_min2 = vminq_u16(sec_min2, sec_min3);
      sec_min0 = vminq_u16(sec_min0, sec_min2);
      min = vminq_u16(min, sec_min0);

      /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
      sec_src[0] = vandq_u16(sec_src[0], cdef_large_value_mask);
      sec_src[1] = vandq_u16(sec_src[1], cdef_large_value_mask);
      sec_src[2] = vandq_u16(sec_src[2], cdef_large_value_mask);
      sec_src[3] = vandq_u16(sec_src[3], cdef_large_value_mask);
      sec_src[4] = vandq_u16(sec_src[4], cdef_large_value_mask);
      sec_src[5] = vandq_u16(sec_src[5], cdef_large_value_mask);
      sec_src[6] = vandq_u16(sec_src[6], cdef_large_value_mask);
      sec_src[7] = vandq_u16(sec_src[7], cdef_large_value_mask);

      uint16x8_t sec_max0 = vmaxq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_max1 = vmaxq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_max2 = vmaxq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_max3 = vmaxq_u16(sec_src[6], sec_src[7]);
      sec_max0 = vmaxq_u16(sec_max0, sec_max1);
      sec_max2 = vmaxq_u16(sec_max2, sec_max3);
      sec_max0 = vmaxq_u16(sec_max0, sec_max2);
      max = vmaxq_u16(max, sec_max0);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)),
                      vreinterpretq_s16_u16(max));

      vst1q_u16(dst16, vreinterpretq_u16_s16(res));

      in += CDEF_BSTRIDE;
      dst16 += dstride;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
      max = min = s;

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);

      // Primary far taps
      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      uint16x8_t pri_min1 = vminq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_min2 = vminq_u16(pri_src[2], pri_src[3]);
      pri_min1 = vminq_u16(pri_min1, pri_min2);
      min = vminq_u16(min, pri_min1);

      /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
      pri_src[0] = vandq_u16(pri_src[0], cdef_large_value_mask);
      pri_src[1] = vandq_u16(pri_src[1], cdef_large_value_mask);
      pri_src[2] = vandq_u16(pri_src[2], cdef_large_value_mask);
      pri_src[3] = vandq_u16(pri_src[3], cdef_large_value_mask);
      uint16x8_t pri_max0 = vmaxq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_max1 = vmaxq_u16(pri_src[2], pri_src[3]);
      pri_max0 = vmaxq_u16(pri_max0, pri_max1);
      max = vmaxq_u16(max, pri_max0);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);

      // Secondary far taps
      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
      sec_min0 = vminq_u16(sec_min0, sec_min1);
      sec_min2 = vminq_u16(sec_min2, sec_min3);
      sec_min0 = vminq_u16(sec_min0, sec_min2);
      min = vminq_u16(min, sec_min0);

      /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
      sec_src[0] = vandq_u16(sec_src[0], cdef_large_value_mask);
      sec_src[1] = vandq_u16(sec_src[1], cdef_large_value_mask);
      sec_src[2] = vandq_u16(sec_src[2], cdef_large_value_mask);
      sec_src[3] = vandq_u16(sec_src[3], cdef_large_value_mask);
      sec_src[4] = vandq_u16(sec_src[4], cdef_large_value_mask);
      sec_src[5] = vandq_u16(sec_src[5], cdef_large_value_mask);
      sec_src[6] = vandq_u16(sec_src[6], cdef_large_value_mask);
      sec_src[7] = vandq_u16(sec_src[7], cdef_large_value_mask);

      uint16x8_t sec_max0 = vmaxq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_max1 = vmaxq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_max2 = vmaxq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_max3 = vmaxq_u16(sec_src[6], sec_src[7]);
      sec_max0 = vmaxq_u16(sec_max0, sec_max1);
      sec_max2 = vmaxq_u16(sec_max2, sec_max3);
      sec_max0 = vmaxq_u16(sec_max0, sec_max2);
      max = vmaxq_u16(max, sec_max0);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)),
                      vreinterpretq_s16_u16(max));

      store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));

      in += 2 * CDEF_BSTRIDE;
      dst16 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_16_1_neon(void *dest, int dstride, const uint16_t *in,
                           int pri_strength, int sec_strength, int dir,
                           int pri_damping, int sec_damping, int coeff_shift,
                           int block_width, int block_height) {
  (void)sec_strength;
  (void)sec_damping;

  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];

  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }

  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);

      uint16x8_t tap[4];

      // Primary near taps
      tap[0] = vld1q_u16(in + po1);
      tap[1] = vld1q_u16(in - po1);

      // Primary far taps
      tap[2] = vld1q_u16(in + po2);
      tap[3] = vld1q_u16(in - po2);

      primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      vst1q_u16(dst16, vreinterpretq_u16_s16(res));

      in += CDEF_BSTRIDE;
      dst16 += dstride;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);

      // Primary far taps
      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));

      in += 2 * CDEF_BSTRIDE;
      dst16 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_16_2_neon(void *dest, int dstride, const uint16_t *in,
                           int pri_strength, int sec_strength, int dir,
                           int pri_damping, int sec_damping, int coeff_shift,
                           int block_width, int block_height) {
  (void)pri_strength;
  (void)pri_damping;
  (void)coeff_shift;

  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  const int *sec_taps = cdef_sec_taps;

  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = vld1q_u16(in + s1o1);
      sec_src[1] = vld1q_u16(in - s1o1);
      sec_src[2] = vld1q_u16(in + s2o1);
      sec_src[3] = vld1q_u16(in - s2o1);

      // Secondary far taps
      sec_src[4] = vld1q_u16(in + s1o2);
      sec_src[5] = vld1q_u16(in - s1o2);
      sec_src[6] = vld1q_u16(in + s2o2);
      sec_src[7] = vld1q_u16(in - s2o2);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      vst1q_u16(dst16, vreinterpretq_u16_s16(res));

      in += CDEF_BSTRIDE;
      dst16 += dstride;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);

      // Secondary far taps
      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));

      in += 2 * CDEF_BSTRIDE;
      dst16 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_16_3_neon(void *dest, int dstride, const uint16_t *in,
                           int pri_strength, int sec_strength, int dir,
                           int pri_damping, int sec_damping, int coeff_shift,
                           int block_width, int block_height) {
  (void)pri_strength;
  (void)sec_strength;
  (void)dir;
  (void)pri_damping;
  (void)sec_damping;
  (void)coeff_shift;
  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      const uint16x8_t s = vld1q_u16(in);
      vst1q_u16(dst16, s);

      in += CDEF_BSTRIDE;
      dst16 += dstride;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
      store_u16x4_strided_x2(dst16, dstride, s);

      in += 2 * CDEF_BSTRIDE;
      dst16 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}