1 /*
2 * Copyright © 2018, VideoLAN and dav1d authors
3 * Copyright © 2018, Two Orioles, LLC
4 * Copyright © 2024, Luca Barbato
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice, this
11 * list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
21 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 #include "config.h"
30
31 #undef NDEBUG
32 #include <assert.h>
33
34 #include <stdlib.h>
35
36 #include "common/attributes.h"
37 #include "common/intops.h"
38
39 #include "src/ppc/dav1d_types.h"
40 #include "src/ppc/loopfilter.h"
41
42 #if BITDEPTH == 8
43
// Load four consecutive rows starting at dst, advancing dst by stridea
// between rows. Declares idx##0..idx##3; each full 16-byte load only
// uses its low bytes around the edge (p1 p0 q0 q1 for the 4px filter,
// wider for the 6/8/16px callers). dst is left pointing at the last
// loaded row.
#define LOAD4_H(idx) \
    u8x16 idx##0 = vec_xl(0, dst); /* p1_0 p0_0 q0_0 q1_0 */ \
    dst += stridea; \
    u8x16 idx##1 = vec_xl(0, dst); /* p1_1 p0_1 q0_1 q1_1 */ \
    dst += stridea; \
    u8x16 idx##2 = vec_xl(0, dst); /* p1_2 p0_2 q0_2 q1_2 */ \
    dst += stridea; \
    u8x16 idx##3 = vec_xl(0, dst); /* p1_3 p0_3 q0_3 q1_3 */ \

// As LOAD4_H, but also interleave rows pairwise.
// return idx##_01 and idx##_23
#define LOAD4_H_SINGLE(idx) \
    LOAD4_H(idx) \
    \
    u8x16 idx##_01 = vec_mergeh(idx##0, idx##1); /* p1_0 p1_1 p0_0 p0_1 q0_0 q0_1 q1_0 q1_1 */ \
    u8x16 idx##_23 = vec_mergeh(idx##2, idx##3); /* p1_2 p1_3 p0_2 p0_3 q0_2 q0_3 q1_2 q1_3 */
59
60
// Declare r##h/r##l as the elementwise sums of the u16 high/low halves
// of a and b (halves previously created with UNPACK_16 or a prior sum).
#define DECLARE_ADD_16HL(r, a, b) \
    u16x8 r##h = vec_add(a##h, b##h); \
    u16x8 r##l = vec_add(a##l, b##l);

// Same as DECLARE_ADD_16HL but assigns to already-declared r##h/r##l.
#define ADD_16HL(r, a, b) \
    r##h = vec_add(a##h, b##h); \
    r##l = vec_add(a##l, b##l);

// Round-and-normalize a tap sum: v = (v + 4) >> 3 (flat8 output).
#define ADD_AND_SHIFT4(v) \
    v##h = vec_sr(vec_add(v##h, v4u16), v3u16); \
    v##l = vec_sr(vec_add(v##l, v4u16), v3u16);
// Round-and-normalize a tap sum: v = (v + 8) >> 4 (flat16 output).
#define ADD_AND_SHIFT8(v) \
    v##h = vec_sr(vec_add(v##h, v8u16), v4u16); \
    v##l = vec_sr(vec_add(v##l, v8u16), v4u16);

// Narrow the filtered u16 halves o##v##h/o##v##l back to u8 and keep
// the filtered bytes only where mask m is set, original v elsewhere.
#define PACK_AND_SEL(v, m) \
    vec_sel(v, vec_pack(o##v##h, o##v##l), m)

// Declare v##h/v##l, the u8 vector v widened to two u16x8 halves.
#define UNPACK_16(v) \
    u16x8 v##h = u8h_to_u16(v); \
    u16x8 v##l = u8l_to_u16(v);
82
83
// In-place narrow (4-tap) filter on p1/p0/q0/q1 (u8x16, in scope).
// Requires in scope: fm (filter mask), max_a_p1p0_q1q0, H (hev
// threshold), s (0x80 splat), v1u8, and the u16 halves p0h/p0l/q0h/q0l.
// Pixels are flipped into the signed domain (xor 0x80) so that
// vec_adds/vec_subs saturate like the reference clamping; the final
// xor flips the results back.
#define APPLY_4 \
    b8x16 hev = vec_cmpgt(max_a_p1p0_q1q0, H); /* high edge variance */ \
    \
    i8x16 ps1 = (i8x16)vec_xor(p1, s); \
    i8x16 ps0 = (i8x16)vec_xor(p0, s); \
    i8x16 qs0 = (i8x16)vec_xor(q0, s); \
    i8x16 qs1 = (i8x16)vec_xor(q1, s); \
    i8x16 f0 = vec_and(vec_subs(ps1, qs1), hev); /* p1-q1 term, hev only */ \
    i16x8 q0sh = (i16x8)q0h; \
    i16x8 q0sl = (i16x8)q0l; \
    i16x8 p0sh = (i16x8)p0h; \
    i16x8 p0sl = (i16x8)p0l; \
    i16x8 f0h = i8h_to_i16(f0); \
    i16x8 f0l = i8l_to_i16(f0); \
    i16x8 d0h = vec_sub(q0sh, p0sh); \
    i16x8 d0l = vec_sub(q0sl, p0sl); \
    u8x16 v3u8 = vec_splat_u8(3); \
    i16x8 d0h_2 = vec_add(d0h, d0h); \
    i16x8 d0l_2 = vec_add(d0l, d0l); \
    u8x16 v4u8 = vec_splat_u8(4); \
    i16x8 f0_d0h = vec_add(d0h, f0h); \
    i16x8 f0_d0l = vec_add(d0l, f0l); \
    i16x8 fh = vec_add(d0h_2, f0_d0h); /* f = 3*(q0-p0) + f0 */ \
    i16x8 fl = vec_add(d0l_2, f0_d0l); \
    i8x16 f = vec_packs(fh, fl); /* saturating narrow clips to i8 */ \
    i8x16 f1 = vec_adds(f, (i8x16)v4u8); \
    i8x16 f2 = vec_adds(f, (i8x16)v3u8); \
    f1 = vec_sra(f1, v3u8); /* f1 = (f + 4) >> 3 */ \
    f2 = vec_sra(f2, v3u8); /* f2 = (f + 3) >> 3 */ \
    f1 = vec_and(f1, fm); \
    f2 = vec_and(f2, fm); \
    i8x16 f3 = vec_adds(f1, (i8x16)v1u8); \
    b8x16 m3 = vec_and(~hev, (b8x16)fm); /* outer taps only where !hev */ \
    f3 = vec_sra(f3, v1u8); /* f3 = (f1 + 1) >> 1 */ \
    f3 = vec_and(f3, m3); \
    i8x16 op0s = vec_adds(ps0, f2); \
    i8x16 oq0s = vec_subs(qs0, f1); \
    i8x16 oq1s = vec_subs(qs1, f3); \
    i8x16 op1s = vec_adds(ps1, f3); \
    p0 = (u8x16)vec_xor(op0s, s); /* back to the unsigned domain */ \
    q0 = (u8x16)vec_xor(oq0s, s); \
    q1 = (u8x16)vec_xor(oq1s, s); \
    p1 = (u8x16)vec_xor(op1s, s);
127
// In-place flat8 (wide) filter: replaces p2..q2 with smoothed values,
// each a tap sum normalized as (sum + 4) >> 3 via ADD_AND_SHIFT4.
// Partial pairwise sums of the u16 halves of p3..q3 (in scope from
// UNPACK_16) are shared between outputs; results are committed only
// where the apply_8 mask is set.
#define APPLY_8 \
    DECLARE_ADD_16HL(p1p0, p1, p0) \
    DECLARE_ADD_16HL(p2q0, p2, q0) \
    DECLARE_ADD_16HL(q1q2, q1, q2) \
    DECLARE_ADD_16HL(p3p3, p3, p3) \
    DECLARE_ADD_16HL(q0q3, q0, q3) \
    DECLARE_ADD_16HL(p3p2, p3, p2) \
    DECLARE_ADD_16HL(p1q1, p1, q1) \
    DECLARE_ADD_16HL(p3p0, p3, p0) \
    DECLARE_ADD_16HL(p0q2, p0, q2) \
    DECLARE_ADD_16HL(q1q3, q1, q3) \
    DECLARE_ADD_16HL(q3q3, q3, q3) \
    DECLARE_ADD_16HL(q0q1q2q3, q0q3, q1q2) \
    DECLARE_ADD_16HL(p2p1p0q0, p1p0, p2q0) \
    DECLARE_ADD_16HL(p3p3p3p2, p3p3, p3p2) \
    DECLARE_ADD_16HL(p3p3p1q1, p3p3, p1q1) \
    DECLARE_ADD_16HL(p3p0q1q2, p3p0, q1q2) \
    DECLARE_ADD_16HL(p1p0q1q3, p1p0, q1q3) \
    DECLARE_ADD_16HL(p0q2q3q3, p0q2, q3q3) \
    \
    /* each output is an 8-pixel weighted sum around the edge */ \
    DECLARE_ADD_16HL(op2, p3p3p3p2, p2p1p0q0) \
    DECLARE_ADD_16HL(op1, p3p3p1q1, p2p1p0q0) \
    DECLARE_ADD_16HL(op0, p3p0q1q2, p2p1p0q0) \
    DECLARE_ADD_16HL(oq0, p2p1p0q0, q0q1q2q3) \
    DECLARE_ADD_16HL(oq1, p1p0q1q3, q0q1q2q3) \
    DECLARE_ADD_16HL(oq2, p0q2q3q3, q0q1q2q3) \
    \
    ADD_AND_SHIFT4(op2) \
    ADD_AND_SHIFT4(op1) \
    ADD_AND_SHIFT4(op0) \
    ADD_AND_SHIFT4(oq0) \
    ADD_AND_SHIFT4(oq1) \
    ADD_AND_SHIFT4(oq2) \
    \
    p2 = PACK_AND_SEL(p2, apply_8); \
    p1 = PACK_AND_SEL(p1, apply_8); \
    p0 = PACK_AND_SEL(p0, apply_8); \
    q0 = PACK_AND_SEL(q0, apply_8); \
    q1 = PACK_AND_SEL(q1, apply_8); \
    q2 = PACK_AND_SEL(q2, apply_8);
168
// In-place flat16 (widest) filter: replaces p5..q5 with smoothed
// values, each a tap sum normalized as (sum + 8) >> 4 via
// ADD_AND_SHIFT8. Built from a shared tree of pairwise sums over the
// u16 halves of p6..q6 (in scope from UNPACK_16); requires v3u16,
// v4u16, v8u16 and the apply_16 mask. The sum-variable names spell out
// which taps they accumulate (e.g. p6p5p2p1 = p6+p5+p2+p1).
#define APPLY_16 \
    DECLARE_ADD_16HL(p6p6, p6, p6) \
    DECLARE_ADD_16HL(p6p5, p6, p5) \
    DECLARE_ADD_16HL(p6p4, p6, p4) \
    DECLARE_ADD_16HL(p4p3, p4, p3) \
    DECLARE_ADD_16HL(p2p1, p2, p1) \
    DECLARE_ADD_16HL(p2q2, p2, q2) \
    DECLARE_ADD_16HL(p3q1, p3, q1) \
    DECLARE_ADD_16HL(p0q0, p0, q0) \
    DECLARE_ADD_16HL(p0q1, p0, q1) \
    DECLARE_ADD_16HL(p1q3, p1, q3) \
    DECLARE_ADD_16HL(p1q0, p1, q0) \
    DECLARE_ADD_16HL(p1q5, p1, q5) \
    DECLARE_ADD_16HL(q3q4, q3, q4) \
    DECLARE_ADD_16HL(q2q5, q2, q5) \
    DECLARE_ADD_16HL(q1q6, q1, q6) \
    DECLARE_ADD_16HL(q0q1, q0, q1) \
    DECLARE_ADD_16HL(q6q6, q6, q6) \
    DECLARE_ADD_16HL(q2q6, q2, q6) \
    DECLARE_ADD_16HL(q3q6, q3, q6) \
    DECLARE_ADD_16HL(q4q6, q4, q6) \
    DECLARE_ADD_16HL(p5q0, p5, q0) \
    \
    DECLARE_ADD_16HL(p6q2, p6, q2) \
    DECLARE_ADD_16HL(p6p6p6p4, p6p6, p6p4) \
    DECLARE_ADD_16HL(p6p5p2p1, p6p5, p2p1) \
    DECLARE_ADD_16HL(p4p3p0q0, p4p3, p0q0) \
    DECLARE_ADD_16HL(p2q2p3q1, p2q2, p3q1) \
    DECLARE_ADD_16HL(p6p5p6p6, p6p5, p6p6) \
    DECLARE_ADD_16HL(p6p5p3q1, p6p5, p3q1) \
    DECLARE_ADD_16HL(p6p6p1q3, p6p6, p1q3) \
    DECLARE_ADD_16HL(q2q5q3q4, q2q5, q3q4) \
    DECLARE_ADD_16HL(p2p1q1q6, p2p1, q1q6) \
    DECLARE_ADD_16HL(p0q0q3q6, p0q0, q3q6) \
    DECLARE_ADD_16HL(q4q6q6q6, q4q6, q6q6) \
    /* q5 + 3*q6 in one multiply-add */ \
    u16x8 q5q6q6q6h = vec_madd(v3u16, q6h, q5h); \
    u16x8 q5q6q6q6l = vec_madd(v3u16, q6l, q5l); \
    DECLARE_ADD_16HL(p0q0q1q6, p0q0, q1q6) \
    DECLARE_ADD_16HL(p0q1q3q4, p0q1, q3q4) \
    \
    DECLARE_ADD_16HL(p6q2p2p1, p6q2, p2p1) \
    DECLARE_ADD_16HL(p1q0q2q5, p1q0, q2q5) \
    DECLARE_ADD_16HL(p0q1p5q0, p0q1, p5q0) \
    DECLARE_ADD_16HL(q0q1q2q6, q0q1, q2q6) \
    DECLARE_ADD_16HL(p3q1q2q6, p3q1, q2q6) \
    DECLARE_ADD_16HL(q2q6q4q6, q2q6, q4q6) \
    DECLARE_ADD_16HL(q3q6p1q5, q3q6, p1q5) \
    \
    DECLARE_ADD_16HL(p4p3p0q0p2p1q1q6, p4p3p0q0, p2p1q1q6) \
    DECLARE_ADD_16HL(p6p5p2p1p4p3p0q0, p6p5p2p1, p4p3p0q0) \
    DECLARE_ADD_16HL(p2p1q1q6q2q5q3q4, p2p1q1q6, q2q5q3q4) \
    DECLARE_ADD_16HL(q2q5q3q4q4q6q6q6, q2q5q3q4, q4q6q6q6) \
    DECLARE_ADD_16HL(p6p5p2p1p4p3p0q0p2q2p3q1, p6p5p2p1p4p3p0q0, p2q2p3q1) \
    DECLARE_ADD_16HL(p6p6p6p4p6p5p2p1p4p3p0q0, p6p6p6p4, p6p5p2p1p4p3p0q0) \
    DECLARE_ADD_16HL(p4p3p0q0p2p1q1q6q2q5q3q4, p4p3p0q0p2p1q1q6, q2q5q3q4) \
    DECLARE_ADD_16HL(p2p1q1q6q2q5q3q4p0q0q3q6, p2p1q1q6q2q5q3q4, p0q0q3q6) \
    DECLARE_ADD_16HL(p0q0q1q6q2q5q3q4q4q6q6q6, p0q0q1q6, q2q5q3q4q4q6q6q6) \
    DECLARE_ADD_16HL(p6p5p2p1p4p3p0q0p0q1q3q4, p6p5p2p1p4p3p0q0, p0q1q3q4) \
    \
    /* final 16-pixel tap sums, one per output pixel */ \
    DECLARE_ADD_16HL(op5, p6p6p6p4p6p5p2p1p4p3p0q0, p6p5p6p6) \
    DECLARE_ADD_16HL(op4, p6p6p6p4p6p5p2p1p4p3p0q0, p6p5p3q1) \
    DECLARE_ADD_16HL(op3, p6p6p6p4, p6p5p2p1p4p3p0q0p2q2p3q1) \
    DECLARE_ADD_16HL(op2, p6p6p1q3, p6p5p2p1p4p3p0q0p2q2p3q1) \
    DECLARE_ADD_16HL(op1, p6p5p2p1p4p3p0q0p0q1q3q4, p6q2p2p1) \
    DECLARE_ADD_16HL(op0, p6p5p2p1p4p3p0q0p0q1q3q4, p1q0q2q5) \
    DECLARE_ADD_16HL(oq0, p4p3p0q0p2p1q1q6q2q5q3q4, p0q1p5q0) \
    DECLARE_ADD_16HL(oq1, p4p3p0q0p2p1q1q6q2q5q3q4, q0q1q2q6) \
    DECLARE_ADD_16HL(oq2, p2p1q1q6q2q5q3q4p0q0q3q6, p3q1q2q6) \
    DECLARE_ADD_16HL(oq3, p2p1q1q6q2q5q3q4p0q0q3q6, q2q6q4q6) \
    DECLARE_ADD_16HL(oq4, p0q0q1q6q2q5q3q4q4q6q6q6, q3q6p1q5) \
    DECLARE_ADD_16HL(oq5, p0q0q1q6q2q5q3q4q4q6q6q6, q5q6q6q6) \
    \
    ADD_AND_SHIFT8(op5) \
    ADD_AND_SHIFT8(op4) \
    ADD_AND_SHIFT8(op3) \
    ADD_AND_SHIFT8(op2) \
    ADD_AND_SHIFT8(op1) \
    ADD_AND_SHIFT8(op0) \
    ADD_AND_SHIFT8(oq0) \
    ADD_AND_SHIFT8(oq1) \
    ADD_AND_SHIFT8(oq2) \
    ADD_AND_SHIFT8(oq3) \
    ADD_AND_SHIFT8(oq4) \
    ADD_AND_SHIFT8(oq5) \
    \
    p5 = PACK_AND_SEL(p5, apply_16); \
    p4 = PACK_AND_SEL(p4, apply_16); \
    p3 = PACK_AND_SEL(p3, apply_16); \
    p2 = PACK_AND_SEL(p2, apply_16); \
    p1 = PACK_AND_SEL(p1, apply_16); \
    p0 = PACK_AND_SEL(p0, apply_16); \
    q0 = PACK_AND_SEL(q0, apply_16); \
    q1 = PACK_AND_SEL(q1, apply_16); \
    q2 = PACK_AND_SEL(q2, apply_16); \
    q3 = PACK_AND_SEL(q3, apply_16); \
    q4 = PACK_AND_SEL(q4, apply_16); \
    q5 = PACK_AND_SEL(q5, apply_16);
267
268
// Scatter the four 32-bit lanes of `out` to four consecutive rows:
// the low 4 bytes of lane i go to dst + i * stridea.
static inline void store_h_4(u8x16 out, uint8_t *dst, int stridea)
{
    // vec_splat requires a literal lane index, so broadcast each lane
    // explicitly rather than looping.
    const u8x16 row1 = (u8x16)vec_splat((u32x4)out, 1);
    const u8x16 row2 = (u8x16)vec_splat((u32x4)out, 2);
    const u8x16 row3 = (u8x16)vec_splat((u32x4)out, 3);

    vec_xst_len(out,  dst,               4);
    vec_xst_len(row1, dst + stridea,     4);
    vec_xst_len(row2, dst + 2 * stridea, 4);
    vec_xst_len(row3, dst + 3 * stridea, 4);
}
282
// Store four 6-byte rows: the low and high 64-bit halves of outa feed
// rows 0 and 1, those of outb feed rows 2 and 3.
static inline void store_h_8(u8x16 outa, u8x16 outb, uint8_t *dst, int stridea)
{
    // Broadcast each vector's high doubleword into the low position so
    // vec_xst_len can store it from offset 0.
    const u8x16 rowa_hi = (u8x16)vec_mergel((u64x2)outa, (u64x2)outa);
    const u8x16 rowb_hi = (u8x16)vec_mergel((u64x2)outb, (u64x2)outb);

    vec_xst_len(outa,    dst,               6);
    vec_xst_len(rowa_hi, dst + stridea,     6);
    vec_xst_len(outb,    dst + 2 * stridea, 6);
    vec_xst_len(rowb_hi, dst + 3 * stridea, 6);
}
295
// Transpose building blocks: the loop_filter_h_* functions load rows
// and use chains of these merges to rotate 4x4 (or wider) byte tiles
// into per-pixel column vectors and back.

// Assume a layout {v}0 {v}1 {v}2 {v}3, produces {v}01 {v}23
#define MERGEH_4(v) \
    u8x16 v##01 = vec_mergeh(v##0, v##1); \
    u8x16 v##23 = vec_mergeh(v##2, v##3);

// Same, but interleaving the high halves of each row.
#define MERGEL_4(v) \
    u8x16 v##01 = vec_mergel(v##0, v##1); \
    u8x16 v##23 = vec_mergel(v##2, v##3);

// produce {v}0123h
#define MERGEH_U16_0123(v) \
    u16x8 v##0123h = vec_mergeh((u16x8)v##01, (u16x8)v##23);

// produce {v}0123l
#define MERGEHL_U16_0123(v) \
    u16x8 v##0123l = vec_mergel((u16x8)v##01, (u16x8)v##23);

// produce both {v}0123h and {v}0123l
#define MERGE_U16_0123(v) \
    u16x8 v##0123h = vec_mergeh((u16x8)v##01, (u16x8)v##23); \
    u16x8 v##0123l = vec_mergel((u16x8)v##01, (u16x8)v##23);

// produce {ac,bd}0123h{dir}
#define MERGEH_U32_LINE(dir) \
    u32x4 ac0123h##dir = vec_mergeh((u32x4)a0123##dir, (u32x4)c0123##dir); \
    u32x4 bd0123h##dir = vec_mergeh((u32x4)b0123##dir, (u32x4)d0123##dir);

// produce {ac,bd}0123l{dir}
#define MERGEL_U32_LINE(dir) \
    u32x4 ac0123l##dir = vec_mergel((u32x4)a0123##dir, (u32x4)c0123##dir); \
    u32x4 bd0123l##dir = vec_mergel((u32x4)b0123##dir, (u32x4)d0123##dir);


// produce the pair of mergeh/mergel of {ac,bd}01234{dira}{dirb}
#define MERGE_U32(oh, ol, dira, dirb) \
    oh = (u8x16)vec_mergeh(ac0123##dira##dirb, bd0123##dira##dirb); \
    ol = (u8x16)vec_mergel(ac0123##dira##dirb, bd0123##dira##dirb);

// Declare a##b##h / a##b##l, the byte interleaves of a and b.
#define MERGEHL_U8(a, b) \
    u8x16 a##b##h = vec_mergeh(a, b); \
    u8x16 a##b##l = vec_mergel(a, b);

// Declare out##h / out##l, the 16-bit interleaves of a and b.
#define MERGEHL_U16(out, a, b) \
    u8x16 out##h = (u8x16)vec_mergeh((u16x8)a, (u16x8)b); \
    u8x16 out##l = (u8x16)vec_mergel((u16x8)a, (u16x8)b);

// Declare out##h / out##l, the 32-bit interleaves of a and b.
#define MERGEHL_U32(out, a, b) \
    u8x16 out##h = (u8x16)vec_mergeh((u32x4)a, (u32x4)b); \
    u8x16 out##l = (u8x16)vec_mergel((u32x4)a, (u32x4)b);
342
// Filter across a vertical edge: four 4-row blocks (16 rows total) in
// one pass. E/I/H carry the per-lane edge/inner/hev thresholds; each
// lane of `apply` enables one 4-row block. Rows are loaded, transposed
// so that p1/p0/q0/q1 each hold all 16 rows, filtered with the narrow
// 4-tap filter, transposed back and stored per enabled block.
static inline void
loop_filter_h_4_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
                    const ptrdiff_t stridea, b32x4 apply
                    HIGHBD_DECL_SUFFIX)
{
    dst -= 2; // each row now starts at p1: p1 p0 | q0 q1
    uint8_t *dst2 = dst; // saved start for the store pass
    u8x16 p1, p0, q0, q1;

    // Load 4 rows for each of the four blocks a..d.
    LOAD4_H(a)
    dst += stridea;
    LOAD4_H(b)
    dst += stridea;
    LOAD4_H(c)
    dst += stridea;
    LOAD4_H(d)

    // Transpose the 16 rows into per-pixel vectors.
    MERGEH_4(a)
    MERGEH_4(b)
    MERGEH_4(c)
    MERGEH_4(d)

    MERGEH_U16_0123(a)
    MERGEH_U16_0123(b)
    MERGEH_U16_0123(c)
    MERGEH_U16_0123(d)

    MERGEH_U32_LINE(h)
    MERGEL_U32_LINE(h)

    MERGE_U32(p1, p0, h, h)
    MERGE_U32(q0, q1, l, h)

    const u8x16 zero = vec_splat_u8(0);
    const u8x16 v1u8 = vec_splat_u8(1);
    const b8x16 s = (b8x16)vec_splats((uint8_t)128); // sign-flip constant

    const u8x16 a_p1_p0 = vec_absd(p1, p0);
    const u8x16 a_q1_q0 = vec_absd(q1, q0);
    const u8x16 a_p0_q0 = vec_absd(p0, q0);
    const u8x16 a_p1_q1 = vec_absd(p1, q1);

    // Filter mask: |p1-p0|,|q1-q0| <= I and 2*|p0-q0| + |p1-q1|/2 <= E.
    u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0);
    const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0);
    const u8x16 cmp_I = max_a_p1p0_q1q0;
    cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E);
    const b8x16 ltI = vec_cmple(cmp_I, I);
    const b8x16 ltE = vec_cmple(cmp_E, E);
    b8x16 fm = vec_and(ltI, ltE);

    fm = vec_and(fm, (b8x16)apply);
    if (vec_all_eq(fm, zero)) // nothing to filter anywhere
        return;

    UNPACK_16(p0) // u16 halves consumed by APPLY_4
    UNPACK_16(q0)

    APPLY_4

    // Transpose back to row order: 4 output bytes per row.
    u8x16 p1p0ab = (u8x16)vec_mergeh(p1, p0); // p1 p0 ...
    u8x16 q0q1ab = (u8x16)vec_mergeh(q0, q1); // q0 q1 ...
    u8x16 p1p0cd = (u8x16)vec_mergel(p1, p0); // p1 p0 ...
    u8x16 q0q1cd = (u8x16)vec_mergel(q0, q1); // q0 q1 ...

    u8x16 outa = (u8x16)vec_mergeh((u16x8)p1p0ab, (u16x8)q0q1ab); // op1 op0 oq0 oq1 ...
    u8x16 outb = (u8x16)vec_mergel((u16x8)p1p0ab, (u16x8)q0q1ab);
    u8x16 outc = (u8x16)vec_mergeh((u16x8)p1p0cd, (u16x8)q0q1cd);
    u8x16 outd = (u8x16)vec_mergel((u16x8)p1p0cd, (u16x8)q0q1cd);

    // Store only the blocks whose apply lane was requested.
    if (apply[0]) {
        store_h_4(outa, dst2, stridea);
    }
    dst2 += 4 * stridea;
    if (apply[1]) {
        store_h_4(outb, dst2, stridea);
    }
    dst2 += 4 * stridea;
    if (apply[2]) {
        store_h_4(outc, dst2, stridea);
    }
    dst2 += 4 * stridea;
    if (apply[3]) {
        store_h_4(outd, dst2, stridea);
    }
}
428
// Filter across a vertical edge with up-to-6-pixel support (p2..q2),
// four 4-row blocks at once. `apply` enables blocks; `m6` marks blocks
// allowed to use the wide (flat6) filter, the rest fall back to the
// narrow 4-tap filter.
static inline void
loop_filter_h_6_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
                    const ptrdiff_t stridea, b32x4 apply, b32x4 m6
                    HIGHBD_DECL_SUFFIX)
{
    uint8_t *dst2 = dst - 2; // store pass writes p1..q1 (4 bytes) from p1
    dst -= 3;                // load pass needs p2..q2, rows start at p2
    u8x16 p2, p1, p0, q0, q1, q2;

    LOAD4_H(a)
    dst += stridea;
    LOAD4_H(b)
    dst += stridea;
    LOAD4_H(c)
    dst += stridea;
    LOAD4_H(d)

    // Transpose the 16 rows into per-pixel vectors p2..q2.
    MERGEH_4(a)
    MERGEH_4(b)
    MERGEH_4(c)
    MERGEH_4(d)

    MERGE_U16_0123(a)
    MERGE_U16_0123(b)
    MERGE_U16_0123(c)
    MERGE_U16_0123(d)

    MERGEH_U32_LINE(h)
    MERGEL_U32_LINE(h)
    MERGEH_U32_LINE(l)

    MERGE_U32(p2, p1, h, h)
    MERGE_U32(p0, q0, l, h)
    MERGE_U32(q1, q2, h, l)

    const u8x16 F = vec_splat_u8(1); // flatness threshold

    const u8x16 zero = vec_splat_u8(0);
    const u16x8 v3u16 = vec_splat_u16(3);
    const u16x8 v4u16 = vec_splat_u16(4);
    const u8x16 v1u8 = vec_splat_u8(1);
    const b8x16 s = (b8x16)vec_splats((uint8_t)128); // sign-flip constant

    const u8x16 a_p1_p0 = vec_absd(p1, p0);
    const u8x16 a_q1_q0 = vec_absd(q1, q0);
    const u8x16 a_p0_q0 = vec_absd(p0, q0);
    const u8x16 a_p1_q1 = vec_absd(p1, q1);
    const u8x16 a_p2_p1 = vec_absd(p2, p1);
    const u8x16 a_q2_q1 = vec_absd(q2, q1);
    const u8x16 a_p2_p0 = vec_absd(p2, p0);
    const u8x16 a_q2_q0 = vec_absd(q2, q0);

    // Filter mask; the p2/q2 terms only participate where m6 is set.
    u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1);
    u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0);
    const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0);
    u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, a_q2_q0);
    u8x16 cmp_I_m6 = max_a_p2p1_q2q1;
    u8x16 cmp_I_m4 = max_a_p1p0_q1q0;
    cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E);
    cmp_I_m6 = vec_and(cmp_I_m6, (u8x16)m6);
    u8x16 cmp_I = vec_max(cmp_I_m4, cmp_I_m6);
    const b8x16 ltE = vec_cmple(cmp_E, E);
    const b8x16 ltI = vec_cmple(cmp_I, I);
    b8x16 fm = vec_and(ltI, ltE);

    fm = vec_and(fm, (b8x16)apply);
    if (vec_all_eq(fm, zero)) // nothing to filter anywhere
        return;

    UNPACK_16(p2)
    UNPACK_16(p1)
    UNPACK_16(p0)
    UNPACK_16(q0)
    UNPACK_16(q1)
    UNPACK_16(q2)

    m6 = vec_and(m6, (b32x4)fm);

    // Flatness test selects the wide filter; everything else that
    // passed fm uses the narrow filter.
    u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0);
    b8x16 apply_6 = vec_and(vec_cmple(cmp_flat8in, F), (b8x16)m6);

    b8x16 apply_4 = vec_andc(fm, apply_6);

    if (vec_any_ne(apply_4, zero)) {
        APPLY_4
    }

    if (vec_any_ne(apply_6, zero)) {
        // flat6 filter: p1..q1 get (sum + 4) >> 3 of 8-tap sums.
        DECLARE_ADD_16HL(p2p2, p2, p2)
        DECLARE_ADD_16HL(p2p1, p2, p1)
        DECLARE_ADD_16HL(p1p0, p1, p0)
        DECLARE_ADD_16HL(p0q0, p0, q0)
        DECLARE_ADD_16HL(q0q1, q0, q1)
        DECLARE_ADD_16HL(q1q2, q1, q2)
        DECLARE_ADD_16HL(p2p2p0q0, p2p2, p0q0)
        DECLARE_ADD_16HL(p2p1p1p0, p2p1, p1p0)
        DECLARE_ADD_16HL(p1p0q1q2, p1p0, q1q2)
        DECLARE_ADD_16HL(p0q0q0q1, p0q0, q0q1)
        u16x8 q1q2q2q2h = q2h * 3 + q1h; // q1 + 3*q2
        u16x8 q1q2q2q2l = q2l * 3 + q1l;

        DECLARE_ADD_16HL(op1, p2p2p0q0, p2p1p1p0)
        DECLARE_ADD_16HL(op0, p2p1p1p0, p0q0q0q1)
        DECLARE_ADD_16HL(oq0, p1p0q1q2, p0q0q0q1)
        DECLARE_ADD_16HL(oq1, p0q0q0q1, q1q2q2q2)

        ADD_AND_SHIFT4(op1)
        ADD_AND_SHIFT4(op0)
        ADD_AND_SHIFT4(oq0)
        ADD_AND_SHIFT4(oq1)

        p1 = PACK_AND_SEL(p1, apply_6);
        p0 = PACK_AND_SEL(p0, apply_6);
        q0 = PACK_AND_SEL(q0, apply_6);
        q1 = PACK_AND_SEL(q1, apply_6);
    }

    // Transpose p1..q1 back to row order; only 4 bytes change per row.
    u8x16 p1p0ab = (u8x16)vec_mergeh(p1, p0); // p1 p0 ...
    u8x16 q0q1ab = (u8x16)vec_mergeh(q0, q1); // q0 q1 ...
    u8x16 p1p0cd = (u8x16)vec_mergel(p1, p0); // p1 p0 ...
    u8x16 q0q1cd = (u8x16)vec_mergel(q0, q1); // q0 q1 ...

    u8x16 outa = (u8x16)vec_mergeh((u16x8)p1p0ab, (u16x8)q0q1ab); // op1 op0 oq0 oq1 ...
    u8x16 outb = (u8x16)vec_mergel((u16x8)p1p0ab, (u16x8)q0q1ab);
    u8x16 outc = (u8x16)vec_mergeh((u16x8)p1p0cd, (u16x8)q0q1cd);
    u8x16 outd = (u8x16)vec_mergel((u16x8)p1p0cd, (u16x8)q0q1cd);

    if (apply[0]) {
        store_h_4(outa, dst2, stridea);
    }
    dst2 += 4 * stridea;
    if (apply[1]) {
        store_h_4(outb, dst2, stridea);
    }
    dst2 += 4 * stridea;
    if (apply[2]) {
        store_h_4(outc, dst2, stridea);
    }
    dst2 += 4 * stridea;
    if (apply[3]) {
        store_h_4(outd, dst2, stridea);
    }
}
572
573 static inline void
loop_filter_h_8_all(uint8_t * dst,u8x16 E,u8x16 I,u8x16 H,const ptrdiff_t stridea,b32x4 apply,b32x4 m8 HIGHBD_DECL_SUFFIX)574 loop_filter_h_8_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
575 const ptrdiff_t stridea, b32x4 apply, b32x4 m8
576 HIGHBD_DECL_SUFFIX)
577 {
578 uint8_t *dst2 = dst - 3;
579 dst -= 4;
580 u8x16 p3, p2, p1, p0, q0, q1, q2, q3;
581
582 LOAD4_H(a)
583 dst += stridea;
584 LOAD4_H(b)
585 dst += stridea;
586 LOAD4_H(c)
587 dst += stridea;
588 LOAD4_H(d)
589
590 MERGEH_4(a)
591 MERGEH_4(b)
592 MERGEH_4(c)
593 MERGEH_4(d)
594
595 MERGE_U16_0123(a)
596 MERGE_U16_0123(b)
597 MERGE_U16_0123(c)
598 MERGE_U16_0123(d)
599
600 MERGEH_U32_LINE(h)
601 MERGEL_U32_LINE(h)
602 MERGEH_U32_LINE(l)
603 MERGEL_U32_LINE(l)
604
605 MERGE_U32(p3, p2, h, h)
606 MERGE_U32(p1, p0, l, h)
607 MERGE_U32(q0, q1, h, l)
608 MERGE_U32(q2, q3, l, l)
609
610 const u8x16 F = vec_splat_u8(1);
611
612 const u8x16 zero = vec_splat_u8(0);
613 const u16x8 v3u16 = vec_splat_u16(3);
614 const u16x8 v4u16 = vec_splat_u16(4);
615 const u8x16 v1u8 = vec_splat_u8(1);
616 const b8x16 s = (b8x16)vec_splats((uint8_t)128);
617
618 const u8x16 a_p1_p0 = vec_absd(p1, p0);
619 const u8x16 a_q1_q0 = vec_absd(q1, q0);
620 const u8x16 a_p0_q0 = vec_absd(p0, q0);
621 const u8x16 a_p1_q1 = vec_absd(p1, q1);
622 const u8x16 a_p2_p1 = vec_absd(p2, p1);
623 const u8x16 a_q2_q1 = vec_absd(q2, q1);
624 const u8x16 a_p2_p0 = vec_absd(p2, p0);
625 const u8x16 a_q2_q0 = vec_absd(q2, q0);
626 const u8x16 a_p3_p0 = vec_absd(p3, p0);
627 const u8x16 a_q3_q0 = vec_absd(q3, q0);
628 const u8x16 a_p3_p2 = vec_absd(p3, p2);
629 const u8x16 a_q3_q2 = vec_absd(q3, q2);
630
631 u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1);
632 u8x16 max_a_p3p2_q3q2 = vec_max(a_p3_p2, a_q3_q2);
633 u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0);
634 const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0);
635 const u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, a_q2_q0);
636 u8x16 max_a_p3p0_q3q0 = vec_max(a_p3_p0, a_q3_q0);
637 u8x16 cmp_I_m8 = vec_max(max_a_p2p1_q2q1, max_a_p3p2_q3q2);
638 u8x16 cmp_I_m4 = max_a_p1p0_q1q0;
639 cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E);
640 cmp_I_m8 = vec_and(cmp_I_m8, (u8x16)m8);
641 u8x16 cmp_I = vec_max(cmp_I_m4, cmp_I_m8);
642 const b8x16 ltE = vec_cmple(cmp_E, E);
643 const b8x16 ltI = vec_cmple(cmp_I, I);
644 b8x16 fm = vec_and(ltI, ltE);
645
646 fm = vec_and(fm, (b8x16)apply);
647 if (vec_all_eq(fm, zero))
648 return;
649
650 #define UNPACK_16(v) \
651 u16x8 v##h = u8h_to_u16(v); \
652 u16x8 v##l = u8l_to_u16(v);
653
654 UNPACK_16(p3)
655 UNPACK_16(p2)
656 UNPACK_16(p1)
657 UNPACK_16(p0)
658 UNPACK_16(q0)
659 UNPACK_16(q1)
660 UNPACK_16(q2)
661 UNPACK_16(q3)
662
663 m8 = vec_and(m8, (b32x4)fm);
664
665 u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0);
666 cmp_flat8in = vec_max(max_a_p3p0_q3q0, cmp_flat8in);
667 b8x16 apply_8 = vec_and(vec_cmple(cmp_flat8in, F), (b8x16)m8);
668
669 b8x16 apply_4 = vec_andc(fm, apply_8);
670
671 if (vec_any_ne(apply_4, zero)) {
672 APPLY_4
673 }
674
675 if (vec_any_ne(apply_8, zero)) {
676 APPLY_8
677 }
678
679 MERGEHL_U8(p2, p1) // A0 A1 A2 A3 B0 B1 B2 B3
680 MERGEHL_U8(p0, q0)
681 MERGEHL_U8(q1, q2)
682
683 MERGEHL_U16(ab_p2p1p0q0, p2p1h, p0q0h) // A0 p2 p1 p0 q0 | A1 p2 p1 p0 q0 | A2 ...
684 // B0 ...
685 MERGEHL_U16(cd_p2p1p0q0, p2p1l, p0q0l) // C0 ...
686 // D0 ...
687 MERGEHL_U16(ab_q1q2, q1q2h, q1q2h) // A0 q1 q2 q1 q2 | A1 q1 q2 ...
688 // B0 ...
689 MERGEHL_U16(cd_q1q2, q1q2l, q1q2l) // C0 ...
690 // D0 ...
691
692 MERGEHL_U32(a, ab_p2p1p0q0h, ab_q1q2h) // A0 p2 p1 p0 q0 q1 q2 q1 q2 | A1 ..
693 // A2 ... | A3 ...
694 MERGEHL_U32(b, ab_p2p1p0q0l, ab_q1q2l) // B0 ...
695 // C2 ...
696 MERGEHL_U32(c, cd_p2p1p0q0h, cd_q1q2h) // C0 ...
697 // C2
698 MERGEHL_U32(d, cd_p2p1p0q0l, cd_q1q2l) // D0 ..
699 // D2 ..
700 if (apply[0]) {
701 store_h_8(ah, al, dst2, stridea);
702 }
703 dst2 += 4 * stridea;
704 if (apply[1]) {
705 store_h_8(bh, bl, dst2, stridea);
706 }
707 dst2 += 4 * stridea;
708
709 if (apply[2]) {
710 store_h_8(ch, cl, dst2, stridea);
711 }
712 dst2 += 4 * stridea;
713 if (apply[3]) {
714 store_h_8(dh, dl, dst2, stridea);
715 }
716
717 }
718
// Filter across a vertical edge with up-to-16-pixel support (p6..q6),
// four 4-row blocks at once. `apply` enables blocks; `m8`/`m16` mark
// blocks eligible for the flat8/flat16 wide filters, with fallback to
// the narrow 4-tap filter where the flatness tests fail.
static inline void
loop_filter_h_16_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
                     const ptrdiff_t stridea, b32x4 apply, b32x4 m8, b32x4 m16
                     HIGHBD_DECL_SUFFIX)
{
    uint8_t *dst2 = dst - 6; // store pass writes p5..q5 (12 bytes) from p5
    dst -= 7;                // load pass needs p6..q6, rows start at p6
    u8x16 p3, p2, p1, p0, q0, q1, q2, q3;
    u8x16 p6, p5, p4, q4, q5, q6;

    LOAD4_H(a)
    dst += stridea;
    LOAD4_H(b)
    dst += stridea;
    LOAD4_H(c)
    dst += stridea;
    LOAD4_H(d)

    // Transpose low row halves -> p6..q0.
    {
        MERGEH_4(a)
        MERGEH_4(b)
        MERGEH_4(c)
        MERGEH_4(d)

        MERGE_U16_0123(a)
        MERGE_U16_0123(b)
        MERGE_U16_0123(c)
        MERGE_U16_0123(d)

        MERGEH_U32_LINE(h)
        MERGEL_U32_LINE(h)
        MERGEH_U32_LINE(l)
        MERGEL_U32_LINE(l)

        MERGE_U32(p6, p5, h, h)
        MERGE_U32(p4, p3, l, h)
        MERGE_U32(p2, p1, h, l)
        MERGE_U32(p0, q0, l, l)
    }
    // Transpose high row halves -> q1..q6.
    {
        MERGEL_4(a)
        MERGEL_4(b)
        MERGEL_4(c)
        MERGEL_4(d)

        MERGE_U16_0123(a)
        MERGE_U16_0123(b)
        MERGE_U16_0123(c)
        MERGE_U16_0123(d)

        MERGEH_U32_LINE(h)
        MERGEL_U32_LINE(h)
        MERGEH_U32_LINE(l)

        MERGE_U32(q1, q2, h, h)
        MERGE_U32(q3, q4, l, h)
        MERGE_U32(q5, q6, h, l)
    }

    const u8x16 F = vec_splat_u8(1); // flatness threshold

    const u8x16 zero = vec_splat_u8(0);
    const u16x8 v3u16 = vec_splat_u16(3);
    const u16x8 v4u16 = vec_splat_u16(4);
    const u16x8 v8u16 = vec_splat_u16(8);
    const u8x16 v1u8 = vec_splat_u8(1);
    const b8x16 s = (b8x16)vec_splats((uint8_t)128); // sign-flip constant

    // Outer-pixel deltas, used only by the flat16 eligibility test.
    const u8x16 a_p6_p0 = vec_absd(p6, p0);
    const u8x16 a_p5_p0 = vec_absd(p5, p0);
    const u8x16 a_p4_p0 = vec_absd(p4, p0);
    const u8x16 a_q4_q0 = vec_absd(q4, q0);
    const u8x16 a_q5_q0 = vec_absd(q5, q0);
    const u8x16 a_q6_q0 = vec_absd(q6, q0);

    const u8x16 a_p1_p0 = vec_absd(p1, p0);
    const u8x16 a_q1_q0 = vec_absd(q1, q0);
    const u8x16 a_p0_q0 = vec_absd(p0, q0);
    const u8x16 a_p1_q1 = vec_absd(p1, q1);
    const u8x16 a_p2_p1 = vec_absd(p2, p1);
    const u8x16 a_q2_q1 = vec_absd(q2, q1);
    const u8x16 a_p2_p0 = vec_absd(p2, p0);
    const u8x16 a_q2_q0 = vec_absd(q2, q0);
    const u8x16 a_p3_p0 = vec_absd(p3, p0);
    const u8x16 a_q3_q0 = vec_absd(q3, q0);
    const u8x16 a_p3_p2 = vec_absd(p3, p2);
    const u8x16 a_q3_q2 = vec_absd(q3, q2);

    u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1);
    u8x16 max_a_p3p2_q3q2 = vec_max(a_p3_p2, a_q3_q2);
    u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0);
    const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0);
    const u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, a_q2_q0);

    const u8x16 max_a_p4p0_q4q0 = vec_max(a_p4_p0, a_q4_q0);
    const u8x16 max_a_p5p0_q5q0 = vec_max(a_p5_p0, a_q5_q0);
    const u8x16 max_a_p6p0_q6q0 = vec_max(a_p6_p0, a_q6_q0);

    b32x4 m8_16 = vec_or(m8, m16); // blocks eligible for any wide filter

    u8x16 max_a_p3p0_q3q0 = vec_max(a_p3_p0, a_q3_q0);
    u8x16 cmp_I_m8 = vec_max(max_a_p2p1_q2q1, max_a_p3p2_q3q2);
    u8x16 cmp_I_m4 = max_a_p1p0_q1q0;
    cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E);
    cmp_I_m8 = vec_and(cmp_I_m8, (b8x16)m8_16);
    u8x16 cmp_I = vec_max(cmp_I_m4, cmp_I_m8);
    const b8x16 ltE = vec_cmple(cmp_E, E);
    const b8x16 ltI = vec_cmple(cmp_I, I);
    b8x16 fm = vec_and(ltI, ltE);

    fm = vec_and(fm, (b8x16)apply);
    if (vec_all_eq(fm, zero)) // nothing to filter anywhere
        return;

    // Inner (p3..q3) and outer (p4..q6) flatness tests.
    u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0);
    u8x16 cmp_flat8out = vec_max(max_a_p6p0_q6q0, max_a_p5p0_q5q0);

    m8_16 = vec_and(m8_16, (b32x4)fm);
    m16 = vec_and(m16, (b32x4)fm);

    cmp_flat8in = vec_max(max_a_p3p0_q3q0, cmp_flat8in);
    cmp_flat8out = vec_max(max_a_p4p0_q4q0, cmp_flat8out);
    b8x16 flat8in = vec_cmple(cmp_flat8in, F);
    b8x16 flat8out = vec_cmple(cmp_flat8out, F);
    flat8in = vec_and(flat8in, (b8x16)m8_16);
    flat8out = vec_and(flat8out, (b8x16)m16);

    // flat16 needs both tests; flat8 needs only the inner test.
    b8x16 apply_16 = vec_and(flat8out, flat8in);
    b8x16 apply_8 = vec_andc(flat8in, flat8out);

    UNPACK_16(p6)
    UNPACK_16(p5)
    UNPACK_16(p4)
    UNPACK_16(p3)
    UNPACK_16(p2)
    UNPACK_16(p1)
    UNPACK_16(p0)

    // Narrow filter covers whatever passed fm but neither wide filter.
    b8x16 apply_4 = vec_and(fm, vec_nor(apply_16, apply_8));

    UNPACK_16(q0)
    UNPACK_16(q1)
    UNPACK_16(q2)
    UNPACK_16(q3)
    UNPACK_16(q4)
    UNPACK_16(q5)
    UNPACK_16(q6)

    if (vec_any_ne(apply_4, zero)) {
        APPLY_4
    }

    if (vec_any_ne(apply_16, zero)) {
        APPLY_16
    }

    if (vec_any_ne(apply_8, zero)) {
        APPLY_8
    }

    // Transpose p5..q5 back to row order; 12 bytes change per row.
    MERGEHL_U8(p5, p4)
    MERGEHL_U8(p3, p2)
    MERGEHL_U8(p1, p0)
    MERGEHL_U8(q0, q1)
    MERGEHL_U8(q2, q3)
    MERGEHL_U8(q4, q5)

    MERGEHL_U16(ab_p5p4p3p2, p5p4h, p3p2h)
    MERGEHL_U16(cd_p5p4p3p2, p5p4l, p3p2l)
    MERGEHL_U16(ab_p1p0q0q1, p1p0h, q0q1h)
    MERGEHL_U16(cd_p1p0q0q1, p1p0l, q0q1l)
    MERGEHL_U16(ab_q2q3q4q5, q2q3h, q4q5h)
    MERGEHL_U16(cd_q2q3q4q5, q2q3l, q4q5l)


    MERGEHL_U32(a_p5p4p3p2q2q3q4q5, ab_p5p4p3p2h, ab_q2q3q4q5h) // A0 p5p4p3p2 q2q3q4q5 A1
                                                                // A2 A3
    MERGEHL_U32(a_p1p0q0q1q2q3q4q5, ab_p1p0q0q1h, ab_q2q3q4q5h) // A0 p1p0q0q1 q2q3q4q5 A1
                                                                // A2 A3
    MERGEHL_U32(b_p5p4p3p2q2q3q4q5, ab_p5p4p3p2l, ab_q2q3q4q5l) // B0 p5p4p3p2 q2q3q4q5 B1
                                                                // B2 B3
    MERGEHL_U32(b_p1p0q0q1q2q3q4q5, ab_p1p0q0q1l, ab_q2q3q4q5l) // B0 p1p0q0q1 q2q3q4q5 B1
                                                                // B2 B3
    MERGEHL_U32(c_p5p4p3p2q2q3q4q5, cd_p5p4p3p2h, cd_q2q3q4q5h) // C0 p5p4p3p2 q2q3q4q5 C1
                                                                // C2 C3
    MERGEHL_U32(c_p1p0q0q1q2q3q4q5, cd_p1p0q0q1h, cd_q2q3q4q5h) // C0 p1p0q0q1 q2q3q4q5 C1
                                                                // C2 C3
    MERGEHL_U32(d_p5p4p3p2q2q3q4q5, cd_p5p4p3p2l, cd_q2q3q4q5l) // D0 p5p4p3p2 q2q3q4q5 D1
                                                                // D2 D3
    MERGEHL_U32(d_p1p0q0q1q2q3q4q5, cd_p1p0q0q1l, cd_q2q3q4q5l) // D0 p1p0q0q1 q2q3q4q5 D1
                                                                // D2 D3

    // Final interleave and store of 12 bytes per row. Note: rows are
    // written unconditionally here, unlike the narrower filters which
    // check the per-block apply lanes before storing.
    MERGEHL_U32(a01, a_p5p4p3p2q2q3q4q5h, a_p1p0q0q1q2q3q4q5h) // A0 p5p4p3p2 p1p0q0q1 q2q3q4q5 q2q3q4q5
                                                               // A1
    vec_xst_len(a01h, dst2, 12);
    dst2 += stridea;
    vec_xst_len(a01l, dst2, 12);
    dst2 += stridea;
    MERGEHL_U32(a23, a_p5p4p3p2q2q3q4q5l, a_p1p0q0q1q2q3q4q5l) // A2
                                                               // A3
    vec_xst_len(a23h, dst2, 12);
    dst2 += stridea;
    vec_xst_len(a23l, dst2, 12);
    dst2 += stridea;
    MERGEHL_U32(b01, b_p5p4p3p2q2q3q4q5h, b_p1p0q0q1q2q3q4q5h) // B0 p5p4p3p2 p1p0q0q1 q2q3q4q5 q2q3q4q5
                                                               // B1
    vec_xst_len(b01h, dst2, 12);
    dst2 += stridea;
    vec_xst_len(b01l, dst2, 12);
    dst2 += stridea;
    MERGEHL_U32(b23, b_p5p4p3p2q2q3q4q5l, b_p1p0q0q1q2q3q4q5l) // B2
                                                               // B3
    vec_xst_len(b23h, dst2, 12);
    dst2 += stridea;
    vec_xst_len(b23l, dst2, 12);
    dst2 += stridea;
    MERGEHL_U32(c01, c_p5p4p3p2q2q3q4q5h, c_p1p0q0q1q2q3q4q5h) // C0 p5p4p3p2 p1p0q0q1 q2q3q4q5 q2q3q4q5
                                                               // C1
    vec_xst_len(c01h, dst2, 12);
    dst2 += stridea;
    vec_xst_len(c01l, dst2, 12);
    dst2 += stridea;
    MERGEHL_U32(c23, c_p5p4p3p2q2q3q4q5l, c_p1p0q0q1q2q3q4q5l) // C2
                                                               // C3
    vec_xst_len(c23h, dst2, 12);
    dst2 += stridea;
    vec_xst_len(c23l, dst2, 12);
    dst2 += stridea;
    MERGEHL_U32(d01, d_p5p4p3p2q2q3q4q5h, d_p1p0q0q1q2q3q4q5h) // D0 p5p4p3p2 p1p0q0q1 q2q3q4q5 q2q3q4q5
                                                               // D1
    vec_xst_len(d01h, dst2, 12);
    dst2 += stridea;
    vec_xst_len(d01l, dst2, 12);
    dst2 += stridea;
    MERGEHL_U32(d23, d_p5p4p3p2q2q3q4q5l, d_p1p0q0q1q2q3q4q5l) // D2
                                                               // D3
    vec_xst_len(d23h, dst2, 12);
    dst2 += stridea;
    vec_xst_len(d23l, dst2, 12);
    dst2 += stridea;
}
960
// Filter across a horizontal edge: the rows above/below the edge are
// loaded directly as the p/q vectors, so no transpose is needed. Each
// lane of `apply` gates a 4-column group of the 16 processed columns.
static inline void
loop_filter_v_4_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
                    const ptrdiff_t strideb, b32x4 apply
                    HIGHBD_DECL_SUFFIX)
{
    // Row pointers: p-side above the edge, q-side at/below it.
    uint8_t *p1d = dst + strideb * -2;
    uint8_t *p0d = dst + strideb * -1;
    uint8_t *q0d = dst + strideb * +0;
    uint8_t *q1d = dst + strideb * +1;

    u8x16 p1 = vec_xl(0, p1d);
    u8x16 p0 = vec_xl(0, p0d);
    u8x16 q0 = vec_xl(0, q0d);
    u8x16 q1 = vec_xl(0, q1d);

    const u8x16 zero = vec_splat_u8(0);
    const u8x16 v1u8 = vec_splat_u8(1);
    const b8x16 s = (b8x16)vec_splats((uint8_t)128); // sign-flip constant

    const u8x16 a_p1_p0 = vec_absd(p1, p0);
    const u8x16 a_q1_q0 = vec_absd(q1, q0);
    const u8x16 a_p0_q0 = vec_absd(p0, q0);
    const u8x16 a_p1_q1 = vec_absd(p1, q1);

    // Filter mask: |p1-p0|,|q1-q0| <= I and 2*|p0-q0| + |p1-q1|/2 <= E.
    u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0);
    const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0);
    const u8x16 cmp_I = max_a_p1p0_q1q0;
    cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E);
    const b8x16 ltI = vec_cmple(cmp_I, I);
    const b8x16 ltE = vec_cmple(cmp_E, E);
    b8x16 fm = vec_and(ltI, ltE);

    fm = vec_and(fm, (b8x16)apply);
    if (vec_all_eq(fm, zero)) // nothing to filter anywhere
        return;

    UNPACK_16(p0) // u16 halves consumed by APPLY_4
    UNPACK_16(q0)

    APPLY_4

    // Unfiltered lanes were preserved by the masking inside APPLY_4,
    // so all four rows can be stored unconditionally.
    vec_xst(p0, 0, p0d);
    vec_xst(q0, 0, q0d);
    vec_xst(q1, 0, q1d);
    vec_xst(p1, 0, p1d);
}
1007
/* wd=6 loop filter across a horizontal edge (chroma): reads three rows on
 * each side, may rewrite p1/p0/q0/q1 for 16 pixel columns at once.
 *
 * `apply` masks the 4-column groups eligible for any filtering; `m6` marks
 * the subset that may take the 6-tap filter.  Groups that pass the coarse
 * mask but fail the flatness test fall back to the narrow wd=4 filter. */
static inline void
loop_filter_v_6_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
                    const ptrdiff_t strideb, b32x4 apply, b32x4 m6
                    HIGHBD_DECL_SUFFIX)
{
    uint8_t *p2d = dst + strideb * -3;
    uint8_t *p1d = dst + strideb * -2;
    uint8_t *p0d = dst + strideb * -1;
    uint8_t *q0d = dst + strideb * +0;
    uint8_t *q1d = dst + strideb * +1;
    uint8_t *q2d = dst + strideb * +2;

    u8x16 p2 = vec_xl(0, p2d);
    u8x16 p1 = vec_xl(0, p1d);
    u8x16 p0 = vec_xl(0, p0d);
    u8x16 q0 = vec_xl(0, q0d);
    u8x16 q1 = vec_xl(0, q1d);
    u8x16 q2 = vec_xl(0, q2d);

    const u8x16 F = vec_splat_u8(1); // flatness threshold (8 bpc)

    const u8x16 zero = vec_splat_u8(0);
    const u16x8 v3u16 = vec_splat_u16(3); // shift for (sum + 4) >> 3
    const u16x8 v4u16 = vec_splat_u16(4); // rounding bias
    const u8x16 v1u8 = vec_splat_u8(1);
    // NOTE(review): `s` is consumed by APPLY_4 (sign-bias constant) —
    // confirm against the macro definition earlier in the file.
    const b8x16 s = (b8x16)vec_splats((uint8_t)128);

    const u8x16 a_p1_p0 = vec_absd(p1, p0);
    const u8x16 a_q1_q0 = vec_absd(q1, q0);
    const u8x16 a_p0_q0 = vec_absd(p0, q0);
    const u8x16 a_p1_q1 = vec_absd(p1, q1);
    const u8x16 a_p2_p1 = vec_absd(p2, p1);
    const u8x16 a_q2_q1 = vec_absd(q2, q1);
    const u8x16 a_p2_p0 = vec_absd(p2, p0);
    const u8x16 a_q2_q0 = vec_absd(q2, q0);

    // Same E/I mask as wd=4, but on m6 lanes the inner (I) test also
    // covers |p2-p1| and |q2-q1|.
    u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1);
    u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0);
    const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0);
    u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, a_q2_q0);
    u8x16 cmp_I_m6 = max_a_p2p1_q2q1;
    u8x16 cmp_I_m4 = max_a_p1p0_q1q0;
    cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E);
    cmp_I_m6 = vec_and(cmp_I_m6, (u8x16)m6);
    u8x16 cmp_I = vec_max(cmp_I_m4, cmp_I_m6);
    const b8x16 ltE = vec_cmple(cmp_E, E);
    const b8x16 ltI = vec_cmple(cmp_I, I);
    b8x16 fm = vec_and(ltI, ltE);

    fm = vec_and(fm, (b8x16)apply);
    if (vec_all_eq(fm, zero))
        return;

    // Widen everything to 16-bit ##h/##l halves for the filter sums.
    UNPACK_16(p2)
    UNPACK_16(p1)
    UNPACK_16(p0)
    UNPACK_16(q0)
    UNPACK_16(q1)
    UNPACK_16(q2)

    m6 = vec_and(m6, (b32x4)fm);

    // flat6: max(|p2-p0|, |q2-q0|, |p1-p0|, |q1-q0|) <= F selects the
    // 6-tap smoothing filter on m6 lanes ...
    u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0);
    b8x16 apply_6 = vec_and(vec_cmple(cmp_flat8in, F), (b8x16)m6);

    // ... every other fm lane takes the narrow filter.  NOTE(review):
    // APPLY_4 appears to operate on all fm lanes (as in the wd=4 helper);
    // apply_6 lanes are then overwritten by PACK_AND_SEL below — confirm.
    b8x16 apply_4 = vec_andc(fm, apply_6);

    if (vec_any_ne(apply_4, zero)) {
        APPLY_4
    }

    if (vec_any_ne(apply_6, zero)) {
        // 6-tap smoothing in 16-bit halves; the sums below expand to:
        //   p1' = (3*p2 + 2*p1 + 2*p0 +   q0               + 4) >> 3
        //   p0' = (  p2 + 2*p1 + 2*p0 + 2*q0 +   q1        + 4) >> 3
        //   q0' = (  p1 + 2*p0 + 2*q0 + 2*q1 +   q2        + 4) >> 3
        //   q1' = (  p0 + 2*q0 + 2*q1 + 3*q2               + 4) >> 3
        DECLARE_ADD_16HL(p2p2, p2, p2)
        DECLARE_ADD_16HL(p2p1, p2, p1)
        DECLARE_ADD_16HL(p1p0, p1, p0)
        DECLARE_ADD_16HL(p0q0, p0, q0)
        DECLARE_ADD_16HL(q0q1, q0, q1)
        DECLARE_ADD_16HL(q1q2, q1, q2)
        DECLARE_ADD_16HL(p2p2p0q0, p2p2, p0q0)
        DECLARE_ADD_16HL(p2p1p1p0, p2p1, p1p0)
        DECLARE_ADD_16HL(p1p0q1q2, p1p0, q1q2)
        DECLARE_ADD_16HL(p0q0q0q1, p0q0, q0q1)
        u16x8 q1q2q2q2h = q2h * 3 + q1h;
        u16x8 q1q2q2q2l = q2l * 3 + q1l;

        DECLARE_ADD_16HL(op1, p2p2p0q0, p2p1p1p0)
        DECLARE_ADD_16HL(op0, p2p1p1p0, p0q0q0q1)
        DECLARE_ADD_16HL(oq0, p1p0q1q2, p0q0q0q1)
        DECLARE_ADD_16HL(oq1, p0q0q0q1, q1q2q2q2)

        // Round ((sum + 4) >> 3) using v4u16/v3u16.
        ADD_AND_SHIFT4(op1)
        ADD_AND_SHIFT4(op0)
        ADD_AND_SHIFT4(oq0)
        ADD_AND_SHIFT4(oq1)

        // Narrow back to u8 and blend filtered bytes on apply_6 lanes.
        p1 = PACK_AND_SEL(p1, apply_6);
        p0 = PACK_AND_SEL(p0, apply_6);
        q0 = PACK_AND_SEL(q0, apply_6);
        q1 = PACK_AND_SEL(q1, apply_6);
    }

    vec_xst(p0, 0, p0d);
    vec_xst(q0, 0, q0d);
    vec_xst(q1, 0, q1d);
    vec_xst(p1, 0, p1d);
}
1114
/* wd=8 loop filter across a horizontal edge: reads four rows on each side,
 * may rewrite p2..q2 for 16 pixel columns at once.
 *
 * `m8` marks the 4-column groups that may take the 8-tap filter; groups
 * passing the coarse mask but failing the flatness test get the narrow
 * wd=4 filter instead. */
static inline void
loop_filter_v_8_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
                    const ptrdiff_t strideb, b32x4 apply, b32x4 m8
                    HIGHBD_DECL_SUFFIX)

{

    uint8_t *p3d = dst + strideb * -4;
    uint8_t *p2d = dst + strideb * -3;
    uint8_t *p1d = dst + strideb * -2;
    uint8_t *p0d = dst + strideb * -1;
    uint8_t *q0d = dst + strideb * +0;
    uint8_t *q1d = dst + strideb * +1;
    uint8_t *q2d = dst + strideb * +2;
    uint8_t *q3d = dst + strideb * +3;

    u8x16 p3 = vec_xl(0, p3d);
    u8x16 p2 = vec_xl(0, p2d);
    u8x16 p1 = vec_xl(0, p1d);
    u8x16 p0 = vec_xl(0, p0d);
    u8x16 q0 = vec_xl(0, q0d);
    u8x16 q1 = vec_xl(0, q1d);
    u8x16 q2 = vec_xl(0, q2d);
    u8x16 q3 = vec_xl(0, q3d);

    const u8x16 F = vec_splat_u8(1); // flatness threshold (8 bpc)

    const u8x16 zero = vec_splat_u8(0);
    const u16x8 v3u16 = vec_splat_u16(3); // shift for (sum + 4) >> 3
    const u16x8 v4u16 = vec_splat_u16(4); // rounding bias
    const u8x16 v1u8 = vec_splat_u8(1);
    // NOTE(review): `s` is consumed by APPLY_4 — confirm against macro.
    const b8x16 s = (b8x16)vec_splats((uint8_t)128);

    const u8x16 a_p1_p0 = vec_absd(p1, p0);
    const u8x16 a_q1_q0 = vec_absd(q1, q0);
    const u8x16 a_p0_q0 = vec_absd(p0, q0);
    const u8x16 a_p1_q1 = vec_absd(p1, q1);
    const u8x16 a_p2_p1 = vec_absd(p2, p1);
    const u8x16 a_q2_q1 = vec_absd(q2, q1);
    const u8x16 a_p2_p0 = vec_absd(p2, p0);
    const u8x16 a_q2_q0 = vec_absd(q2, q0);
    const u8x16 a_p3_p0 = vec_absd(p3, p0);
    const u8x16 a_q3_q0 = vec_absd(q3, q0);
    const u8x16 a_p3_p2 = vec_absd(p3, p2);
    const u8x16 a_q3_q2 = vec_absd(q3, q2);

    // Coarse mask: same E test as wd=4; on m8 lanes the inner (I) test is
    // widened to include |p2-p1|, |q2-q1|, |p3-p2| and |q3-q2|.
    u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1);
    u8x16 max_a_p3p2_q3q2 = vec_max(a_p3_p2, a_q3_q2);
    u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0);
    const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0);
    const u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, a_q2_q0);
    u8x16 max_a_p3p0_q3q0 = vec_max(a_p3_p0, a_q3_q0);
    u8x16 cmp_I_m8 = vec_max(max_a_p2p1_q2q1, max_a_p3p2_q3q2);
    u8x16 cmp_I_m4 = max_a_p1p0_q1q0;
    cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E);
    cmp_I_m8 = vec_and(cmp_I_m8, (u8x16)m8);
    u8x16 cmp_I = vec_max(cmp_I_m4, cmp_I_m8);
    const b8x16 ltE = vec_cmple(cmp_E, E);
    const b8x16 ltI = vec_cmple(cmp_I, I);
    b8x16 fm = vec_and(ltI, ltE);

    fm = vec_and(fm, (b8x16)apply);
    if (vec_all_eq(fm, zero))
        return;

// Widen a u8 vector into its high/low u16x8 halves (v##h / v##l).
// NOTE(review): UNPACK_16 is already used above (loop_filter_v_4_all), so
// this must be a benign redefinition of an identical earlier #define.
#define UNPACK_16(v) \
    u16x8 v##h = u8h_to_u16(v); \
    u16x8 v##l = u8l_to_u16(v);

    UNPACK_16(p3)
    UNPACK_16(p2)
    UNPACK_16(p1)
    UNPACK_16(p0)
    UNPACK_16(q0)
    UNPACK_16(q1)
    UNPACK_16(q2)
    UNPACK_16(q3)

    m8 = vec_and(m8, (b32x4)fm);

    // flat8: all of |p3..p1 - p0| and |q3..q1 - q0| <= F selects the wide
    // smoothing filter; the remaining fm lanes take the narrow filter.
    u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0);
    cmp_flat8in = vec_max(max_a_p3p0_q3q0, cmp_flat8in);
    b8x16 apply_8 = vec_and(vec_cmple(cmp_flat8in, F), (b8x16)m8);

    // NOTE(review): APPLY_4 appears to cover all fm lanes; APPLY_8 then
    // overwrites its own lanes (ordering below matters) — confirm.
    b8x16 apply_4 = vec_andc(fm, apply_8);

    if (vec_any_ne(apply_4, zero)) {
        APPLY_4
    }

    if (vec_any_ne(apply_8, zero)) {
        APPLY_8
    }

    vec_xst(p0, 0, p0d);
    vec_xst(q0, 0, q0d);
    vec_xst(q1, 0, q1d);
    vec_xst(p1, 0, p1d);
    vec_xst(q2, 0, q2d);
    vec_xst(p2, 0, p2d);
}
1216
/* wd=16 loop filter across a horizontal edge: reads seven rows on each
 * side, may rewrite p5..q5 (p6/q6 are read-only taps) for 16 columns.
 *
 * `m8` / `m16` mark the 4-column groups that may take the wd>=8 / wd=16
 * filters; per-lane flatness tests then decide between the 16-tap, 8-tap
 * and narrow filters. */
static inline void
loop_filter_v_16_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
                     const ptrdiff_t strideb, b32x4 apply, b32x4 m8, b32x4 m16
                     HIGHBD_DECL_SUFFIX)

{

    uint8_t *p6d = dst + strideb * -7;
    uint8_t *p5d = dst + strideb * -6;
    uint8_t *p4d = dst + strideb * -5;
    uint8_t *p3d = dst + strideb * -4;
    uint8_t *p2d = dst + strideb * -3;
    uint8_t *p1d = dst + strideb * -2;
    uint8_t *p0d = dst + strideb * -1;
    uint8_t *q0d = dst + strideb * +0;
    uint8_t *q1d = dst + strideb * +1;
    uint8_t *q2d = dst + strideb * +2;
    uint8_t *q3d = dst + strideb * +3;
    uint8_t *q4d = dst + strideb * +4;
    uint8_t *q5d = dst + strideb * +5;
    uint8_t *q6d = dst + strideb * +6;

    u8x16 p6 = vec_xl(0, p6d);
    u8x16 p5 = vec_xl(0, p5d);
    u8x16 p4 = vec_xl(0, p4d);
    u8x16 p3 = vec_xl(0, p3d);
    u8x16 p2 = vec_xl(0, p2d);
    u8x16 p1 = vec_xl(0, p1d);
    u8x16 p0 = vec_xl(0, p0d);
    u8x16 q0 = vec_xl(0, q0d);
    u8x16 q1 = vec_xl(0, q1d);
    u8x16 q2 = vec_xl(0, q2d);
    u8x16 q3 = vec_xl(0, q3d);
    u8x16 q4 = vec_xl(0, q4d);
    u8x16 q5 = vec_xl(0, q5d);
    u8x16 q6 = vec_xl(0, q6d);

    const u8x16 F = vec_splat_u8(1); // flatness threshold (8 bpc)

    const u8x16 zero = vec_splat_u8(0);
    const u16x8 v3u16 = vec_splat_u16(3); // shift for (sum + 4) >> 3
    const u16x8 v4u16 = vec_splat_u16(4); // rounding bias for 8-tap sums
    const u16x8 v8u16 = vec_splat_u16(8); // rounding bias for 16-tap sums
    const u8x16 v1u8 = vec_splat_u8(1);
    // NOTE(review): `s` is consumed by APPLY_4 — confirm against macro.
    const b8x16 s = (b8x16)vec_splats((uint8_t)128);

    // Outer-flatness taps (p6..p4 / q4..q6) ...
    const u8x16 a_p6_p0 = vec_absd(p6, p0);
    const u8x16 a_p5_p0 = vec_absd(p5, p0);
    const u8x16 a_p4_p0 = vec_absd(p4, p0);
    const u8x16 a_q4_q0 = vec_absd(q4, q0);
    const u8x16 a_q5_q0 = vec_absd(q5, q0);
    const u8x16 a_q6_q0 = vec_absd(q6, q0);

    // ... and the usual inner-mask differences.
    const u8x16 a_p1_p0 = vec_absd(p1, p0);
    const u8x16 a_q1_q0 = vec_absd(q1, q0);
    const u8x16 a_p0_q0 = vec_absd(p0, q0);
    const u8x16 a_p1_q1 = vec_absd(p1, q1);
    const u8x16 a_p2_p1 = vec_absd(p2, p1);
    const u8x16 a_q2_q1 = vec_absd(q2, q1);
    const u8x16 a_p2_p0 = vec_absd(p2, p0);
    const u8x16 a_q2_q0 = vec_absd(q2, q0);
    const u8x16 a_p3_p0 = vec_absd(p3, p0);
    const u8x16 a_q3_q0 = vec_absd(q3, q0);
    const u8x16 a_p3_p2 = vec_absd(p3, p2);
    const u8x16 a_q3_q2 = vec_absd(q3, q2);

    u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1);
    u8x16 max_a_p3p2_q3q2 = vec_max(a_p3_p2, a_q3_q2);
    u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0);
    const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0);
    const u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, a_q2_q0);

    const u8x16 max_a_p4p0_q4q0 = vec_max(a_p4_p0, a_q4_q0);
    const u8x16 max_a_p5p0_q5q0 = vec_max(a_p5_p0, a_q5_q0);
    const u8x16 max_a_p6p0_q6q0 = vec_max(a_p6_p0, a_q6_q0);

    // Lanes taking any wide filter (wd>=8), for the widened I test.
    b32x4 m8_16 = vec_or(m8, m16);

    u8x16 max_a_p3p0_q3q0 = vec_max(a_p3_p0, a_q3_q0);
    u8x16 cmp_I_m8 = vec_max(max_a_p2p1_q2q1, max_a_p3p2_q3q2);
    u8x16 cmp_I_m4 = max_a_p1p0_q1q0;
    cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E);
    cmp_I_m8 = vec_and(cmp_I_m8, (u8x16)m8_16);
    u8x16 cmp_I = vec_max(cmp_I_m4, cmp_I_m8);
    const b8x16 ltE = vec_cmple(cmp_E, E);
    const b8x16 ltI = vec_cmple(cmp_I, I);
    b8x16 fm = vec_and(ltI, ltE);

    fm = vec_and(fm, (b8x16)apply);
    if (vec_all_eq(fm, zero))
        return;

    // flat8in covers p3..q3, flat8out covers p6..p4 / q4..q6.
    u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0);
    u8x16 cmp_flat8out = vec_max(max_a_p6p0_q6q0, max_a_p5p0_q5q0);

    m8_16 = vec_and(m8_16, (b32x4)fm);
    m16 = vec_and(m16, (b32x4)fm);

    cmp_flat8in = vec_max(max_a_p3p0_q3q0, cmp_flat8in);
    cmp_flat8out = vec_max(max_a_p4p0_q4q0, cmp_flat8out);
    b8x16 flat8in = vec_cmple(cmp_flat8in, F);
    b8x16 flat8out = vec_cmple(cmp_flat8out, F);
    flat8in = vec_and(flat8in, (b8x16)m8_16);
    flat8out = vec_and(flat8out, (b8x16)m16);

    // 16-tap needs both flatness tests; 8-tap needs inner flatness only.
    b8x16 apply_16 = vec_and(flat8out, flat8in);
    b8x16 apply_8 = vec_andc(flat8in, flat8out);

    UNPACK_16(p6)
    UNPACK_16(p5)
    UNPACK_16(p4)
    UNPACK_16(p3)
    UNPACK_16(p2)
    UNPACK_16(p1)
    UNPACK_16(p0)

    // NOTE(review): apply_4 is not masked by fm here (unlike the wd6/wd8
    // helpers); it only gates whether APPLY_4 runs at all.  APPLY_4 itself
    // appears to filter all fm lanes, and the wide filters then overwrite
    // their lanes (hence APPLY_4 running first below) — confirm vs. macro.
    b8x16 apply_4 = vec_nor(apply_16, apply_8);

    UNPACK_16(q0)
    UNPACK_16(q1)
    UNPACK_16(q2)
    UNPACK_16(q3)
    UNPACK_16(q4)
    UNPACK_16(q5)
    UNPACK_16(q6)

    if (vec_any_ne(apply_4, zero)) {
        APPLY_4
    }
    if (vec_any_ne(apply_16, zero)) {
        APPLY_16
    }
    if (vec_any_ne(apply_8, zero)) {
        APPLY_8
    }

    // p6/q6 are taps only and never modified, so they are not stored.
    vec_xst(p5, 0, p5d);
    vec_xst(p4, 0, p4d);
    vec_xst(p3, 0, p3d);
    vec_xst(p2, 0, p2d);
    vec_xst(p1, 0, p1d);
    vec_xst(p0, 0, p0d);
    vec_xst(q0, 0, q0d);
    vec_xst(q1, 0, q1d);
    vec_xst(q2, 0, q2d);
    vec_xst(q3, 0, q3d);
    vec_xst(q4, 0, q4d);
    vec_xst(q5, 0, q5d);
}
1366
/* Decorate the exported entry-point names with the ISA suffix this
 * translation unit is being built for (VSX or POWER9). */
#if defined(DAV1D_VSX)
#define LPF(fn) BF(dav1d_lpf_##fn, vsx)
#elif defined(DAV1D_PWR9)
#define LPF(fn) BF(dav1d_lpf_##fn, pwr9)
#endif
1372
LPF(h_sb_y)1373 void LPF(h_sb_y)(pixel *dst, const ptrdiff_t stride,
1374 const uint32_t *const vmask,
1375 const uint8_t (*l)[4], ptrdiff_t b4_stride,
1376 const Av1FilterLUT *lut, const int h
1377 HIGHBD_DECL_SUFFIX)
1378 {
1379 unsigned vm = vmask[0] | vmask[1] | vmask[2];
1380
1381 u32x4 vm0 = vec_splats(vmask[0] | vmask[1] | vmask[2]);
1382 u32x4 vm1 = vec_splats(vmask[1]);
1383 u32x4 vm2 = vec_splats(vmask[2]);
1384 u32x4 mm = (u32x4){1, 2, 4, 8};
1385
1386 const u8x16 sharp = vec_xl(0, (uint8_t *)lut->sharp);
1387 const u8x16 s0 = vec_splat(sharp, 0);
1388 const u8x16 s1 = vec_splat(sharp, 8);
1389 const u32x4 v4u32 = vec_splat_u32(4);
1390 const u32x4 zero = vec_splat_u32(0);
1391 const u8x16 v1u8 = vec_splat_u8(1);
1392 const u8x16 v2u8 = vec_splat_u8(2);
1393 const u8x16 v4u8 = vec_splat_u8(4);
1394 const uint8_t (*pl)[4] = &l[-1];
1395
1396 const u8x16 spread = (u8x16){
1397 0x00, 0x00, 0x00, 0x00,
1398 0x04, 0x04, 0x04, 0x04,
1399 0x08, 0x08, 0x08, 0x08,
1400 0x0c, 0x0c, 0x0c, 0x0c,
1401 };
1402
1403 for (;
1404 vm;
1405 vm >>= 4,
1406 mm = vec_sl(mm, v4u32),
1407 dst += 4 * 4 * PXSTRIDE(stride),
1408 pl += 4 * b4_stride) {
1409 if (!(vm & 0x0f))
1410 continue;
1411 u32x4 la = (u32x4)vec_xl(0, (uint8_t *)pl); // l[-1] l[0] ...
1412 u32x4 lb = (u32x4)vec_xl(1 * 4 * b4_stride, (uint8_t *)pl);
1413 u32x4 lc = (u32x4)vec_xl(2 * 4 * b4_stride, (uint8_t *)pl);
1414 u32x4 ld = (u32x4)vec_xl(3 * 4 * b4_stride, (uint8_t *)pl);
1415
1416 u32x4 Lac = vec_mergeh(la, lc); // la[-1] lb[-1] la[0] lb[0]
1417 u32x4 Lbd = vec_mergeh(lb, ld); // lc[-1] ld[-1] lc[0] ld[0]
1418
1419 u32x4 wd16 = vec_and(vm2, mm); // vmask[2] & [1,2,4,8]
1420 u32x4 wd8 = vec_and(vm1, mm); // vmask[1] & [1,2,4,8]
1421 u32x4 wd4 = vec_and(vm0, mm); // vm & [1,2,4,8]
1422
1423 u32x4 L_1 = (u32x4)vec_mergeh(Lac, Lbd); // la[-1] lb[-1] lc[-1] ld[-1]
1424 u32x4 L_0 = (u32x4)vec_mergel(Lac, Lbd); // la[ 0] lb[ 0] lc[ 0] ld[ 0]
1425
1426 b8x16 mask = vec_cmpeq((u8x16)L_0, (u8x16)zero);
1427
1428 u32x4 L4 = (u32x4)vec_sel((u8x16)L_0, (u8x16)L_1, mask); // if !l[0][0] { l[-1][0] }
1429
1430 u8x16 L = (u8x16)vec_perm((u8x16)L4, (u8x16)L4, spread); // La La La La Lb Lb Lb Lb ...
1431
1432 b32x4 m16 = vec_cmpeq(wd16, mm);
1433 b32x4 m8 = vec_cmpeq(wd8, mm);
1434 b32x4 m4 = vec_cmpeq(wd4, mm);
1435
1436 b32x4 apply = vec_cmpne((u32x4)L, zero);
1437
1438 if (vec_all_eq((u32x4)L, zero))
1439 continue;
1440
1441 u8x16 I = vec_sr(L, s0); // L >> sharp[0]
1442 u8x16 H = vec_sr(L, v4u8);
1443 I = vec_min(I, s1); // min(L >> sharp[0], sharp[1])
1444 u8x16 E = vec_add(L, v2u8); // L + 2
1445 I = vec_max(I, v1u8); // max(min(L >> sharp[0], sharp[1]), 1)
1446 E = vec_add(E, E); // 2 * (L + 2)
1447 E = vec_add(E, I); // 2 * (L + 2) + limit
1448
1449 apply = vec_and(m4, apply);
1450
1451 if (vec_any_ne(wd16, zero)) {
1452 loop_filter_h_16_all(dst, E, I, H, PXSTRIDE(stride), apply, m8, m16 HIGHBD_TAIL_SUFFIX);
1453 } else if (vec_any_ne(wd8, zero)) {
1454 loop_filter_h_8_all(dst, E, I, H, PXSTRIDE(stride), apply, m8 HIGHBD_TAIL_SUFFIX);
1455 } else { // wd4 == 0 already tested
1456 loop_filter_h_4_all(dst, E, I, H, PXSTRIDE(stride), apply HIGHBD_TAIL_SUFFIX);
1457 }
1458 }
1459 }
1460
LPF(v_sb_y)1461 void LPF(v_sb_y)(pixel *dst, const ptrdiff_t stride,
1462 const uint32_t *const vmask,
1463 const uint8_t (*l)[4], ptrdiff_t b4_stride,
1464 const Av1FilterLUT *lut, const int w
1465 HIGHBD_DECL_SUFFIX)
1466 {
1467 unsigned vm = vmask[0] | vmask[1] | vmask[2];
1468
1469 u32x4 vm0 = vec_splats(vmask[0] | vmask[1] | vmask[2]);
1470 u32x4 vm1 = vec_splats(vmask[1]);
1471 u32x4 vm2 = vec_splats(vmask[2]);
1472
1473 u8x16 sharp = vec_xl(0, (uint8_t *)lut->sharp);
1474 u8x16 s0 = vec_splat(sharp, 0);
1475 u8x16 s1 = vec_splat(sharp, 8);
1476 u32x4 mm = (u32x4){1, 2, 4, 8};
1477 u32x4 v4u32 = vec_splat_u32(4);
1478 u32x4 zero = vec_splat_u32(0);
1479 u8x16 v1u8 = vec_splat_u8(1);
1480 u8x16 v2u8 = vec_splat_u8(2);
1481 u8x16 v4u8 = vec_splat_u8(4);
1482 const uint8_t (*pl)[4] = l;
1483 const uint8_t (*plb4)[4] = l - b4_stride;
1484 const u8x16 spread = (u8x16){
1485 0x00, 0x00, 0x00, 0x00,
1486 0x04, 0x04, 0x04, 0x04,
1487 0x08, 0x08, 0x08, 0x08,
1488 0x0c, 0x0c, 0x0c, 0x0c,
1489 };
1490
1491 for (;
1492 vm;
1493 vm >>= 4,
1494 mm = vec_sl(mm, v4u32),
1495 dst += 4 * 4,
1496 pl += 4,
1497 plb4 += 4) {
1498 if (!(vm & 0x0f))
1499 continue;
1500 u32x4 L_0 = (u32x4)vec_xl(0, (uint8_t *)pl);
1501 u32x4 L_b4 = (u32x4)vec_xl(0, (uint8_t *)plb4);
1502
1503 u32x4 wd16 = vec_and(vm2, mm); // vmask[2] & [1,2,4,8]
1504 u32x4 wd8 = vec_and(vm1, mm); // vmask[1] & [1,2,4,8]
1505 u32x4 wd4 = vec_and(vm0, mm); // vm & [1,2,4,8]
1506
1507 b8x16 mask = vec_cmpeq((u8x16)L_0, (u8x16)zero);
1508
1509 u32x4 L4 = (u32x4)vec_sel((u8x16)L_0, (u8x16)L_b4, mask); // if !l[0][0] { l[-b4_stride][0] }
1510
1511 u8x16 L = (u8x16)vec_perm((u8x16)L4, (u8x16)L4, spread); // La La La La Lb Lb Lb Lb ...
1512
1513 b32x4 m16 = vec_cmpeq(wd16, mm);
1514 b32x4 m8 = vec_cmpeq(wd8, mm);
1515 b32x4 m4 = vec_cmpeq(wd4, mm);
1516
1517 b32x4 apply = vec_cmpne((u32x4)L, zero);
1518
1519 if (vec_all_eq((u32x4)L, zero))
1520 continue;
1521
1522 u8x16 I = vec_sr(L, s0); // L >> sharp[0]
1523 u8x16 H = vec_sr(L, v4u8);
1524 I = vec_min(I, s1); // min(L >> sharp[0], sharp[1])
1525 u8x16 E = vec_add(L, v2u8); // L + 2
1526 I = vec_max(I, v1u8); // max(min(L >> sharp[0], sharp[1]), 1)
1527 E = vec_add(E, E); // 2 * (L + 2)
1528 E = vec_add(E, I); // 2 * (L + 2) + limit
1529
1530 apply = vec_and(apply, m4);
1531
1532 if (vec_any_ne(wd16, zero)) {
1533 loop_filter_v_16_all(dst, E, I, H, PXSTRIDE(stride), apply, m8, m16 HIGHBD_TAIL_SUFFIX);
1534 } else if (vec_any_ne(wd8, zero)) {
1535 loop_filter_v_8_all(dst, E, I, H, PXSTRIDE(stride), apply, m8 HIGHBD_TAIL_SUFFIX);
1536 } else {
1537 loop_filter_v_4_all(dst, E, I, H, PXSTRIDE(stride), apply HIGHBD_TAIL_SUFFIX);
1538 }
1539
1540 }
1541 }
1542
LPF(h_sb_uv)1543 void LPF(h_sb_uv)(pixel *dst, const ptrdiff_t stride,
1544 const uint32_t *const vmask,
1545 const uint8_t (*l)[4], ptrdiff_t b4_stride,
1546 const Av1FilterLUT *lut, const int h
1547 HIGHBD_DECL_SUFFIX)
1548 {
1549 unsigned vm = vmask[0] | vmask[1];
1550 u32x4 vm0 = vec_splats(vm);
1551 u32x4 vm1 = vec_splats(vmask[1]);
1552 u32x4 mm = (u32x4){1, 2, 4, 8};
1553
1554 const u8x16 sharp = vec_xl(0, (uint8_t *)lut->sharp);
1555 const u8x16 s0 = vec_splat(sharp, 0);
1556 const u8x16 s1 = vec_splat(sharp, 8);
1557 const u32x4 v4u32 = vec_splat_u32(4);
1558 const u32x4 zero = vec_splat_u32(0);
1559 const u8x16 v1u8 = vec_splat_u8(1);
1560 const u8x16 v2u8 = vec_splat_u8(2);
1561 const u8x16 v4u8 = vec_splat_u8(4);
1562 const uint8_t (*pl)[4] = &l[-1];
1563 const u8x16 spread = (u8x16){
1564 0x00, 0x00, 0x00, 0x00,
1565 0x04, 0x04, 0x04, 0x04,
1566 0x08, 0x08, 0x08, 0x08,
1567 0x0c, 0x0c, 0x0c, 0x0c,
1568 };
1569
1570 for (;
1571 vm;
1572 vm >>= 4,
1573 mm = vec_sl(mm, v4u32),
1574 dst += 4 * 4 * PXSTRIDE(stride),
1575 pl += 4 * b4_stride) {
1576 if (!(vm & 0x0f))
1577 continue;
1578 u32x4 la = (u32x4)vec_xl(0, (uint8_t *)pl); // l[-1] l[0] ...
1579 u32x4 lb = (u32x4)vec_xl(1 * 4 * b4_stride, (uint8_t *)pl);
1580 u32x4 lc = (u32x4)vec_xl(2 * 4 * b4_stride, (uint8_t *)pl);
1581 u32x4 ld = (u32x4)vec_xl(3 * 4 * b4_stride, (uint8_t *)pl);
1582
1583 u32x4 Lac = vec_mergeh(la, lc); // la[-1] lb[-1] la[0] lb[0]
1584 u32x4 Lbd = vec_mergeh(lb, ld); // lc[-1] ld[-1] lc[0] ld[0]
1585
1586 u32x4 wd6 = vec_and(vm1, mm); // vmask[1] & [1,2,4,8]
1587 u32x4 wd4 = vec_and(vm0, mm); // vm & [1,2,4,8]
1588
1589 u32x4 L_1 = (u32x4)vec_mergeh(Lac, Lbd); // la[-1] lb[-1] lc[-1] ld[-1]
1590 u32x4 L_0 = (u32x4)vec_mergel(Lac, Lbd); // la[ 0] lb[ 0] lc[ 0] ld[ 0]
1591
1592 b8x16 mask = vec_cmpeq((u8x16)L_0, (u8x16)zero);
1593
1594 u32x4 L4 = (u32x4)vec_sel((u8x16)L_0, (u8x16)L_1, mask); // if !l[0][0] { l[-1][0] }
1595
1596 u8x16 L = (u8x16)vec_perm((u8x16)L4, (u8x16)L4, spread); // La La La La Lb Lb Lb Lb ...
1597
1598 b32x4 m6 = vec_cmpeq(wd6, mm);
1599 b32x4 m4 = vec_cmpeq(wd4, mm);
1600
1601 b32x4 apply = vec_cmpne((u32x4)L, zero);
1602
1603 if (vec_all_eq((u32x4)L, zero))
1604 continue;
1605
1606 u8x16 I = vec_sr(L, s0); // L >> sharp[0]
1607 u8x16 H = vec_sr(L, v4u8);
1608 I = vec_min(I, s1); // min(L >> sharp[0], sharp[1])
1609 u8x16 E = vec_add(L, v2u8); // L + 2
1610 I = vec_max(I, v1u8); // max(min(L >> sharp[0], sharp[1]), 1)
1611 E = vec_add(E, E); // 2 * (L + 2)
1612 E = vec_add(E, I); // 2 * (L + 2) + limit
1613
1614 apply = vec_and(m4, apply);
1615
1616 if (vec_any_ne(wd6, zero)) {
1617 loop_filter_h_6_all(dst, E, I, H, PXSTRIDE(stride), apply, m6 HIGHBD_TAIL_SUFFIX);
1618 // loop_filter_h_8
1619 } else { // wd4 == 0 already tested
1620 loop_filter_h_4_all(dst, E, I, H, PXSTRIDE(stride), apply HIGHBD_TAIL_SUFFIX);
1621
1622 // loop_filter_h_4
1623 }
1624
1625 }
1626 }
1627
LPF(v_sb_uv)1628 void LPF(v_sb_uv)(pixel *dst, const ptrdiff_t stride,
1629 const uint32_t *const vmask,
1630 const uint8_t (*l)[4], ptrdiff_t b4_stride,
1631 const Av1FilterLUT *lut, const int w
1632 HIGHBD_DECL_SUFFIX)
1633 {
1634 unsigned vm = vmask[0] | vmask[1];
1635
1636 u32x4 vm0 = vec_splats(vm);
1637 u32x4 vm1 = vec_splats(vmask[1]);
1638
1639 u8x16 sharp = vec_xl(0, (uint8_t *)lut->sharp);
1640 u8x16 s0 = vec_splat(sharp, 0);
1641 u8x16 s1 = vec_splat(sharp, 8);
1642 u32x4 mm = (u32x4){1, 2, 4, 8};
1643 u32x4 v4u32 = vec_splat_u32(4);
1644 u32x4 zero = vec_splat_u32(0);
1645 u8x16 v1u8 = vec_splat_u8(1);
1646 u8x16 v2u8 = vec_splat_u8(2);
1647 u8x16 v4u8 = vec_splat_u8(4);
1648 const uint8_t (*pl)[4] = l;
1649 const uint8_t (*plb4)[4] = l - b4_stride;
1650 const u8x16 spread = (u8x16){
1651 0x00, 0x00, 0x00, 0x00,
1652 0x04, 0x04, 0x04, 0x04,
1653 0x08, 0x08, 0x08, 0x08,
1654 0x0c, 0x0c, 0x0c, 0x0c,
1655 };
1656
1657 for (;
1658 vm;
1659 vm >>= 4,
1660 mm = vec_sl(mm, v4u32),
1661 dst += 4 * 4,
1662 pl += 4,
1663 plb4 += 4) {
1664 if (!(vm & 0x0f))
1665 continue;
1666 u32x4 L_0 = (u32x4)vec_xl(0, (uint8_t *)pl);
1667 u32x4 L_b4 = (u32x4)vec_xl(0, (uint8_t *)plb4);
1668
1669 u32x4 wd6 = vec_and(vm1, mm); // vmask[1] & [1,2,4,8]
1670 u32x4 wd4 = vec_and(vm0, mm); // vm & [1,2,4,8]
1671
1672 b8x16 mask = vec_cmpeq((u8x16)L_0, (u8x16)zero);
1673
1674 u32x4 L4 = (u32x4)vec_sel((u8x16)L_0, (u8x16)L_b4, mask); // if !l[0][0] { l[-b4_stride][0] }
1675
1676 u8x16 L = (u8x16)vec_perm((u8x16)L4, (u8x16)L4, spread); // La La La La Lb Lb Lb Lb ...
1677
1678 b32x4 m6 = vec_cmpeq(wd6, mm);
1679 b32x4 m4 = vec_cmpeq(wd4, mm);
1680
1681 b32x4 apply = vec_cmpne((u32x4)L, zero);
1682
1683 if (vec_all_eq((u32x4)L, zero))
1684 continue;
1685
1686 u8x16 I = vec_sr(L, s0); // L >> sharp[0]
1687 u8x16 H = vec_sr(L, v4u8);
1688 I = vec_min(I, s1); // min(L >> sharp[0], sharp[1])
1689 u8x16 E = vec_add(L, v2u8); // L + 2
1690 I = vec_max(I, v1u8); // max(min(L >> sharp[0], sharp[1]), 1)
1691 E = vec_add(E, E); // 2 * (L + 2)
1692 E = vec_add(E, I); // 2 * (L + 2) + limit
1693
1694 apply = vec_and(apply, m4);
1695
1696 if (vec_any_ne(wd6, zero)) {
1697 loop_filter_v_6_all(dst, E, I, H, PXSTRIDE(stride), apply, m6 HIGHBD_TAIL_SUFFIX);
1698 } else {
1699 loop_filter_v_4_all(dst, E, I, H, PXSTRIDE(stride), apply HIGHBD_TAIL_SUFFIX);
1700 }
1701 }
1702 }
1703
1704 #endif // BITDEPTH
1705