/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#if HAVE_UNISTD_H
#include <unistd.h>
#endif

#include "libavutil/avassert.h"
#include "libavutil/mem.h"
#include "libavutil/ppc/util_altivec.h"

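/* vec_ld/vec_st only perform 16-byte aligned accesses, so the destination
 * pointer is checked before every aligned store below. */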
#define ASSERT_ALIGNED(ptr) av_assert2(!((uintptr_t)ptr&0x0000000F));

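/* load_alignment() fills srcM2, srcM1, srcP0, srcP1, srcP2 and srcP3 with the
 * six byte-shifted views of the source row (offsets -2 .. +3) needed by the
 * 6-tap filter. On big-endian AltiVec, unaligned loads are emulated with
 * aligned vec_ld plus vec_perm using vec_lvsl-generated masks; the switch on
 * the alignment only loads a third vector (srcR3) when the window extends
 * past the first two 16-byte blocks, and uses a plain copy when an offset
 * lands exactly on a block boundary. */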
#if HAVE_BIGENDIAN
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    vec_u8 srcR1 = vec_ld(-2, s);\
    vec_u8 srcR2 = vec_ld(14, s);\
    switch (ali) {\
    default: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = vec_perm(srcR1, srcR2, pp3);\
    } break;\
    case 11: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = srcR2;\
    } break;\
    case 12: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = srcR2;\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 13: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = srcR2;\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 14: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = srcR2;\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 15: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = srcR2;\
        srcP0 = vec_perm(srcR2, srcR3, pp0);\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    }\
}
#else
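/* On little-endian POWER with VSX, vec_vsx_ld handles unaligned loads
 * directly, so no permute masks are needed. */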
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    srcM2 = vec_vsx_ld(-2, s);\
    srcM1 = vec_vsx_ld(-1, s);\
    srcP0 = vec_vsx_ld(0, s);\
    srcP1 = vec_vsx_ld(1, s);\
    srcP2 = vec_vsx_ld(2, s);\
    srcP3 = vec_vsx_ld(3, s);\
}
#endif /* HAVE_BIGENDIAN */

/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;

#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */

    for (i = 0 ; i < 16 ; i ++) {
        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

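        /* H.264 6-tap half-pel filter:
         * out = (srcM2 - 5*srcM1 + 20*srcP0 + 20*srcP1 - 5*srcP2 + srcP3 + 16) >> 5,
         * computed on the high (A) and low (B) halves of the 16-pixel row. */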
        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_h_lowpass_altivec */

/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 perm;
#if HAVE_BIGENDIAN
    perm = vec_lvsl(0, src);
#endif
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    const uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcM1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP0 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3;

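    /* Vertical pass: the same (1, -5, 20, 20, -5, 1) kernel is applied down
     * each column; the register shuffle inside the loop slides the six-row
     * window down by one row per iteration so each row is loaded only once. */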
    for (i = 0 ; i < 16 ; i++) {
        srcP3 = load_with_perm_vec(0, srcbis, perm);
        srcbis += srcStride;

        srcP3ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_v_lowpass_altivec */

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
                                                  const uint8_t *src,
                                                  int dstStride, int tmpStride,
                                                  int srcStride)
{
    register int i;
    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;

#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */

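    /* Two-pass filter: the first loop runs the horizontal 6-tap filter over
     * 21 rows (two above and three below the 16 output rows) and stores the
     * unrounded 16-bit intermediates in tmp; the second loop runs the
     * vertical 6-tap filter over tmp in 32-bit precision, rounding with +512
     * and shifting right by 10. */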
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

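        /* The 32-bit arithmetic is split into even and odd halfword lanes:
         * vec_mule/vec_mulo form the 20* and 5* products, sum3 is widened by
         * an arithmetic shift (after a halfword swap on little endian), and
         * the even/odd results are re-interleaved by mperm after packing. */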
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Bo = vec_mulo(sum3B, v1ss);
#if !HAVE_BIGENDIAN
        sum3A = (vec_s16)vec_perm(sum3A, sum3A,vcswapi2s(0,1,2,3));
        sum3B = (vec_s16)vec_perm(sum3B, sum3B,vcswapi2s(0,1,2,3));
#endif
        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_hv_lowpass_altivec */