• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "config.h"
22 #if HAVE_UNISTD_H
23 #include <unistd.h>
24 #endif
25 
26 #include "libavutil/avassert.h"
27 #include "libavutil/mem.h"
28 #include "libavutil/ppc/util_altivec.h"
29 
/*
 * Assert, through av_assert2(), that the low four bits of the address held in
 * ptr are zero, i.e. that ptr is 16-byte aligned.
 *
 * The argument is parenthesized so that an expression such as "p + 1" is
 * evaluated as pointer arithmetic before it is converted to an integer.
 * The expansion still ends in a semicolon, exactly as before, so existing
 * call sites (with or without their own semicolon) keep working.
 */
#define ASSERT_ALIGNED(ptr) av_assert2(!((uintptr_t)(ptr) & 0x0000000F));
31 
/*
 * load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3)
 *
 * Fill the variables srcM2, srcM1, srcP0, srcP1, srcP2 and srcP3 -- which the
 * invoking scope must declare, the macro assigns to them by name -- with the
 * six vectors that start at s - 2, s - 1, s, s + 1, s + 2 and s + 3.
 *
 * Big-endian variant: two vec_ld() loads (offsets -2 and 14) are combined with
 * vec_perm() using the caller-supplied permute vectors pm2 .. pp3. "ali" is
 * the switch selector; for the values 11 .. 15 one of the six results is the
 * second loaded vector itself and the later ones are permuted out of the
 * second and a third load (offset 30). The callers in this file pass
 * ((uintptr_t)src - 2) % 16 as ali and vec_lvsl(k, src) as the permutes.
 *
 * Little-endian variant: six vec_vsx_ld() loads; ali and the permute
 * arguments are ignored.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement; invoke it with a trailing semicolon.
 */
#if HAVE_BIGENDIAN
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3) do {\
    vec_u8 srcR1 = vec_ld(-2, s);\
    vec_u8 srcR2 = vec_ld(14, s);\
    /* srcM2 is produced the same way for every value of ali */\
    srcM2 = vec_perm(srcR1, srcR2, pm2);\
    switch (ali) {\
    case 11: {\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = srcR2;\
    } break;\
    case 12: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = srcR2;\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 13: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = srcR2;\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 14: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = srcR2;\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 15: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM1 = srcR2;\
        srcP0 = vec_perm(srcR2, srcR3, pp0);\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    default: {\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = vec_perm(srcR1, srcR2, pp3);\
    } break;\
    }\
} while (0)
#else
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3) do {\
    srcM2 = vec_vsx_ld(-2, s);\
    srcM1 = vec_vsx_ld(-1, s);\
    srcP0 = vec_vsx_ld(0, s);\
    srcP1 = vec_vsx_ld(1, s);\
    srcP2 = vec_vsx_ld(2, s);\
    srcP3 = vec_vsx_ld(3, s);\
} while (0)
#endif /* HAVE_BIGENDIAN */
101 
/* This code assumes stride % 16 == 0. */
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
/**
 * Horizontal six-tap low-pass filter over 16 rows of 16 pixels.
 *
 * For each output pixel x of a row the function evaluates, in 16-bit lanes,
 *
 *   (src[x-2] + src[x+3] - 5 * (src[x-1] + src[x+2])
 *             + 20 * (src[x] + src[x+1]) + 16) >> 5
 *
 * packs the result to unsigned 8 bit with saturation and passes it, together
 * with the 16 bytes currently stored at dst, to OP_U8_ALTIVEC; whatever that
 * macro yields is written back to dst.
 *
 * @param dst       destination; asserted to be 16-byte aligned on every row
 * @param src       source; pixels src[-2] .. src[18] of every row are used
 * @param dstStride added to dst after each row
 * @param srcStride added to src after each row
 */
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    int i;

    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v5ss  = vec_splat_s16(5);
    const vec_u16 v5us  = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2)); /* 5 << 2 */
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4)); /* 1 << 4 */

    /* Assigned by name inside load_alignment(); do not rename. */
    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    /*
     * Position of src - 2 within a 16-byte line. It is computed once, before
     * the loop, which relies on the stride assumption stated above.
     */
    int align = ((uintptr_t)src - 2) % 16;

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;

#if HAVE_BIGENDIAN
    /* Permute vectors for the six byte offsets, also computed only once. */
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */

    for (i = 0; i < 16; i++) {
        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        /* Widen every tap to 16 bit; A is the merge-high half, B the merge-low half. */
        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        /* The filter is symmetric: add the tap pairs that share a coefficient. */
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        /* 20 * inner pair + 16 (rounding term) */
        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        /* 5 * middle pair */
        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_h_lowpass_altivec */
192 
/* This code assumes stride % 16 == 0. */
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
/**
 * Vertical six-tap low-pass filter producing 16 rows of 16 pixels.
 *
 * Output row y is built from the six source rows y-2 .. y+3 as
 *
 *   (r[y-2] + r[y+3] - 5 * (r[y-1] + r[y+2]) + 20 * (r[y] + r[y+1]) + 16) >> 5
 *
 * evaluated in 16-bit lanes, packed to unsigned 8 bit with saturation and
 * combined with the 16 bytes already at dst through OP_U8_ALTIVEC before
 * being stored. Five rows are loaded up front; each iteration then loads one
 * new row and slides the window down by one.
 *
 * @param dst       destination; asserted to be 16-byte aligned on every row
 * @param src       source; rows src - 2 * srcStride .. src + 18 * srcStride are loaded
 * @param dstStride added to dst after each row
 * @param srcStride distance between consecutive source rows
 */
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    int y;

    LOAD_ZERO;
    vec_u8 perm;
#if HAVE_BIGENDIAN
    perm = vec_lvsl(0, src);
#endif
    const vec_s16 v5ss  = vec_splat_s16(5);
    const vec_u16 v5us  = vec_splat_u16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4)); /* 1 << 4 */
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2)); /* 5 << 2 */

    /* Start two rows above the first output row. */
    const uint8_t *row = src - (srcStride * 2);

    /* Prime the window: load rows -2 .. +2 and widen each to 16 bit right away. */
    const vec_u8 rowM2 = load_with_perm_vec(0, row, perm);
    vec_s16 winM2A = (vec_s16) VEC_MERGEH(zero_u8v, rowM2);
    vec_s16 winM2B = (vec_s16) VEC_MERGEL(zero_u8v, rowM2);
    row += srcStride;

    const vec_u8 rowM1 = load_with_perm_vec(0, row, perm);
    vec_s16 winM1A = (vec_s16) VEC_MERGEH(zero_u8v, rowM1);
    vec_s16 winM1B = (vec_s16) VEC_MERGEL(zero_u8v, rowM1);
    row += srcStride;

    const vec_u8 rowP0 = load_with_perm_vec(0, row, perm);
    vec_s16 winP0A = (vec_s16) VEC_MERGEH(zero_u8v, rowP0);
    vec_s16 winP0B = (vec_s16) VEC_MERGEL(zero_u8v, rowP0);
    row += srcStride;

    const vec_u8 rowP1 = load_with_perm_vec(0, row, perm);
    vec_s16 winP1A = (vec_s16) VEC_MERGEH(zero_u8v, rowP1);
    vec_s16 winP1B = (vec_s16) VEC_MERGEL(zero_u8v, rowP1);
    row += srcStride;

    const vec_u8 rowP2 = load_with_perm_vec(0, row, perm);
    vec_s16 winP2A = (vec_s16) VEC_MERGEH(zero_u8v, rowP2);
    vec_s16 winP2B = (vec_s16) VEC_MERGEL(zero_u8v, rowP2);
    row += srcStride;

    for (y = 0; y < 16; y++) {
        vec_u8 sum, fsum;

        /* Newest row of the window (y + 3). */
        const vec_u8  rowP3  = load_with_perm_vec(0, row, perm);
        const vec_s16 winP3A = (vec_s16) VEC_MERGEH(zero_u8v, rowP3);
        const vec_s16 winP3B = (vec_s16) VEC_MERGEL(zero_u8v, rowP3);

        /* Pair up the rows that share a filter coefficient. */
        const vec_s16 innerA  = vec_adds(winP0A, winP1A);
        const vec_s16 innerB  = vec_adds(winP0B, winP1B);
        const vec_s16 middleA = vec_adds(winM1A, winP2A);
        const vec_s16 middleB = vec_adds(winM1B, winP2B);
        const vec_s16 outerA  = vec_adds(winM2A, winP3A);
        const vec_s16 outerB  = vec_adds(winM2B, winP3B);

        /* 20 * inner + 16, and 5 * middle */
        const vec_s16 tap20A = vec_mladd(innerA, v20ss, v16ss);
        const vec_s16 tap20B = vec_mladd(innerB, v20ss, v16ss);
        const vec_s16 tap5A  = vec_mladd(middleA, v5ss, zero_s16v);
        const vec_s16 tap5B  = vec_mladd(middleB, v5ss, zero_s16v);

        /* (outer + 20 * inner + 16 - 5 * middle) >> 5 */
        const vec_s16 filtA = vec_sra(vec_sub(vec_add(outerA, tap20A), tap5A), v5us);
        const vec_s16 filtB = vec_sra(vec_sub(vec_add(outerB, tap20B), tap5B), v5us);

        row += srcStride;

        sum = vec_packsu(filtA, filtB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;

        /* Slide the window down by one row for the next iteration. */
        winM2A = winM1A;
        winM2B = winM1B;
        winM1A = winP0A;
        winM1B = winP0B;
        winP0A = winP1A;
        winP0B = winP1B;
        winP1A = winP2A;
        winP1B = winP2B;
        winP2A = winP3A;
        winP2B = winP3B;
    }
}
#endif /* PREFIX_h264_qpel16_v_lowpass_altivec */
294 
/* This code assumes stride % 16 == 0 *and* that tmp is properly aligned. */
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
/**
 * Combined horizontal + vertical six-tap low-pass filter for a 16x16 block.
 *
 * Pass 1 runs the horizontal filter over 21 source rows, starting two rows
 * above src, and stores the unrounded, unshifted 16-bit results
 *
 *   src[x-2] + src[x+3] - 5 * (src[x-1] + src[x+2]) + 20 * (src[x] + src[x+1])
 *
 * to tmp (two vectors per row, at byte offsets 0 and 16).
 *
 * Pass 2 runs the same filter vertically over those intermediate rows using
 * 32-bit products, computes (value + 512) >> 10, narrows with saturation to
 * unsigned 8 bit and passes the result, together with the 16 bytes currently
 * at dst, to OP_U8_ALTIVEC; whatever that macro yields is stored to dst.
 *
 * @param dst       destination; asserted to be 16-byte aligned on every row
 * @param tmp       scratch buffer for 21 rows of intermediate values;
 *                  accessed with vec_ld()/vec_st()
 * @param src       source; rows src - 2 * srcStride .. src + 18 * srcStride are read
 * @param dstStride added to dst after each output row
 * @param tmpStride added to tmp (an int16_t pointer) after each intermediate row
 * @param srcStride added to src after each source row
 */
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
                                                  const uint8_t *src,
                                                  int dstStride, int tmpStride,
                                                  int srcStride)
{
    int i;
    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v20ss  = vec_sl(vec_splat_s16(5), vec_splat_u16(2)); /* 5 << 2 */
    const vec_u32 v10ui  = vec_splat_u32(10);
    const vec_s16 v5ss   = vec_splat_s16(5);
    const vec_s16 v1ss   = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1), vec_splat_u32(9)); /* 1 << 9 */
    const vec_u32 v16ui  = vec_sl(vec_splat_u32(1), vec_splat_u32(4)); /* 1 << 4 */

    /*
     * Position of src - 2 within a 16-byte line, taken before src is moved up
     * two rows; this relies on the stride assumption stated above.
     */
    int align = ((uintptr_t)src - 2) % 16;

    /* Interleaves bytes 0..7 with bytes 8..15: 0, 8, 1, 9, ... */
    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    /* Sliding window over the intermediate rows used by pass 2. */
    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */

    /* Pass 1: horizontal filter, 16 output rows plus 5 rows of vertical context. */
    src -= (2 * srcStride);
    for (i = 0; i < 21; i++) {
        /* Assigned by name inside load_alignment(); do not rename. */
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
                srcP2A, srcP2B, srcP3A, srcP3B,
                srcM1A, srcM1B, srcM2A, srcM2B,
                sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
                pp1A, pp1B, pp2A, pp2B, psumA, psumB;

        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        /* 20 * inner pair + outer pair; no rounding term in this pass */
        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        /* 5 * middle pair */
        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    /* Pass 2: prime the vertical window with intermediate rows -2 .. +2. */
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0; i < 16; i++) {
        vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
                pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
                pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
                ssumAe, ssumAo, ssumBe, ssumBo;
        vec_s16 ssume, ssumo;
        vec_u8 fsum, sumv, sum;

        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        /* Not const: rewritten in place on little-endian below. */
        vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        /* Slide the window down by one row for the next iteration. */
        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        /* 32-bit products of the even ("e") and odd ("o") 16-bit elements. */
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        /* The outer pair has coefficient 1: multiply by one to widen the odd elements ... */
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Bo = vec_mulo(sum3B, v1ss);
#if !HAVE_BIGENDIAN
        sum3A = (vec_s16)vec_perm(sum3A, sum3A,vcswapi2s(0,1,2,3));
        sum3B = (vec_s16)vec_perm(sum3B, sum3B,vcswapi2s(0,1,2,3));
#endif
        /* ... and arithmetic-shift each 32-bit lane right by 16 for the even ones. */
        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);

        /* Add the rounding term 512. */
        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        /* sumv holds the even results followed by the odd ones; mperm re-interleaves them. */
        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_hv_lowpass_altivec */
484