1 /*
2 * Copyright (c) 2002 Brian Foley
3 * Copyright (c) 2002 Dieter Shirley
4 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23 #include "config.h"
24
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/ppc/cpu.h"
28 #include "libavutil/ppc/util_altivec.h"
29
30 #include "libavcodec/hpeldsp.h"
31
32 #include "hpeldsp_altivec.h"
33
34 #if HAVE_ALTIVEC
35 /* next one assumes that ((line_size % 16) == 0) */
ff_put_pixels16_altivec(uint8_t * block,const uint8_t * pixels,ptrdiff_t line_size,int h)36 void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
37 {
38 register vector unsigned char pixelsv1;
39 register vector unsigned char pixelsv1B;
40 register vector unsigned char pixelsv1C;
41 register vector unsigned char pixelsv1D;
42
43 int i;
44 register ptrdiff_t line_size_2 = line_size << 1;
45 register ptrdiff_t line_size_3 = line_size + line_size_2;
46 register ptrdiff_t line_size_4 = line_size << 2;
47
48 // hand-unrolling the loop by 4 gains about 15%
49 // mininum execution time goes from 74 to 60 cycles
50 // it's faster than -funroll-loops, but using
51 // -funroll-loops w/ this is bad - 74 cycles again.
52 // all this is on a 7450, tuning for the 7450
53 for (i = 0; i < h; i += 4) {
54 pixelsv1 = unaligned_load( 0, pixels);
55 pixelsv1B = unaligned_load(line_size, pixels);
56 pixelsv1C = unaligned_load(line_size_2, pixels);
57 pixelsv1D = unaligned_load(line_size_3, pixels);
58 VEC_ST(pixelsv1, 0, (unsigned char*)block);
59 VEC_ST(pixelsv1B, line_size, (unsigned char*)block);
60 VEC_ST(pixelsv1C, line_size_2, (unsigned char*)block);
61 VEC_ST(pixelsv1D, line_size_3, (unsigned char*)block);
62 pixels+=line_size_4;
63 block +=line_size_4;
64 }
65 }
66
67 /* next one assumes that ((line_size % 16) == 0) */
68 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
ff_avg_pixels16_altivec(uint8_t * block,const uint8_t * pixels,ptrdiff_t line_size,int h)69 void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
70 {
71 register vector unsigned char pixelsv, blockv;
72
73 int i;
74 for (i = 0; i < h; i++) {
75 blockv = vec_ld(0, block);
76 pixelsv = VEC_LD( 0, pixels);
77 blockv = vec_avg(blockv,pixelsv);
78 vec_st(blockv, 0, (unsigned char*)block);
79 pixels+=line_size;
80 block +=line_size;
81 }
82 }
83
84 /* next one assumes that ((line_size % 8) == 0) */
avg_pixels8_altivec(uint8_t * block,const uint8_t * pixels,ptrdiff_t line_size,int h)85 static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
86 {
87 register vector unsigned char pixelsv, blockv;
88 int i;
89
90 for (i = 0; i < h; i++) {
91 /* block is 8 bytes-aligned, so we're either in the
92 left block (16 bytes-aligned) or in the right block (not) */
93 int rightside = ((unsigned long)block & 0x0000000F);
94
95 blockv = vec_ld(0, block);
96 pixelsv = VEC_LD( 0, pixels);
97
98 if (rightside) {
99 pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
100 } else {
101 pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
102 }
103
104 blockv = vec_avg(blockv, pixelsv);
105
106 vec_st(blockv, 0, block);
107
108 pixels += line_size;
109 block += line_size;
110 }
111 }
112
113 /* next one assumes that ((line_size % 8) == 0) */
put_pixels8_xy2_altivec(uint8_t * block,const uint8_t * pixels,ptrdiff_t line_size,int h)114 static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
115 {
116 register int i;
117 register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
118 register vector unsigned char blockv;
119 register vector unsigned short pixelssum1, pixelssum2, temp3;
120 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
121 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
122
123 pixelsv1 = VEC_LD(0, pixels);
124 pixelsv2 = VEC_LD(1, pixels);
125 pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
126 pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
127
128 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
129 (vector unsigned short)pixelsv2);
130 pixelssum1 = vec_add(pixelssum1, vctwo);
131
132 for (i = 0; i < h ; i++) {
133 int rightside = ((unsigned long)block & 0x0000000F);
134 blockv = vec_ld(0, block);
135
136 pixelsv1 = unaligned_load(line_size, pixels);
137 pixelsv2 = unaligned_load(line_size+1, pixels);
138 pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
139 pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
140 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
141 (vector unsigned short)pixelsv2);
142 temp3 = vec_add(pixelssum1, pixelssum2);
143 temp3 = vec_sra(temp3, vctwo);
144 pixelssum1 = vec_add(pixelssum2, vctwo);
145 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
146
147 if (rightside) {
148 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
149 } else {
150 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
151 }
152
153 vec_st(blockv, 0, block);
154
155 block += line_size;
156 pixels += line_size;
157 }
158 }
159
160 /* next one assumes that ((line_size % 8) == 0) */
put_no_rnd_pixels8_xy2_altivec(uint8_t * block,const uint8_t * pixels,ptrdiff_t line_size,int h)161 static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
162 {
163 register int i;
164 register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
165 register vector unsigned char blockv;
166 register vector unsigned short pixelssum1, pixelssum2, temp3;
167 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
168 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
169 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
170
171 pixelsv1 = VEC_LD(0, pixels);
172 pixelsv2 = VEC_LD(1, pixels);
173 pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
174 pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
175 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
176 (vector unsigned short)pixelsv2);
177 pixelssum1 = vec_add(pixelssum1, vcone);
178
179 for (i = 0; i < h ; i++) {
180 int rightside = ((unsigned long)block & 0x0000000F);
181 blockv = vec_ld(0, block);
182
183 pixelsv1 = unaligned_load(line_size, pixels);
184 pixelsv2 = unaligned_load(line_size+1, pixels);
185 pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
186 pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
187 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
188 (vector unsigned short)pixelsv2);
189 temp3 = vec_add(pixelssum1, pixelssum2);
190 temp3 = vec_sra(temp3, vctwo);
191 pixelssum1 = vec_add(pixelssum2, vcone);
192 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
193
194 if (rightside) {
195 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
196 } else {
197 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
198 }
199
200 vec_st(blockv, 0, block);
201
202 block += line_size;
203 pixels += line_size;
204 }
205 }
206
207 /* next one assumes that ((line_size % 16) == 0) */
put_pixels16_xy2_altivec(uint8_t * block,const uint8_t * pixels,ptrdiff_t line_size,int h)208 static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
209 {
210 register int i;
211 register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
212 register vector unsigned char blockv;
213 register vector unsigned short temp3, temp4,
214 pixelssum1, pixelssum2, pixelssum3, pixelssum4;
215 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
216 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
217
218 pixelsv1 = VEC_LD(0, pixels);
219 pixelsv2 = VEC_LD(1, pixels);
220 pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
221 pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
222 pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
223 pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
224 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
225 (vector unsigned short)pixelsv4);
226 pixelssum3 = vec_add(pixelssum3, vctwo);
227 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
228 (vector unsigned short)pixelsv2);
229 pixelssum1 = vec_add(pixelssum1, vctwo);
230
231 for (i = 0; i < h ; i++) {
232 blockv = vec_ld(0, block);
233
234 pixelsv1 = unaligned_load(line_size, pixels);
235 pixelsv2 = unaligned_load(line_size+1, pixels);
236
237 pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
238 pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
239 pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
240 pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
241 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
242 (vector unsigned short)pixelsv4);
243 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
244 (vector unsigned short)pixelsv2);
245 temp4 = vec_add(pixelssum3, pixelssum4);
246 temp4 = vec_sra(temp4, vctwo);
247 temp3 = vec_add(pixelssum1, pixelssum2);
248 temp3 = vec_sra(temp3, vctwo);
249
250 pixelssum3 = vec_add(pixelssum4, vctwo);
251 pixelssum1 = vec_add(pixelssum2, vctwo);
252
253 blockv = vec_packsu(temp3, temp4);
254
255 vec_st(blockv, 0, block);
256
257 block += line_size;
258 pixels += line_size;
259 }
260 }
261
262 /* next one assumes that ((line_size % 16) == 0) */
put_no_rnd_pixels16_xy2_altivec(uint8_t * block,const uint8_t * pixels,ptrdiff_t line_size,int h)263 static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
264 {
265 register int i;
266 register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
267 register vector unsigned char blockv;
268 register vector unsigned short temp3, temp4,
269 pixelssum1, pixelssum2, pixelssum3, pixelssum4;
270 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
271 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
272 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
273
274 pixelsv1 = VEC_LD(0, pixels);
275 pixelsv2 = VEC_LD(1, pixels);
276 pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
277 pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
278 pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
279 pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
280 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
281 (vector unsigned short)pixelsv4);
282 pixelssum3 = vec_add(pixelssum3, vcone);
283 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
284 (vector unsigned short)pixelsv2);
285 pixelssum1 = vec_add(pixelssum1, vcone);
286
287 for (i = 0; i < h ; i++) {
288 pixelsv1 = unaligned_load(line_size, pixels);
289 pixelsv2 = unaligned_load(line_size+1, pixels);
290
291 pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
292 pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
293 pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
294 pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
295 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
296 (vector unsigned short)pixelsv4);
297 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
298 (vector unsigned short)pixelsv2);
299 temp4 = vec_add(pixelssum3, pixelssum4);
300 temp4 = vec_sra(temp4, vctwo);
301 temp3 = vec_add(pixelssum1, pixelssum2);
302 temp3 = vec_sra(temp3, vctwo);
303
304 pixelssum3 = vec_add(pixelssum4, vcone);
305 pixelssum1 = vec_add(pixelssum2, vcone);
306
307 blockv = vec_packsu(temp3, temp4);
308
309 VEC_ST(blockv, 0, block);
310
311 block += line_size;
312 pixels += line_size;
313 }
314 }
315
316 /* next one assumes that ((line_size % 8) == 0) */
avg_pixels8_xy2_altivec(uint8_t * block,const uint8_t * pixels,ptrdiff_t line_size,int h)317 static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
318 {
319 register int i;
320 register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
321 register vector unsigned char blockv, blocktemp;
322 register vector unsigned short pixelssum1, pixelssum2, temp3;
323
324 register const vector unsigned char vczero = (const vector unsigned char)
325 vec_splat_u8(0);
326 register const vector unsigned short vctwo = (const vector unsigned short)
327 vec_splat_u16(2);
328
329 pixelsv1 = VEC_LD(0, pixels);
330 pixelsv2 = VEC_LD(1, pixels);
331 pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
332 pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
333 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
334 (vector unsigned short)pixelsv2);
335 pixelssum1 = vec_add(pixelssum1, vctwo);
336
337 for (i = 0; i < h ; i++) {
338 int rightside = ((unsigned long)block & 0x0000000F);
339 blockv = vec_ld(0, block);
340
341 pixelsv1 = unaligned_load(line_size, pixels);
342 pixelsv2 = unaligned_load(line_size+1, pixels);
343
344 pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
345 pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
346 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
347 (vector unsigned short)pixelsv2);
348 temp3 = vec_add(pixelssum1, pixelssum2);
349 temp3 = vec_sra(temp3, vctwo);
350 pixelssum1 = vec_add(pixelssum2, vctwo);
351 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
352
353 if (rightside) {
354 blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
355 } else {
356 blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
357 }
358
359 blockv = vec_avg(blocktemp, blockv);
360 vec_st(blockv, 0, block);
361
362 block += line_size;
363 pixels += line_size;
364 }
365 }
366 #endif /* HAVE_ALTIVEC */
367
ff_hpeldsp_init_ppc(HpelDSPContext * c,int flags)368 av_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags)
369 {
370 #if HAVE_ALTIVEC
371 if (!PPC_ALTIVEC(av_get_cpu_flags()))
372 return;
373
374 c->avg_pixels_tab[0][0] = ff_avg_pixels16_altivec;
375 c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
376 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
377
378 c->put_pixels_tab[0][0] = ff_put_pixels16_altivec;
379 c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
380 c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
381
382 c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_altivec;
383 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
384 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
385 #endif /* HAVE_ALTIVEC */
386 }
387