1 /*
2 * AltiVec optimizations for libjpeg-turbo
3 *
4 * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
5 *
6 * This software is provided 'as-is', without any express or implied
7 * warranty. In no event will the authors be held liable for any damages
8 * arising from the use of this software.
9 *
10 * Permission is granted to anyone to use this software for any purpose,
11 * including commercial applications, and to alter it and redistribute it
12 * freely, subject to the following restrictions:
13 *
14 * 1. The origin of this software must not be misrepresented; you must not
15 * claim that you wrote the original software. If you use this software
16 * in a product, an acknowledgment in the product documentation would be
17 * appreciated but is not required.
18 * 2. Altered source versions must be plainly marked as such, and must not be
19 * misrepresented as being the original software.
20 * 3. This notice may not be removed or altered from any source distribution.
21 */
22
23 /* SLOW INTEGER INVERSE DCT */
24
25 #include "jsimd_altivec.h"
26
27
/*
 * Fixed-point multipliers used by the slow integer inverse DCT.  Each value
 * is the real constant named in its trailing comment, multiplied by
 * 2^CONST_BITS (8192) and rounded to the nearest integer.
 */
#define F_0_298  2446           /* FIX(0.298631336) */
#define F_0_390  3196           /* FIX(0.390180644) */
#define F_0_541  4433           /* FIX(0.541196100) */
#define F_0_765  6270           /* FIX(0.765366865) */
#define F_0_899  7373           /* FIX(0.899976223) */
#define F_1_175  9633           /* FIX(1.175875602) */
#define F_1_501  12299          /* FIX(1.501321110) */
#define F_1_847  15137          /* FIX(1.847759065) */
#define F_1_961  16069          /* FIX(1.961570560) */
#define F_2_053  16819          /* FIX(2.053119869) */
#define F_2_562  20995          /* FIX(2.562915447) */
#define F_3_072  25172          /* FIX(3.072711026) */

/* Number of fractional bits carried by the F_* multipliers above. */
#define CONST_BITS  13
/* Number of extra scaling bits left in the values produced by pass 1. */
#define PASS1_BITS  2
/*
 * Right-shift counts applied at the end of pass 1 and pass 2 respectively
 * (11 and 18 with the values above).  Pass 2 removes the PASS1_BITS left in
 * by pass 1 plus three further bits, i.e. a division by 8.
 */
#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
#define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
45
46
/*
 * DO_IDCT(in, PASS) -- one 1-D pass of the slow integer inverse DCT, applied
 * to eight vectors of 16-bit values at once.
 *
 *   in    Name prefix of the eight input vectors.  The macro reads in##0 ..
 *         in##7 (for example col0 .. col7) and uses in##26l/h, in##71l/h and
 *         in##53l/h as scratch vectors.
 *   PASS  Suffix (1 or 2 in this file) selecting the rounding term
 *         pd_descale_p##PASS and the shift count descale_p##PASS.
 *
 * The results are left in out0 .. out7.  The macro expands to a
 * brace-enclosed block and expects the temporaries (tmp*, z*, out*l/h) and
 * the constant vectors (pw_*, pd_*, const_bits, descale_p*) to be declared
 * in the enclosing scope.
 *
 * Every 32-bit intermediate is held as a pair of vectors: the "l" vector is
 * produced from vec_mergeh/vec_unpackh and the "h" vector from
 * vec_mergel/vec_unpackl; vec_pack(xl, xh) recombines a pair into one
 * vector of 16-bit values.
 *
 * vec_msums(a, b, c) multiplies corresponding 16-bit elements of a and b and
 * adds each adjacent pair of products to the matching 32-bit element of c.
 * Interleaving two inputs with vec_merge* therefore makes every 32-bit lane
 * compute  x * k0 + y * k1 + c  for one (k0, k1) constant pair.
 */
#define DO_IDCT(in, PASS) { \
  /* Even part \
   * \
   * (Original) \
   * z1 = (z2 + z3) * 0.541196100; \
   * tmp2 = z1 + z3 * -1.847759065; \
   * tmp3 = z1 + z2 * 0.765366865; \
   * \
   * (This implementation) \
   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
   */ \
  \
  in##26l = vec_mergeh(in##2, in##6); \
  in##26h = vec_mergel(in##2, in##6); \
  \
  tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero); \
  tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero); \
  tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero); \
  tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero); \
  \
  tmp0 = vec_add(in##0, in##4); \
  tmp1 = vec_sub(in##0, in##4); \
  \
  /* Widen tmp0 to 32 bits, shift it left by const_bits and add the \
   * rounding term.  The term is added once here (and to tmp1 below) so \
   * that it is already part of every sum that is shifted right by \
   * descale_p##PASS in the output stage. \
   */ \
  tmp0l = vec_unpackh(tmp0); \
  tmp0h = vec_unpackl(tmp0); \
  tmp0l = vec_sl(tmp0l, const_bits); \
  tmp0h = vec_sl(tmp0h, const_bits); \
  tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \
  tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \
  \
  tmp10l = vec_add(tmp0l, tmp3l); \
  tmp10h = vec_add(tmp0h, tmp3h); \
  tmp13l = vec_sub(tmp0l, tmp3l); \
  tmp13h = vec_sub(tmp0h, tmp3h); \
  \
  /* Same widening, scaling and rounding for tmp1 */ \
  tmp1l = vec_unpackh(tmp1); \
  tmp1h = vec_unpackl(tmp1); \
  tmp1l = vec_sl(tmp1l, const_bits); \
  tmp1h = vec_sl(tmp1h, const_bits); \
  tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \
  tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \
  \
  tmp11l = vec_add(tmp1l, tmp2l); \
  tmp11h = vec_add(tmp1h, tmp2h); \
  tmp12l = vec_sub(tmp1l, tmp2l); \
  tmp12h = vec_sub(tmp1h, tmp2h); \
  \
  /* Odd part */ \
  \
  z3 = vec_add(in##3, in##7); \
  z4 = vec_add(in##1, in##5); \
  \
  /* (Original) \
   * z5 = (z3 + z4) * 1.175875602; \
   * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
   * z3 += z5; z4 += z5; \
   * \
   * (This implementation) \
   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
   */ \
  \
  z34l = vec_mergeh(z3, z4); \
  z34h = vec_mergel(z3, z4); \
  \
  z3l = vec_msums(z34l, pw_mf078_f117, pd_zero); \
  z3h = vec_msums(z34h, pw_mf078_f117, pd_zero); \
  z4l = vec_msums(z34l, pw_f117_f078, pd_zero); \
  z4h = vec_msums(z34h, pw_f117_f078, pd_zero); \
  \
  /* (Original) \
   * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \
   * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \
   * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \
   * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
   * tmp0 += z1 + z3; tmp1 += z2 + z4; \
   * tmp2 += z2 + z3; tmp3 += z1 + z4; \
   * \
   * (This implementation) \
   * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
   * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
   * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
   * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
   * tmp0 += z3; tmp1 += z4; \
   * tmp2 += z3; tmp3 += z4; \
   */ \
  \
  /* The z3/z4 terms are folded in through the accumulator operand of \
   * vec_msums rather than with separate additions. \
   */ \
  in##71l = vec_mergeh(in##7, in##1); \
  in##71h = vec_mergel(in##7, in##1); \
  \
  tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \
  tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \
  tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \
  tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \
  \
  in##53l = vec_mergeh(in##5, in##3); \
  in##53h = vec_mergel(in##5, in##3); \
  \
  tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \
  tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \
  tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \
  tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \
  \
  /* Final output stage: out(k) = even + odd, out(7 - k) = even - odd, \
   * each shifted right arithmetically and packed back to 16 bits. \
   */ \
  \
  out0l = vec_add(tmp10l, tmp3l); \
  out0h = vec_add(tmp10h, tmp3h); \
  out7l = vec_sub(tmp10l, tmp3l); \
  out7h = vec_sub(tmp10h, tmp3h); \
  \
  out0l = vec_sra(out0l, descale_p##PASS); \
  out0h = vec_sra(out0h, descale_p##PASS); \
  out7l = vec_sra(out7l, descale_p##PASS); \
  out7h = vec_sra(out7h, descale_p##PASS); \
  \
  out0 = vec_pack(out0l, out0h); \
  out7 = vec_pack(out7l, out7h); \
  \
  out1l = vec_add(tmp11l, tmp2l); \
  out1h = vec_add(tmp11h, tmp2h); \
  out6l = vec_sub(tmp11l, tmp2l); \
  out6h = vec_sub(tmp11h, tmp2h); \
  \
  out1l = vec_sra(out1l, descale_p##PASS); \
  out1h = vec_sra(out1h, descale_p##PASS); \
  out6l = vec_sra(out6l, descale_p##PASS); \
  out6h = vec_sra(out6h, descale_p##PASS); \
  \
  out1 = vec_pack(out1l, out1h); \
  out6 = vec_pack(out6l, out6h); \
  \
  out2l = vec_add(tmp12l, tmp1l); \
  out2h = vec_add(tmp12h, tmp1h); \
  out5l = vec_sub(tmp12l, tmp1l); \
  out5h = vec_sub(tmp12h, tmp1h); \
  \
  out2l = vec_sra(out2l, descale_p##PASS); \
  out2h = vec_sra(out2h, descale_p##PASS); \
  out5l = vec_sra(out5l, descale_p##PASS); \
  out5h = vec_sra(out5h, descale_p##PASS); \
  \
  out2 = vec_pack(out2l, out2h); \
  out5 = vec_pack(out5l, out5h); \
  \
  out3l = vec_add(tmp13l, tmp0l); \
  out3h = vec_add(tmp13h, tmp0h); \
  out4l = vec_sub(tmp13l, tmp0l); \
  out4h = vec_sub(tmp13h, tmp0h); \
  \
  out3l = vec_sra(out3l, descale_p##PASS); \
  out3h = vec_sra(out3h, descale_p##PASS); \
  out4l = vec_sra(out4l, descale_p##PASS); \
  out4h = vec_sra(out4h, descale_p##PASS); \
  \
  out3 = vec_pack(out3l, out3h); \
  out4 = vec_pack(out4l, out4h); \
}
205
206
/*
 * Dequantize one block of coefficients, run the two-pass slow integer
 * inverse DCT on it (columns, then rows) and store the result as eight rows
 * of eight bytes.
 *
 *   dct_table_  Multiplier table, accessed as an array of short and read as
 *               eight 16-byte vectors (byte offsets 0 .. 112).
 *   coef_block  Input coefficients, read as eight 16-byte vectors in the
 *               same way.
 *   output_buf  Array of row pointers; entries 0 .. 7 are used.
 *   output_col  Offset added to each row pointer to find the first byte
 *               written in that row.
 *
 * NOTE(review): vec_ld ignores the low four bits of the effective address,
 * so coef_block and dct_table are presumably 16-byte aligned -- confirm
 * against the callers.
 */
void jsimd_idct_islow_altivec(void *dct_table_, JCOEFPTR coef_block,
                              JSAMPARRAY output_buf, JDIMENSION output_col)
{
  short *dct_table = (short *)dct_table_;
  int *outptr;

  /* The col26l, row71h, ... names are not referenced literally below; they
   * are formed by token pasting inside DO_IDCT().
   */
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
    tmp0, tmp1, tmp2, tmp3, z3, z4,
    z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
    row71l, row71h, row26l, row26h, row53l, row53h,
    out0, out1, out2, out3, out4, out5, out6, out7;
  __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
    tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
    z3l, z3h, z4l, z4h,
    out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
    out5l, out5h, out6l, out6h, out7l, out7h;
  __vector signed char outb;

  /* Constants */
  /* Each pw_* vector repeats one pair of 16-bit multipliers; DO_IDCT() feeds
   * them to vec_msums together with two interleaved input vectors.
   */
  __vector short pw_zero = { __8X(0) },
    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
  /* pd_descale_pN is half of 2^DESCALE_PN, i.e. the term that turns the
   * final right shift of pass N into a rounding shift.
   */
  __vector int pd_zero = { __4X(0) },
    pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
    descale_p2 = { __4X(DESCALE_P2) },
    const_bits = { __4X(CONST_BITS) };
  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };

  /* Pass 1: process columns */

  col0 = vec_ld(0, coef_block);
  col1 = vec_ld(16, coef_block);
  col2 = vec_ld(32, coef_block);
  col3 = vec_ld(48, coef_block);
  col4 = vec_ld(64, coef_block);
  col5 = vec_ld(80, coef_block);
  col6 = vec_ld(96, coef_block);
  col7 = vec_ld(112, coef_block);

  /* OR together every vector except the first; the result is all-zero only
   * if each of those raw (not yet dequantized) coefficients is zero.
   */
  tmp1 = vec_or(col1, col2);
  tmp2 = vec_or(col3, col4);
  tmp1 = vec_or(tmp1, tmp2);
  tmp3 = vec_or(col5, col6);
  tmp3 = vec_or(tmp3, col7);
  tmp1 = vec_or(tmp1, tmp3);

  /* The first vector is dequantized unconditionally; both branches below
   * need it.
   */
  quant0 = vec_ld(0, dct_table);
  col0 = vec_mladd(col0, quant0, pw_zero);

  if (vec_all_eq(tmp1, pw_zero)) {
    /* AC terms all zero */

    /* Shortcut for pass 1: scale the dequantized first vector by
     * 2^PASS1_BITS and broadcast element k of it across rowk, instead of
     * running DO_IDCT() followed by TRANSPOSE().
     */
    col0 = vec_sl(col0, pass1_bits);

    row0 = vec_splat(col0, 0);
    row1 = vec_splat(col0, 1);
    row2 = vec_splat(col0, 2);
    row3 = vec_splat(col0, 3);
    row4 = vec_splat(col0, 4);
    row5 = vec_splat(col0, 5);
    row6 = vec_splat(col0, 6);
    row7 = vec_splat(col0, 7);

  } else {

    quant1 = vec_ld(16, dct_table);
    quant2 = vec_ld(32, dct_table);
    quant3 = vec_ld(48, dct_table);
    quant4 = vec_ld(64, dct_table);
    quant5 = vec_ld(80, dct_table);
    quant6 = vec_ld(96, dct_table);
    quant7 = vec_ld(112, dct_table);

    /* Dequantize: element-wise 16-bit multiply (low halves of the
     * products), with a zero addend.
     */
    col1 = vec_mladd(col1, quant1, pw_zero);
    col2 = vec_mladd(col2, quant2, pw_zero);
    col3 = vec_mladd(col3, quant3, pw_zero);
    col4 = vec_mladd(col4, quant4, pw_zero);
    col5 = vec_mladd(col5, quant5, pw_zero);
    col6 = vec_mladd(col6, quant6, pw_zero);
    col7 = vec_mladd(col7, quant7, pw_zero);

    DO_IDCT(col, 1);

    /* TRANSPOSE() is defined outside this file section; here it is used to
     * turn out0..out7 into row0..row7 for the second pass.
     */
    TRANSPOSE(out, row);
  }

  /* Pass 2: process rows */

  DO_IDCT(row, 2);

  TRANSPOSE(out, col);

  /* Output: for each of the eight result vectors, pack the 16-bit values to
   * signed bytes with saturation (the vector is packed with itself, so both
   * 8-byte halves of outb hold the same data), add CENTERJSAMPLE to every
   * byte and write 8 bytes using two 4-byte element stores.
   *
   * NOTE(review): vec_ste chooses which element to store from the low bits
   * of the address, so this looks like it relies on
   * output_buf[n] + output_col being 8-byte aligned -- confirm against the
   * callers.
   */

  outb = vec_packs(col0, col0);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[0] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col1, col1);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[1] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col2, col2);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[2] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col3, col3);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[3] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col4, col4);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[4] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col5, col5);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[5] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col6, col6);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[6] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col7, col7);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[7] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);
}
358