/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* SLOW INTEGER INVERSE DCT */

#include "jsimd_altivec.h"


/*
 * Fixed-point multipliers for the slow integer IDCT.  Each value is the real
 * constant named in its FIX() comment, scaled by 2^CONST_BITS (8192) and
 * rounded to the nearest integer.
 */
#define F_0_298  2446           /* FIX(0.298631336) */
#define F_0_390  3196           /* FIX(0.390180644) */
#define F_0_541  4433           /* FIX(0.541196100) */
#define F_0_765  6270           /* FIX(0.765366865) */
#define F_0_899  7373           /* FIX(0.899976223) */
#define F_1_175  9633           /* FIX(1.175875602) */
#define F_1_501  12299          /* FIX(1.501321110) */
#define F_1_847  15137          /* FIX(1.847759065) */
#define F_1_961  16069          /* FIX(1.961570560) */
#define F_2_053  16819          /* FIX(2.053119869) */
#define F_2_562  20995          /* FIX(2.562915447) */
#define F_3_072  25172          /* FIX(3.072711026) */

/* Number of fractional bits carried by the F_* multipliers above. */
#define CONST_BITS  13
/* Extra fractional bits that pass 1 leaves in its 16-bit results. */
#define PASS1_BITS  2
/*
 * Right-shift counts applied to the 32-bit sums at the end of each pass.
 * Pass 1 removes CONST_BITS but keeps PASS1_BITS; pass 2 removes both plus
 * three more bits (presumably the factor of 8 of the 2-D transform --
 * confirm against the scalar reference implementation).
 */
#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
#define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)


/*
 * DO_IDCT(in, PASS) -- one 1-D pass of the 8-point slow integer inverse DCT.
 *
 *   in    identifier prefix of the eight 16-bit input vectors
 *         (in##0 .. in##7); it is also pasted into the scratch names
 *         in##26l/h, in##71l/h and in##53l/h.
 *   PASS  1 or 2; selects the rounding vector pd_descale_p##PASS and the
 *         shift-count vector descale_p##PASS.
 *
 * Results are written to out0 .. out7.  The macro declares nothing: every
 * temporary (tmp*, z*, out*, the merged in##.. pairs) and every constant
 * (pw_*, pd_zero, pd_descale_p*, descale_p*, const_bits) must already be in
 * scope at the point of use.
 *
 * Each 16-bit vector is processed as two 32-bit halves.  The "l" half is the
 * one produced by vec_mergeh()/vec_unpackh() and the "h" half the one
 * produced by vec_mergel()/vec_unpackl(); vec_pack() joins them again.
 * Products are formed with vec_msums() on interleaved input pairs, so one
 * instruction yields a * c0 + b * c1 + accumulator per 32-bit lane.
 *
 * The rounding term pd_descale_p##PASS is added once, to the even part, so
 * the final vec_sra() by descale_p##PASS rounds rather than truncates.
 *
 * tmp0l..tmp3l/h are reused: they hold the even part first and the odd part
 * afterwards, so the statement order below must not be changed.
 *
 * The body is wrapped in do { } while (0) so that "DO_IDCT(x, n);" is a
 * single statement in every context; invoke it with a trailing semicolon.
 */
#define DO_IDCT(in, PASS) \
  do { \
    /* Even part \
     * \
     * (Original) \
     * z1 = (z2 + z3) * 0.541196100; \
     * tmp2 = z1 + z3 * -1.847759065; \
     * tmp3 = z1 + z2 * 0.765366865; \
     * \
     * (This implementation) \
     * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
     * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
     */ \
    \
    in##26l = vec_mergeh(in##2, in##6); \
    in##26h = vec_mergel(in##2, in##6); \
    \
    tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero); \
    tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero); \
    tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero); \
    tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero); \
    \
    tmp0 = vec_add(in##0, in##4); \
    tmp1 = vec_sub(in##0, in##4); \
    \
    /* Widen to 32 bits, scale by 2^CONST_BITS and add the rounding term */ \
    tmp0l = vec_unpackh(tmp0); \
    tmp0h = vec_unpackl(tmp0); \
    tmp0l = vec_sl(tmp0l, const_bits); \
    tmp0h = vec_sl(tmp0h, const_bits); \
    tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \
    tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \
    \
    tmp10l = vec_add(tmp0l, tmp3l); \
    tmp10h = vec_add(tmp0h, tmp3h); \
    tmp13l = vec_sub(tmp0l, tmp3l); \
    tmp13h = vec_sub(tmp0h, tmp3h); \
    \
    tmp1l = vec_unpackh(tmp1); \
    tmp1h = vec_unpackl(tmp1); \
    tmp1l = vec_sl(tmp1l, const_bits); \
    tmp1h = vec_sl(tmp1h, const_bits); \
    tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \
    tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \
    \
    tmp11l = vec_add(tmp1l, tmp2l); \
    tmp11h = vec_add(tmp1h, tmp2h); \
    tmp12l = vec_sub(tmp1l, tmp2l); \
    tmp12h = vec_sub(tmp1h, tmp2h); \
    \
    /* Odd part */ \
    \
    z3 = vec_add(in##3, in##7); \
    z4 = vec_add(in##1, in##5); \
    \
    /* (Original) \
     * z5 = (z3 + z4) * 1.175875602; \
     * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
     * z3 += z5; z4 += z5; \
     * \
     * (This implementation) \
     * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
     * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
     */ \
    \
    z34l = vec_mergeh(z3, z4); \
    z34h = vec_mergel(z3, z4); \
    \
    z3l = vec_msums(z34l, pw_mf078_f117, pd_zero); \
    z3h = vec_msums(z34h, pw_mf078_f117, pd_zero); \
    z4l = vec_msums(z34l, pw_f117_f078, pd_zero); \
    z4h = vec_msums(z34h, pw_f117_f078, pd_zero); \
    \
    /* (Original) \
     * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \
     * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \
     * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \
     * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
     * tmp0 += z1 + z3; tmp1 += z2 + z4; \
     * tmp2 += z2 + z3; tmp3 += z1 + z4; \
     * \
     * (This implementation) \
     * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
     * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
     * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
     * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
     * tmp0 += z3; tmp1 += z4; \
     * tmp2 += z3; tmp3 += z4; \
     */ \
    \
    /* From here on tmp0..tmp3 hold the odd part; z3/z4 ride in as the \
     * vec_msums() accumulator. \
     */ \
    in##71l = vec_mergeh(in##7, in##1); \
    in##71h = vec_mergel(in##7, in##1); \
    \
    tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \
    tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \
    tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \
    tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \
    \
    in##53l = vec_mergeh(in##5, in##3); \
    in##53h = vec_mergel(in##5, in##3); \
    \
    tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \
    tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \
    tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \
    tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \
    \
    /* Final output stage: out[k] = even + odd, out[7-k] = even - odd */ \
    \
    out0l = vec_add(tmp10l, tmp3l); \
    out0h = vec_add(tmp10h, tmp3h); \
    out7l = vec_sub(tmp10l, tmp3l); \
    out7h = vec_sub(tmp10h, tmp3h); \
    \
    out0l = vec_sra(out0l, descale_p##PASS); \
    out0h = vec_sra(out0h, descale_p##PASS); \
    out7l = vec_sra(out7l, descale_p##PASS); \
    out7h = vec_sra(out7h, descale_p##PASS); \
    \
    out0 = vec_pack(out0l, out0h); \
    out7 = vec_pack(out7l, out7h); \
    \
    out1l = vec_add(tmp11l, tmp2l); \
    out1h = vec_add(tmp11h, tmp2h); \
    out6l = vec_sub(tmp11l, tmp2l); \
    out6h = vec_sub(tmp11h, tmp2h); \
    \
    out1l = vec_sra(out1l, descale_p##PASS); \
    out1h = vec_sra(out1h, descale_p##PASS); \
    out6l = vec_sra(out6l, descale_p##PASS); \
    out6h = vec_sra(out6h, descale_p##PASS); \
    \
    out1 = vec_pack(out1l, out1h); \
    out6 = vec_pack(out6l, out6h); \
    \
    out2l = vec_add(tmp12l, tmp1l); \
    out2h = vec_add(tmp12h, tmp1h); \
    out5l = vec_sub(tmp12l, tmp1l); \
    out5h = vec_sub(tmp12h, tmp1h); \
    \
    out2l = vec_sra(out2l, descale_p##PASS); \
    out2h = vec_sra(out2h, descale_p##PASS); \
    out5l = vec_sra(out5l, descale_p##PASS); \
    out5h = vec_sra(out5h, descale_p##PASS); \
    \
    out2 = vec_pack(out2l, out2h); \
    out5 = vec_pack(out5l, out5h); \
    \
    out3l = vec_add(tmp13l, tmp0l); \
    out3h = vec_add(tmp13h, tmp0h); \
    out4l = vec_sub(tmp13l, tmp0l); \
    out4h = vec_sub(tmp13h, tmp0h); \
    \
    out3l = vec_sra(out3l, descale_p##PASS); \
    out3h = vec_sra(out3h, descale_p##PASS); \
    out4l = vec_sra(out4l, descale_p##PASS); \
    out4h = vec_sra(out4h, descale_p##PASS); \
    \
    out3 = vec_pack(out3l, out3h); \
    out4 = vec_pack(out4l, out4h); \
  } while (0)


208 void
jsimd_idct_islow_altivec(void * dct_table_,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)209 jsimd_idct_islow_altivec (void *dct_table_, JCOEFPTR coef_block,
210 JSAMPARRAY output_buf, JDIMENSION output_col)
211 {
212 short *dct_table = (short *)dct_table_;
213 int *outptr;
214
215 __vector short row0, row1, row2, row3, row4, row5, row6, row7,
216 col0, col1, col2, col3, col4, col5, col6, col7,
217 quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
218 tmp0, tmp1, tmp2, tmp3, z3, z4,
219 z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
220 row71l, row71h, row26l, row26h, row53l, row53h,
221 out0, out1, out2, out3, out4, out5, out6, out7;
222 __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
223 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
224 z3l, z3h, z4l, z4h,
225 out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
226 out5l, out5h, out6l, out6h, out7l, out7h;
227 __vector signed char outb;
228
229 /* Constants */
230 __vector short pw_zero = { __8X(0) },
231 pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
232 pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
233 pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
234 pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
235 pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
236 pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
237 pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
238 pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
239 __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
240 __vector int pd_zero = { __4X(0) },
241 pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
242 pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
243 __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
244 descale_p2 = { __4X(DESCALE_P2) },
245 const_bits = { __4X(CONST_BITS) };
246 __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
247
248 /* Pass 1: process columns */
249
250 col0 = vec_ld(0, coef_block);
251 col1 = vec_ld(16, coef_block);
252 col2 = vec_ld(32, coef_block);
253 col3 = vec_ld(48, coef_block);
254 col4 = vec_ld(64, coef_block);
255 col5 = vec_ld(80, coef_block);
256 col6 = vec_ld(96, coef_block);
257 col7 = vec_ld(112, coef_block);
258
259 tmp1 = vec_or(col1, col2);
260 tmp2 = vec_or(col3, col4);
261 tmp1 = vec_or(tmp1, tmp2);
262 tmp3 = vec_or(col5, col6);
263 tmp3 = vec_or(tmp3, col7);
264 tmp1 = vec_or(tmp1, tmp3);
265
266 quant0 = vec_ld(0, dct_table);
267 col0 = vec_mladd(col0, quant0, pw_zero);
268
269 if (vec_all_eq(tmp1, pw_zero)) {
270 /* AC terms all zero */
271
272 col0 = vec_sl(col0, pass1_bits);
273
274 row0 = vec_splat(col0, 0);
275 row1 = vec_splat(col0, 1);
276 row2 = vec_splat(col0, 2);
277 row3 = vec_splat(col0, 3);
278 row4 = vec_splat(col0, 4);
279 row5 = vec_splat(col0, 5);
280 row6 = vec_splat(col0, 6);
281 row7 = vec_splat(col0, 7);
282
283 } else {
284
285 quant1 = vec_ld(16, dct_table);
286 quant2 = vec_ld(32, dct_table);
287 quant3 = vec_ld(48, dct_table);
288 quant4 = vec_ld(64, dct_table);
289 quant5 = vec_ld(80, dct_table);
290 quant6 = vec_ld(96, dct_table);
291 quant7 = vec_ld(112, dct_table);
292
293 col1 = vec_mladd(col1, quant1, pw_zero);
294 col2 = vec_mladd(col2, quant2, pw_zero);
295 col3 = vec_mladd(col3, quant3, pw_zero);
296 col4 = vec_mladd(col4, quant4, pw_zero);
297 col5 = vec_mladd(col5, quant5, pw_zero);
298 col6 = vec_mladd(col6, quant6, pw_zero);
299 col7 = vec_mladd(col7, quant7, pw_zero);
300
301 DO_IDCT(col, 1);
302
303 TRANSPOSE(out, row);
304 }
305
306 /* Pass 2: process rows */
307
308 DO_IDCT(row, 2);
309
310 TRANSPOSE(out, col);
311
312 outb = vec_packs(col0, col0);
313 outb = vec_add(outb, pb_centerjsamp);
314 outptr = (int *)(output_buf[0] + output_col);
315 vec_ste((__vector int)outb, 0, outptr);
316 vec_ste((__vector int)outb, 4, outptr);
317
318 outb = vec_packs(col1, col1);
319 outb = vec_add(outb, pb_centerjsamp);
320 outptr = (int *)(output_buf[1] + output_col);
321 vec_ste((__vector int)outb, 0, outptr);
322 vec_ste((__vector int)outb, 4, outptr);
323
324 outb = vec_packs(col2, col2);
325 outb = vec_add(outb, pb_centerjsamp);
326 outptr = (int *)(output_buf[2] + output_col);
327 vec_ste((__vector int)outb, 0, outptr);
328 vec_ste((__vector int)outb, 4, outptr);
329
330 outb = vec_packs(col3, col3);
331 outb = vec_add(outb, pb_centerjsamp);
332 outptr = (int *)(output_buf[3] + output_col);
333 vec_ste((__vector int)outb, 0, outptr);
334 vec_ste((__vector int)outb, 4, outptr);
335
336 outb = vec_packs(col4, col4);
337 outb = vec_add(outb, pb_centerjsamp);
338 outptr = (int *)(output_buf[4] + output_col);
339 vec_ste((__vector int)outb, 0, outptr);
340 vec_ste((__vector int)outb, 4, outptr);
341
342 outb = vec_packs(col5, col5);
343 outb = vec_add(outb, pb_centerjsamp);
344 outptr = (int *)(output_buf[5] + output_col);
345 vec_ste((__vector int)outb, 0, outptr);
346 vec_ste((__vector int)outb, 4, outptr);
347
348 outb = vec_packs(col6, col6);
349 outb = vec_add(outb, pb_centerjsamp);
350 outptr = (int *)(output_buf[6] + output_col);
351 vec_ste((__vector int)outb, 0, outptr);
352 vec_ste((__vector int)outb, 4, outptr);
353
354 outb = vec_packs(col7, col7);
355 outb = vec_add(outb, pb_centerjsamp);
356 outptr = (int *)(output_buf[7] + output_col);
357 vec_ste((__vector int)outb, 0, outptr);
358 vec_ste((__vector int)outb, 4, outptr);
359 }
360