/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* FAST INTEGER INVERSE DCT
 *
 * This is similar to the SSE2 implementation, except that we left-shift the
 * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
 * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
 *   the elements in arg3 + the most significant 17 bits of
 *     (the elements in arg1 * the elements in arg2).
 */

#include "jsimd_altivec.h"


#define F_1_082 277              /* FIX(1.082392200) */
#define F_1_414 362              /* FIX(1.414213562) */
#define F_1_847 473              /* FIX(1.847759065) */
#define F_2_613 669              /* FIX(2.613125930) */
#define F_1_613 (F_2_613 - 256)  /* FIX(2.613125930) - FIX(1) */

#define CONST_BITS 8
#define PASS1_BITS 2
#define PRE_MULTIPLY_SCALE_BITS 2
#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
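
/* Worked example of the scaling (informal): F_1_414 = 362 ~=
 * 1.414213562 * 2^CONST_BITS, and CONST_SHIFT = 16 - 2 - 8 - 1 = 5, so
 * pw_F1414 below holds 362 << 5 ~= 1.414213562 * 2^13.  Multiplying an input
 * that has been pre-shifted by PRE_MULTIPLY_SCALE_BITS (x * 2^2) by that
 * constant gives x * 1.414213562 * 2^15, and vec_madds(), which keeps the
 * most significant 17 bits of the 32-bit product (an implicit >> 15),
 * returns x * 1.414213562, as desired.
 */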
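/* DO_IDCT() performs eight 8-point 1-D inverse DCTs at once, using the fast
 * (AA&N-style) algorithm from jidctfst.c: element k of in##0 .. in##7 forms
 * the k-th 1-D transform, so the column pass and the row pass below are each
 * a single invocation of this macro.
 */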
#define DO_IDCT(in) \
{ \
  /* Even part */ \
  \
  tmp10 = vec_add(in##0, in##4); \
  tmp11 = vec_sub(in##0, in##4); \
  tmp13 = vec_add(in##2, in##6); \
  \
  tmp12 = vec_sub(in##2, in##6); \
  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
  tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \
  tmp12 = vec_sub(tmp12, tmp13); \
  \
  tmp0 = vec_add(tmp10, tmp13); \
  tmp3 = vec_sub(tmp10, tmp13); \
  tmp1 = vec_add(tmp11, tmp12); \
  tmp2 = vec_sub(tmp11, tmp12); \
  \
  /* Odd part */ \
  \
  z13 = vec_add(in##5, in##3); \
  z10 = vec_sub(in##5, in##3); \
  z10s = vec_sl(z10, pre_multiply_scale_bits); \
  z11 = vec_add(in##1, in##7); \
  z12s = vec_sub(in##1, in##7); \
  z12s = vec_sl(z12s, pre_multiply_scale_bits); \
  \
  tmp11 = vec_sub(z11, z13); \
  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
  tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \
  \
  tmp7 = vec_add(z11, z13); \
  \
  /* To avoid overflow... \
   * \
   * (Original) \
   * tmp12 = -2.613125930 * z10 + z5; \
   * \
   * (This implementation) \
   * tmp12 = (-1.613125930 - 1) * z10 + z5; \
   *       = -1.613125930 * z10 - z10 + z5; \
   */ \
  \
  z5 = vec_add(z10s, z12s); \
  z5 = vec_madds(z5, pw_F1847, pw_zero); \
  \
  tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \
  tmp10 = vec_sub(tmp10, z5); \
  tmp12 = vec_madds(z10s, pw_MF1613, z5); \
  tmp12 = vec_sub(tmp12, z10); \
  \
  tmp6 = vec_sub(tmp12, tmp7); \
  tmp5 = vec_sub(tmp11, tmp6); \
  tmp4 = vec_add(tmp10, tmp5); \
  \
  out0 = vec_add(tmp0, tmp7); \
  out1 = vec_add(tmp1, tmp6); \
  out2 = vec_add(tmp2, tmp5); \
  out3 = vec_sub(tmp3, tmp4); \
  out4 = vec_add(tmp3, tmp4); \
  out5 = vec_sub(tmp2, tmp5); \
  out6 = vec_sub(tmp1, tmp6); \
  out7 = vec_sub(tmp0, tmp7); \
}


void
jsimd_idct_ifast_altivec (void *dct_table_, JCOEFPTR coef_block,
                          JSAMPARRAY output_buf, JDIMENSION output_col)
{
  short *dct_table = (short *)dct_table_;
  int *outptr;

  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
    z5, z10, z10s, z11, z12s, z13,
    out0, out1, out2, out3, out4, out5, out6, out7;
  __vector signed char outb;

  /* Constants */
  __vector short pw_zero = { __8X(0) },
    pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },
    pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },
    pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) },
    pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };
  __vector unsigned short
    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },
    pass1_bits3 = { __8X(PASS1_BITS + 3) };
  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };

  /* Pass 1: process columns */

  col0 = vec_ld(0, coef_block);
  col1 = vec_ld(16, coef_block);
  col2 = vec_ld(32, coef_block);
  col3 = vec_ld(48, coef_block);
  col4 = vec_ld(64, coef_block);
  col5 = vec_ld(80, coef_block);
  col6 = vec_ld(96, coef_block);
  col7 = vec_ld(112, coef_block);

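  /* OR together col1-col7 so that a single comparison against zero can
   * detect whether, for every column, all inputs beyond the first are zero;
   * if so, the column pass reduces to the splat shortcut below. */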
  tmp1 = vec_or(col1, col2);
  tmp2 = vec_or(col3, col4);
  tmp1 = vec_or(tmp1, tmp2);
  tmp3 = vec_or(col5, col6);
  tmp3 = vec_or(tmp3, col7);
  tmp1 = vec_or(tmp1, tmp3);

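  /* Dequantize the first coefficient vector.  vec_mladd() keeps only the
   * low-order 16 bits of each product, which is assumed to suffice for
   * in-range coefficient * multiplier products, as in the scalar and SSE2
   * "ifast" paths. */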
  quant0 = vec_ld(0, dct_table);
  col0 = vec_mladd(col0, quant0, pw_zero);

  if (vec_all_eq(tmp1, pw_zero)) {
    /* AC terms all zero */
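    /* A 1-D IDCT whose only nonzero input is the first one produces a
     * constant output, so the transposed rows needed for pass 2 are simply
     * the elements of the dequantized col0, each splatted across a
     * vector. */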

    row0 = vec_splat(col0, 0);
    row1 = vec_splat(col0, 1);
    row2 = vec_splat(col0, 2);
    row3 = vec_splat(col0, 3);
    row4 = vec_splat(col0, 4);
    row5 = vec_splat(col0, 5);
    row6 = vec_splat(col0, 6);
    row7 = vec_splat(col0, 7);

  } else {

    quant1 = vec_ld(16, dct_table);
    quant2 = vec_ld(32, dct_table);
    quant3 = vec_ld(48, dct_table);
    quant4 = vec_ld(64, dct_table);
    quant5 = vec_ld(80, dct_table);
    quant6 = vec_ld(96, dct_table);
    quant7 = vec_ld(112, dct_table);

    col1 = vec_mladd(col1, quant1, pw_zero);
    col2 = vec_mladd(col2, quant2, pw_zero);
    col3 = vec_mladd(col3, quant3, pw_zero);
    col4 = vec_mladd(col4, quant4, pw_zero);
    col5 = vec_mladd(col5, quant5, pw_zero);
    col6 = vec_mladd(col6, quant6, pw_zero);
    col7 = vec_mladd(col7, quant7, pw_zero);

    DO_IDCT(col);

    TRANSPOSE(out, row);
  }

  /* Pass 2: process rows */

  DO_IDCT(row);

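  /* Descale by PASS1_BITS + 3 bits: PASS1_BITS removes the scaling carried
   * through pass 1, and the extra 3 bits fold in the division by
   * DCTSIZE (= 8) inherent in the 2-D transform, mirroring the
   * PASS1_BITS + 3 descale in the scalar implementation. */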
  out0 = vec_sra(out0, pass1_bits3);
  out1 = vec_sra(out1, pass1_bits3);
  out2 = vec_sra(out2, pass1_bits3);
  out3 = vec_sra(out3, pass1_bits3);
  out4 = vec_sra(out4, pass1_bits3);
  out5 = vec_sra(out5, pass1_bits3);
  out6 = vec_sra(out6, pass1_bits3);
  out7 = vec_sra(out7, pass1_bits3);

  TRANSPOSE(out, col);

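  /* For each row: vec_packs() saturates the 16-bit results to signed bytes,
   * duplicated into both halves of the vector, and adding CENTERJSAMPLE
   * re-centers them in the [0, 255] sample range (the 128 bias wraps to -128
   * in a signed char vector, but modular addition yields the same bit
   * pattern).  Each row is then written with two single-element vec_ste()
   * stores, which require only 4-byte alignment rather than the 16-byte
   * alignment of vec_st(); the duplication ensures that the correct elements
   * are selected for an 8-byte-aligned destination. */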
  outb = vec_packs(col0, col0);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[0] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col1, col1);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[1] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col2, col2);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[2] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col3, col3);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[3] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col4, col4);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[4] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col5, col5);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[5] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col6, col6);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[6] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col7, col7);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[7] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);
}