/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* FAST INTEGER INVERSE DCT
 *
 * This is similar to the SSE2 implementation, except that we left-shift the
 * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
 * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
 *   the elements in arg3 + the most significant 17 bits of
 *     (the elements in arg1 * the elements in arg2).
 */
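/* Equivalently, in scalar terms (a sketch based on the description above,
 * not part of the original source):
 *   vec_madds(arg1, arg2, arg3)[i] =
 *     saturate16(((arg1[i] * arg2[i]) >> 15) + arg3[i])
 */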

#include "jsimd_altivec.h"


#define F_1_082  277              /* FIX(1.082392200) */
#define F_1_414  362              /* FIX(1.414213562) */
#define F_1_847  473              /* FIX(1.847759065) */
#define F_2_613  669              /* FIX(2.613125930) */
#define F_1_613  (F_2_613 - 256)  /* FIX(2.613125930) - FIX(1) */

#define CONST_BITS  8
#define PASS1_BITS  2
#define PRE_MULTIPLY_SCALE_BITS  2
#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
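/* Worked example (illustrative): pw_F1414 below is
 * F_1_414 << CONST_SHIFT = 362 << 5 = 11584, and the operand is pre-shifted
 * left by PRE_MULTIPLY_SCALE_BITS = 2, so vec_madds() computes
 *   ((x << 2) * 11584) >> 15 = (x * 362) >> 8,
 * the same CONST_BITS = 8 fixed-point product as the scalar "ifast" code.
 */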
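/* One 1-D pass of the AA&N-style fast IDCT used by the "ifast" method.
 * in##0..in##7 each hold eight 16-bit elements, so eight 1-D transforms are
 * computed in parallel, one per vector lane.
 */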
#define DO_IDCT(in) { \
  /* Even part */ \
  \
  tmp10 = vec_add(in##0, in##4); \
  tmp11 = vec_sub(in##0, in##4); \
  tmp13 = vec_add(in##2, in##6); \
  \
  tmp12 = vec_sub(in##2, in##6); \
  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
  tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \
  tmp12 = vec_sub(tmp12, tmp13); \
  \
  tmp0 = vec_add(tmp10, tmp13); \
  tmp3 = vec_sub(tmp10, tmp13); \
  tmp1 = vec_add(tmp11, tmp12); \
  tmp2 = vec_sub(tmp11, tmp12); \
  \
  /* Odd part */ \
  \
  z13 = vec_add(in##5, in##3); \
  z10 = vec_sub(in##5, in##3); \
  z10s = vec_sl(z10, pre_multiply_scale_bits); \
  z11 = vec_add(in##1, in##7); \
  z12s = vec_sub(in##1, in##7); \
  z12s = vec_sl(z12s, pre_multiply_scale_bits); \
  \
  tmp11 = vec_sub(z11, z13); \
  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
  tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \
  \
  tmp7 = vec_add(z11, z13); \
  \
  /* To avoid overflow... \
   * \
   * (Original) \
   * tmp12 = -2.613125930 * z10 + z5; \
   * \
   * (This implementation) \
   * tmp12 = (-1.613125930 - 1) * z10 + z5; \
   *       = -1.613125930 * z10 - z10 + z5; \
   */ \
  \
  z5 = vec_add(z10s, z12s); \
  z5 = vec_madds(z5, pw_F1847, pw_zero); \
  \
  tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \
  tmp10 = vec_sub(tmp10, z5); \
  tmp12 = vec_madds(z10s, pw_MF1613, z5); \
  tmp12 = vec_sub(tmp12, z10); \
  \
  tmp6 = vec_sub(tmp12, tmp7); \
  tmp5 = vec_sub(tmp11, tmp6); \
  tmp4 = vec_add(tmp10, tmp5); \
  \
  out0 = vec_add(tmp0, tmp7); \
  out1 = vec_add(tmp1, tmp6); \
  out2 = vec_add(tmp2, tmp5); \
  out3 = vec_sub(tmp3, tmp4); \
  out4 = vec_add(tmp3, tmp4); \
  out5 = vec_sub(tmp2, tmp5); \
  out6 = vec_sub(tmp1, tmp6); \
  out7 = vec_sub(tmp0, tmp7); \
}


void jsimd_idct_ifast_altivec(void *dct_table_, JCOEFPTR coef_block,
                              JSAMPARRAY output_buf, JDIMENSION output_col)
{
  short *dct_table = (short *)dct_table_;
  int *outptr;

  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12,
    tmp13, z5, z10, z10s, z11, z12s, z13,
    out0, out1, out2, out3, out4, out5, out6, out7;
  __vector signed char outb;

  /* Constants */
  __vector short pw_zero = { __8X(0) },
    pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },
    pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },
    pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) },
    pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };
  __vector unsigned short
    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },
    pass1_bits3 = { __8X(PASS1_BITS + 3) };
  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };

  /* Pass 1: process columns */

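  /* Load the 8x8 coefficient block.  col0..col7 hold rows 0..7, so the
   * vector lanes index the columns that this pass transforms. */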
  col0 = vec_ld(0, coef_block);
  col1 = vec_ld(16, coef_block);
  col2 = vec_ld(32, coef_block);
  col3 = vec_ld(48, coef_block);
  col4 = vec_ld(64, coef_block);
  col5 = vec_ld(80, coef_block);
  col6 = vec_ld(96, coef_block);
  col7 = vec_ld(112, coef_block);

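  /* OR rows 1..7 together; if the result is all zeroes, pass 1 reduces to
   * broadcasting the dequantized row-0 coefficients. */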
  tmp1 = vec_or(col1, col2);
  tmp2 = vec_or(col3, col4);
  tmp1 = vec_or(tmp1, tmp2);
  tmp3 = vec_or(col5, col6);
  tmp3 = vec_or(tmp3, col7);
  tmp1 = vec_or(tmp1, tmp3);

  quant0 = vec_ld(0, dct_table);
  col0 = vec_mladd(col0, quant0, pw_zero);

  if (vec_all_eq(tmp1, pw_zero)) {
    /* AC terms all zero */
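    /* Pass 1 of such a block leaves each column constant, equal to its
     * dequantized row-0 coefficient, so the (already transposed) result is
     * element i of col0 broadcast across row_i. */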

    row0 = vec_splat(col0, 0);
    row1 = vec_splat(col0, 1);
    row2 = vec_splat(col0, 2);
    row3 = vec_splat(col0, 3);
    row4 = vec_splat(col0, 4);
    row5 = vec_splat(col0, 5);
    row6 = vec_splat(col0, 6);
    row7 = vec_splat(col0, 7);

  } else {

    quant1 = vec_ld(16, dct_table);
    quant2 = vec_ld(32, dct_table);
    quant3 = vec_ld(48, dct_table);
    quant4 = vec_ld(64, dct_table);
    quant5 = vec_ld(80, dct_table);
    quant6 = vec_ld(96, dct_table);
    quant7 = vec_ld(112, dct_table);

    col1 = vec_mladd(col1, quant1, pw_zero);
    col2 = vec_mladd(col2, quant2, pw_zero);
    col3 = vec_mladd(col3, quant3, pw_zero);
    col4 = vec_mladd(col4, quant4, pw_zero);
    col5 = vec_mladd(col5, quant5, pw_zero);
    col6 = vec_mladd(col6, quant6, pw_zero);
    col7 = vec_mladd(col7, quant7, pw_zero);

    DO_IDCT(col);

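    /* Transpose the pass-1 output so that pass 2 can process the rows of
     * the block with the same lane-parallel code. */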
    TRANSPOSE(out, row);
  }

  /* Pass 2: process rows */

  DO_IDCT(row);

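  /* Descale by PASS1_BITS + 3: PASS1_BITS undoes the scaling carried over
   * from pass 1, and the remaining 3 bits are the final divide by
   * DCTSIZE (8). */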
  out0 = vec_sra(out0, pass1_bits3);
  out1 = vec_sra(out1, pass1_bits3);
  out2 = vec_sra(out2, pass1_bits3);
  out3 = vec_sra(out3, pass1_bits3);
  out4 = vec_sra(out4, pass1_bits3);
  out5 = vec_sra(out5, pass1_bits3);
  out6 = vec_sra(out6, pass1_bits3);
  out7 = vec_sra(out7, pass1_bits3);

  TRANSPOSE(out, col);

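  /* For each row: pack to signed bytes with saturation, add CENTERJSAMPLE
   * (mod 256) to recenter the samples, then store the eight bytes as two
   * 32-bit elements with vec_ste(). */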
  outb = vec_packs(col0, col0);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[0] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col1, col1);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[1] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col2, col2);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[2] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col3, col3);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[3] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col4, col4);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[4] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col5, col5);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[5] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col6, col6);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[6] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col7, col7);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[7] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);
}