/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014, 2020, D. R. Commander. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* ACCURATE INTEGER FORWARD DCT */

#include "jsimd_altivec.h"


/*
 * Fixed-point multipliers for the DCT.  Each value is the real constant
 * quoted beside it scaled by 2^CONST_BITS (8192) and rounded to the nearest
 * integer.  The name encodes the leading digits of that real constant.
 */
#define F_0_298  2446           /* FIX(0.298631336) */
#define F_0_390  3196           /* FIX(0.390180644) */
#define F_0_541  4433           /* FIX(0.541196100) */
#define F_0_765  6270           /* FIX(0.765366865) */
#define F_0_899  7373           /* FIX(0.899976223) */
#define F_1_175  9633           /* FIX(1.175875602) */
#define F_1_501  12299          /* FIX(1.501321110) */
#define F_1_847  15137          /* FIX(1.847759065) */
#define F_1_961  16069          /* FIX(1.961570560) */
#define F_2_053  16819          /* FIX(2.053119869) */
#define F_2_562  20995          /* FIX(2.562915447) */
#define F_3_072  25172          /* FIX(3.072711026) */

/* Number of fractional bits carried by the F_* multipliers above. */
#define CONST_BITS  13
/*
 * Extra scaling that pass 1 leaves in its results (out0/out4 are shifted left
 * by this amount in pass 1) and that pass 2 removes again.
 */
#define PASS1_BITS  2
/* Right-shift counts applied to the 32-bit multiply-sum results of each pass. */
#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
#define DESCALE_P2  (CONST_BITS + PASS1_BITS)


/*
 * Shared tail of both 1-D FDCT passes.  PASS (1 or 2) is token-pasted onto
 * pd_descale_p / descale_p to select that pass's rounding constant and shift
 * count.
 *
 * The macro is expanded (through DO_FDCT_PASS1()/DO_FDCT_PASS2()) inside
 * jsimd_fdct_islow_altivec() and works on that function's locals: it reads
 * tmp4..tmp7, tmp12 and tmp13 together with the pw_* constant vectors, and
 * writes out1, out2, out3, out5, out6 and out7.  out0 and out4 are not
 * touched here; the per-pass wrappers compute them.
 *
 * Pattern used throughout: vec_mergeh()/vec_mergel() interleave two vectors
 * of 16-bit elements so that each 32-bit lane holds an (a, b) pair, and
 * vec_msums() then computes a * c0 + b * c1 + accumulator for that lane.
 * Passing the rounding constant (or an already-rounded partial sum) as the
 * accumulator folds the rounding add into the multiply-sum.  vec_sra() then
 * descales and vec_pack() narrows the low/high halves back to one vector of
 * 16-bit elements.
 */
#define DO_FDCT_COMMON(PASS) { \
  /* Even part, outputs 2 and 6 \
   * \
   * (Original) \
   * z1 = (tmp12 + tmp13) * 0.541196100; \
   * data2 = z1 + tmp13 * 0.765366865; \
   * data6 = z1 + tmp12 * -1.847759065; \
   * \
   * (This implementation) \
   * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
   * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
   */ \
  \
  tmp1312l = vec_mergeh(tmp13, tmp12); \
  tmp1312h = vec_mergel(tmp13, tmp12); \
  \
  out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS); \
  out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS); \
  out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS); \
  out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS); \
  \
  out2l = vec_sra(out2l, descale_p##PASS); \
  out2h = vec_sra(out2h, descale_p##PASS); \
  out6l = vec_sra(out6l, descale_p##PASS); \
  out6h = vec_sra(out6h, descale_p##PASS); \
  \
  out2 = vec_pack(out2l, out2h); \
  out6 = vec_pack(out6l, out6h); \
  \
  /* Odd part */ \
  \
  z3 = vec_add(tmp4, tmp6); \
  z4 = vec_add(tmp5, tmp7); \
  \
  /* (Original) \
   * z5 = (z3 + z4) * 1.175875602; \
   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
   * z3 += z5;  z4 += z5; \
   * \
   * (This implementation) \
   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
   */ \
  \
  z34l = vec_mergeh(z3, z4); \
  z34h = vec_mergel(z3, z4); \
  \
  /* The rounding constant is added here once; z3l..z4h are then used as \
   * the accumulators of the four odd outputs below, so those outputs need \
   * no separate rounding add. \
   */ \
  z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS); \
  z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS); \
  z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS); \
  z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS); \
  \
  /* (Original) \
   * z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6; \
   * tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869; \
   * tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110; \
   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
   * data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4; \
   * data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4; \
   * \
   * (This implementation) \
   * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
   * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
   * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
   * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
   * data7 = tmp4 + z3;  data5 = tmp5 + z4; \
   * data3 = tmp6 + z3;  data1 = tmp7 + z4; \
   */ \
  \
  /* Outputs 7 and 1 are both built from the (tmp4, tmp7) pairs. */ \
  tmp47l = vec_mergeh(tmp4, tmp7); \
  tmp47h = vec_mergel(tmp4, tmp7); \
  \
  out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l); \
  out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h); \
  out1l = vec_msums(tmp47l, pw_mf089_f060, z4l); \
  out1h = vec_msums(tmp47h, pw_mf089_f060, z4h); \
  \
  out7l = vec_sra(out7l, descale_p##PASS); \
  out7h = vec_sra(out7h, descale_p##PASS); \
  out1l = vec_sra(out1l, descale_p##PASS); \
  out1h = vec_sra(out1h, descale_p##PASS); \
  \
  out7 = vec_pack(out7l, out7h); \
  out1 = vec_pack(out1l, out1h); \
  \
  /* Outputs 5 and 3 are both built from the (tmp5, tmp6) pairs. */ \
  tmp56l = vec_mergeh(tmp5, tmp6); \
  tmp56h = vec_mergel(tmp5, tmp6); \
  \
  out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l); \
  out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h); \
  out3l = vec_msums(tmp56l, pw_mf256_f050, z3l); \
  out3h = vec_msums(tmp56h, pw_mf256_f050, z3h); \
  \
  out5l = vec_sra(out5l, descale_p##PASS); \
  out5h = vec_sra(out5h, descale_p##PASS); \
  out3l = vec_sra(out3l, descale_p##PASS); \
  out3h = vec_sra(out3h, descale_p##PASS); \
  \
  out5 = vec_pack(out5l, out5h); \
  out3 = vec_pack(out3l, out3h); \
}

/*
 * First 1-D pass.  Expanded inside jsimd_fdct_islow_altivec(), it reads
 * tmp0..tmp3 (and, through DO_FDCT_COMMON(), tmp4..tmp7) and produces
 * out0..out7.
 *
 * out0 and out4 need no multiplication, so they are simply shifted left by
 * pass1_bits to carry the same extra scaling as the other outputs of this
 * pass.  The remaining outputs come from DO_FDCT_COMMON(1).
 */
#define DO_FDCT_PASS1() { \
  /* Even part */ \
  \
  tmp10 = vec_add(tmp0, tmp3); \
  tmp13 = vec_sub(tmp0, tmp3); \
  tmp11 = vec_add(tmp1, tmp2); \
  tmp12 = vec_sub(tmp1, tmp2); \
  \
  out0 = vec_add(tmp10, tmp11); \
  out0 = vec_sl(out0, pass1_bits); \
  out4 = vec_sub(tmp10, tmp11); \
  out4 = vec_sl(out4, pass1_bits); \
  \
  /* tmp12/tmp13 feed outputs 2 and 6; tmp4..tmp7 feed the odd outputs. */ \
  DO_FDCT_COMMON(1); \
}

/*
 * Second 1-D pass.  Expanded inside jsimd_fdct_islow_altivec(), it reads
 * tmp0..tmp3 (and, through DO_FDCT_COMMON(), tmp4..tmp7) and produces
 * out0..out7.
 *
 * out0 and out4 need no multiplication; the extra scaling introduced in
 * pass 1 is removed from them with a rounded arithmetic right shift:
 * pw_descale_p2x supplies the rounding term and pass1_bits the shift count.
 * The remaining outputs come from DO_FDCT_COMMON(2).
 */
#define DO_FDCT_PASS2() { \
  /* Even part */ \
  \
  tmp10 = vec_add(tmp0, tmp3); \
  tmp13 = vec_sub(tmp0, tmp3); \
  tmp11 = vec_add(tmp1, tmp2); \
  tmp12 = vec_sub(tmp1, tmp2); \
  \
  out0 = vec_add(tmp10, tmp11); \
  out0 = vec_add(out0, pw_descale_p2x); \
  out0 = vec_sra(out0, pass1_bits); \
  out4 = vec_sub(tmp10, tmp11); \
  out4 = vec_add(out4, pw_descale_p2x); \
  out4 = vec_sra(out4, pass1_bits); \
  \
  /* tmp12/tmp13 feed outputs 2 and 6; tmp4..tmp7 feed the odd outputs. */ \
  DO_FDCT_COMMON(2); \
}


/*
 * Accurate integer forward DCT of one block, computed in place with AltiVec.
 *
 * data  Block to transform.  It is read as eight 16-byte vectors at byte
 *       offsets 0, 16, ..., 112 and the results are stored back to the same
 *       offsets, so the input is overwritten.  The vectors are treated as
 *       __vector short, i.e. 8 x 8 16-bit elements (presumably DCTELEM is a
 *       16-bit type here -- confirm against its definition).
 *       NOTE(review): vec_ld()/vec_st() ignore the low four address bits, so
 *       data is assumed to be 16-byte aligned; verify against the callers.
 *
 * The transform runs as two 1-D passes (DO_FDCT_PASS1(), DO_FDCT_PASS2()),
 * each preceded by a TRANSPOSE() so that the same vector code can be used for
 * both the row pass and the column pass.  TRANSPOSE() and the __4X2()/__4X()/
 * __8X() initializer helpers are not defined in this file (presumably they
 * come from jsimd_altivec.h).
 */
void jsimd_fdct_islow_altivec(DCTELEM *data)
{
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
    tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
    z3, z4, z34l, z34h,
    out0, out1, out2, out3, out4, out5, out6, out7;
  __vector int z3l, z3h, z4l, z4h,
    out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
    out7l, out7h;

  /* Constants */
  /* Each pw_* vector holds a pair of multipliers that vec_msums() applies to
   * an interleaved pair of inputs; "f"/"mf" in the name marks a positive or
   * negative combined factor (e.g. f130 = 0.541 + 0.765, mf130 = 0.541 -
   * 1.847).
   */
  __vector short
    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
    /* Rounding term for the pass 2 shift of out0/out4 by PASS1_BITS */
    pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
  /* Shift count used for out0/out4 in both passes */
  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
  /* Rounding terms (half of the divisor) for the descaling shift of each
   * pass
   */
  __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
  /* Descaling shift counts for each pass */
  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
    descale_p2 = { __4X(DESCALE_P2) };

  /* Pass 1: process rows */

  row0 = vec_ld(0, data);
  row1 = vec_ld(16, data);
  row2 = vec_ld(32, data);
  row3 = vec_ld(48, data);
  row4 = vec_ld(64, data);
  row5 = vec_ld(80, data);
  row6 = vec_ld(96, data);
  row7 = vec_ld(112, data);

  TRANSPOSE(row, col);

  /* Butterfly stage: sums feed the even part, differences the odd part. */
  tmp0 = vec_add(col0, col7);
  tmp7 = vec_sub(col0, col7);
  tmp1 = vec_add(col1, col6);
  tmp6 = vec_sub(col1, col6);
  tmp2 = vec_add(col2, col5);
  tmp5 = vec_sub(col2, col5);
  tmp3 = vec_add(col3, col4);
  tmp4 = vec_sub(col3, col4);

  DO_FDCT_PASS1();

  /* Pass 2: process columns */

  /* The pass 1 results (out0..out7) become the input of pass 2; row0..row7
   * are reused to hold them after transposition.
   */
  TRANSPOSE(out, row);

  tmp0 = vec_add(row0, row7);
  tmp7 = vec_sub(row0, row7);
  tmp1 = vec_add(row1, row6);
  tmp6 = vec_sub(row1, row6);
  tmp2 = vec_add(row2, row5);
  tmp5 = vec_sub(row2, row5);
  tmp3 = vec_add(row3, row4);
  tmp4 = vec_sub(row3, row4);

  DO_FDCT_PASS2();

  /* Write the result back over the input block. */
  vec_st(out0, 0, data);
  vec_st(out1, 16, data);
  vec_st(out2, 32, data);
  vec_st(out3, 48, data);
  vec_st(out4, 64, data);
  vec_st(out5, 80, data);
  vec_st(out6, 96, data);
  vec_st(out7, 112, data);
}
