/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software.  If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */

#include "jsimd_altivec.h"


/* NOTE: The address will either be aligned or offset by 8 bytes, so we can
 * always get the data we want by using a single vector load (although we may
 * have to permute the result).
 */
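/* Background on the big-endian LOAD_ROW() below: vec_ld() ignores the low
 * four bits of the address, so a load from an 8-byte-offset pointer returns
 * the enclosing aligned 16-byte block, and vec_lvsl() supplies the permute
 * pattern that rotates the wanted 8 samples into position.
 */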
#if __BIG_ENDIAN__

#define LOAD_ROW(row) { \
  elemptr = sample_data[row] + start_col; \
  in##row = vec_ld(0, elemptr); \
  if ((size_t)elemptr & 15) \
    in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
}

#else

#define LOAD_ROW(row) { \
  elemptr = sample_data[row] + start_col; \
  in##row = vec_vsx_ld(0, elemptr); \
}

#endif


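/* jsimd_convsamp_altivec() widens one 8x8 block of 8-bit samples to 16-bit
 * DCTELEMs and level-shifts them by subtracting CENTERJSAMPLE, storing the
 * result in the DCT workspace.
 */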
void jsimd_convsamp_altivec(JSAMPARRAY sample_data, JDIMENSION start_col,
                            DCTELEM *workspace)
{
  JSAMPROW elemptr;

  __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
  __vector short out0, out1, out2, out3, out4, out5, out6, out7;

  /* Constants */
  __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
  __vector unsigned char pb_zero = { __16X(0) };

  LOAD_ROW(0);
  LOAD_ROW(1);
  LOAD_ROW(2);
  LOAD_ROW(3);
  LOAD_ROW(4);
  LOAD_ROW(5);
  LOAD_ROW(6);
  LOAD_ROW(7);

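  /* Zero-extend the 8-bit samples to 16-bit DCTELEMs.  (VEC_UNPACKHU
   * presumably comes from jsimd_altivec.h and merges with pb_zero so that no
   * sign extension occurs.)
   */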
  out0 = (__vector short)VEC_UNPACKHU(in0);
  out1 = (__vector short)VEC_UNPACKHU(in1);
  out2 = (__vector short)VEC_UNPACKHU(in2);
  out3 = (__vector short)VEC_UNPACKHU(in3);
  out4 = (__vector short)VEC_UNPACKHU(in4);
  out5 = (__vector short)VEC_UNPACKHU(in5);
  out6 = (__vector short)VEC_UNPACKHU(in6);
  out7 = (__vector short)VEC_UNPACKHU(in7);

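  /* Level shift: center the samples around zero by subtracting
   * CENTERJSAMPLE (128 for 8-bit samples).
   */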
  out0 = vec_sub(out0, pw_centerjsamp);
  out1 = vec_sub(out1, pw_centerjsamp);
  out2 = vec_sub(out2, pw_centerjsamp);
  out3 = vec_sub(out3, pw_centerjsamp);
  out4 = vec_sub(out4, pw_centerjsamp);
  out5 = vec_sub(out5, pw_centerjsamp);
  out6 = vec_sub(out6, pw_centerjsamp);
  out7 = vec_sub(out7, pw_centerjsamp);

  vec_st(out0, 0, workspace);
  vec_st(out1, 16, workspace);
  vec_st(out2, 32, workspace);
  vec_st(out3, 48, workspace);
  vec_st(out4, 64, workspace);
  vec_st(out5, 80, workspace);
  vec_st(out6, 96, workspace);
  vec_st(out7, 112, workspace);
}


#define WORD_BIT 16

/* There is no AltiVec 16-bit unsigned multiply instruction, hence this.
   We basically need an unsigned equivalent of vec_madds(). */

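/* MULTIPLY() forms the full 32-bit products of the even and odd elements
 * with vec_mule()/vec_mulo(), then uses vec_perm() with shift_pack_index to
 * extract the upper 16 bits of each product, i.e. (vs0 * vs1) >> 16 per
 * element.
 */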
#define MULTIPLY(vs0, vs1, out) { \
  tmpe = vec_mule((__vector unsigned short)vs0, \
                  (__vector unsigned short)vs1); \
  tmpo = vec_mulo((__vector unsigned short)vs0, \
                  (__vector unsigned short)vs1); \
  out = (__vector short)vec_perm((__vector unsigned short)tmpe, \
                                 (__vector unsigned short)tmpo, \
                                 shift_pack_index); \
}

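/* jsimd_quantize_altivec() quantizes one 8x8 block of DCT coefficients.  For
 * each coefficient it takes the absolute value, adds a pre-computed rounding
 * correction, multiplies by a 16-bit reciprocal and a 16-bit scale factor
 * (each multiply keeping only the upper 16 bits), and finally restores the
 * original sign.  The divisors table is assumed to hold the reciprocals,
 * corrections, and scale factors in consecutive 64-element (DCTSIZE2) groups.
 */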
void jsimd_quantize_altivec(JCOEFPTR coef_block, DCTELEM *divisors,
                            DCTELEM *workspace)
{
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
    corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7,
    recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
    scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
  __vector unsigned int tmpe, tmpo;

  /* Constants */
  __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
#if __BIG_ENDIAN__
  __vector unsigned char shift_pack_index =
    { 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29 };
#else
  __vector unsigned char shift_pack_index =
    { 2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 };
#endif

  row0 = vec_ld(0, workspace);
  row1 = vec_ld(16, workspace);
  row2 = vec_ld(32, workspace);
  row3 = vec_ld(48, workspace);
  row4 = vec_ld(64, workspace);
  row5 = vec_ld(80, workspace);
  row6 = vec_ld(96, workspace);
  row7 = vec_ld(112, workspace);

  /* Branch-less absolute value */
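  /* vec_sra() by 15 yields 0 for non-negative elements and -1 (all ones) for
   * negative ones; (row ^ rowNs) - rowNs is then |row|.  The same sign masks
   * are reused at the end of the function to restore the signs.
   */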
  row0s = vec_sra(row0, pw_word_bit_m1);
  row1s = vec_sra(row1, pw_word_bit_m1);
  row2s = vec_sra(row2, pw_word_bit_m1);
  row3s = vec_sra(row3, pw_word_bit_m1);
  row4s = vec_sra(row4, pw_word_bit_m1);
  row5s = vec_sra(row5, pw_word_bit_m1);
  row6s = vec_sra(row6, pw_word_bit_m1);
  row7s = vec_sra(row7, pw_word_bit_m1);
  row0 = vec_xor(row0, row0s);
  row1 = vec_xor(row1, row1s);
  row2 = vec_xor(row2, row2s);
  row3 = vec_xor(row3, row3s);
  row4 = vec_xor(row4, row4s);
  row5 = vec_xor(row5, row5s);
  row6 = vec_xor(row6, row6s);
  row7 = vec_xor(row7, row7s);
  row0 = vec_sub(row0, row0s);
  row1 = vec_sub(row1, row1s);
  row2 = vec_sub(row2, row2s);
  row3 = vec_sub(row3, row3s);
  row4 = vec_sub(row4, row4s);
  row5 = vec_sub(row5, row5s);
  row6 = vec_sub(row6, row6s);
  row7 = vec_sub(row7, row7s);

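  /* Add the rounding correction terms, which appear to be stored in the
   * second 64-element group of the divisors table (byte offset DCTSIZE2 * 2).
   */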
  corr0 = vec_ld(DCTSIZE2 * 2, divisors);
  corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors);
  corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors);
  corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors);
  corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors);
  corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors);
  corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors);
  corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors);

  row0 = vec_add(row0, corr0);
  row1 = vec_add(row1, corr1);
  row2 = vec_add(row2, corr2);
  row3 = vec_add(row3, corr3);
  row4 = vec_add(row4, corr4);
  row5 = vec_add(row5, corr5);
  row6 = vec_add(row6, corr6);
  row7 = vec_add(row7, corr7);

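  /* Multiply by the 16-bit reciprocals (first 64-element group of the
   * divisors table); MULTIPLY() keeps only the upper 16 bits of each product.
   */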
  recip0 = vec_ld(0, divisors);
  recip1 = vec_ld(16, divisors);
  recip2 = vec_ld(32, divisors);
  recip3 = vec_ld(48, divisors);
  recip4 = vec_ld(64, divisors);
  recip5 = vec_ld(80, divisors);
  recip6 = vec_ld(96, divisors);
  recip7 = vec_ld(112, divisors);

  MULTIPLY(row0, recip0, row0);
  MULTIPLY(row1, recip1, row1);
  MULTIPLY(row2, recip2, row2);
  MULTIPLY(row3, recip3, row3);
  MULTIPLY(row4, recip4, row4);
  MULTIPLY(row5, recip5, row5);
  MULTIPLY(row6, recip6, row6);
  MULTIPLY(row7, recip7, row7);

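  /* Multiply by the scale factors (third 64-element group of the divisors
   * table); together with the reciprocal multiply, this approximates division
   * by the quantization step.
   */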
  scale0 = vec_ld(DCTSIZE2 * 4, divisors);
  scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
  scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
  scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
  scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
  scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
  scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
  scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);

  MULTIPLY(row0, scale0, row0);
  MULTIPLY(row1, scale1, row1);
  MULTIPLY(row2, scale2, row2);
  MULTIPLY(row3, scale3, row3);
  MULTIPLY(row4, scale4, row4);
  MULTIPLY(row5, scale5, row5);
  MULTIPLY(row6, scale6, row6);
  MULTIPLY(row7, scale7, row7);

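  /* Restore the original signs using the saved sign masks. */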
  row0 = vec_xor(row0, row0s);
  row1 = vec_xor(row1, row1s);
  row2 = vec_xor(row2, row2s);
  row3 = vec_xor(row3, row3s);
  row4 = vec_xor(row4, row4s);
  row5 = vec_xor(row5, row5s);
  row6 = vec_xor(row6, row6s);
  row7 = vec_xor(row7, row7s);
  row0 = vec_sub(row0, row0s);
  row1 = vec_sub(row1, row1s);
  row2 = vec_sub(row2, row2s);
  row3 = vec_sub(row3, row3s);
  row4 = vec_sub(row4, row4s);
  row5 = vec_sub(row5, row5s);
  row6 = vec_sub(row6, row6s);
  row7 = vec_sub(row7, row7s);

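  /* Store the quantized coefficients. */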
  vec_st(row0, 0, coef_block);
  vec_st(row1, 16, coef_block);
  vec_st(row2, 32, coef_block);
  vec_st(row3, 48, coef_block);
  vec_st(row4, 64, coef_block);
  vec_st(row5, 80, coef_block);
  vec_st(row6, 96, coef_block);
  vec_st(row7, 112, coef_block);
}