/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */

#include "jsimd_altivec.h"


/* NOTE: The address will either be aligned or offset by 8 bytes, so we can
 * always get the data we want by using a single vector load (although we may
 * have to permute the result).
 */
#if __BIG_ENDIAN__

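/* vec_ld() ignores the low-order four bits of the address, so the load always
 * starts at the enclosing 16-byte boundary; when the row pointer is offset by
 * 8 bytes, the vec_lvsl()/vec_perm() pair below rotates the wanted 8 samples
 * to the front of the vector.
 */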
#define LOAD_ROW(row) { \
  elemptr = sample_data[row] + start_col; \
  in##row = vec_ld(0, elemptr); \
  if ((size_t)elemptr & 15) \
    in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
}

#else

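/* vec_vsx_ld() can load from an unaligned address directly, so no fix-up
 * permute is needed on little-endian (VSX) builds.
 */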
#define LOAD_ROW(row) { \
  elemptr = sample_data[row] + start_col; \
  in##row = vec_vsx_ld(0, elemptr); \
}

#endif


void
jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
                        DCTELEM *workspace)
{
  JSAMPROW elemptr;

  __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
  __vector short out0, out1, out2, out3, out4, out5, out6, out7;

  /* Constants */
  __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
  __vector unsigned char pb_zero = { __16X(0) };

  LOAD_ROW(0);
  LOAD_ROW(1);
  LOAD_ROW(2);
  LOAD_ROW(3);
  LOAD_ROW(4);
  LOAD_ROW(5);
  LOAD_ROW(6);
  LOAD_ROW(7);

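  /* Zero-extend the eight samples of each row to 16 bits, then subtract
   * CENTERJSAMPLE (128) so that the workspace holds signed values centered
   * around zero, as the forward DCT expects.
   */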
  out0 = (__vector short)VEC_UNPACKHU(in0);
  out1 = (__vector short)VEC_UNPACKHU(in1);
  out2 = (__vector short)VEC_UNPACKHU(in2);
  out3 = (__vector short)VEC_UNPACKHU(in3);
  out4 = (__vector short)VEC_UNPACKHU(in4);
  out5 = (__vector short)VEC_UNPACKHU(in5);
  out6 = (__vector short)VEC_UNPACKHU(in6);
  out7 = (__vector short)VEC_UNPACKHU(in7);

  out0 = vec_sub(out0, pw_centerjsamp);
  out1 = vec_sub(out1, pw_centerjsamp);
  out2 = vec_sub(out2, pw_centerjsamp);
  out3 = vec_sub(out3, pw_centerjsamp);
  out4 = vec_sub(out4, pw_centerjsamp);
  out5 = vec_sub(out5, pw_centerjsamp);
  out6 = vec_sub(out6, pw_centerjsamp);
  out7 = vec_sub(out7, pw_centerjsamp);

  vec_st(out0, 0, workspace);
  vec_st(out1, 16, workspace);
  vec_st(out2, 32, workspace);
  vec_st(out3, 48, workspace);
  vec_st(out4, 64, workspace);
  vec_st(out5, 80, workspace);
  vec_st(out6, 96, workspace);
  vec_st(out7, 112, workspace);
}



#define WORD_BIT 16

/* There is no AltiVec 16-bit unsigned multiply instruction, hence this.
   We basically need an unsigned equivalent of vec_madds(). */

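/* vec_mule() and vec_mulo() produce full 32-bit products of the even- and
 * odd-numbered 16-bit elements, and the vec_perm() with shift_pack_index
 * gathers the high 16 bits of each product back into element order, so the
 * net effect is an unsigned multiply-high: out[i] = (vs0[i] * vs1[i]) >> 16
 * for each of the eight elements.
 */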
#define MULTIPLY(vs0, vs1, out) { \
  tmpe = vec_mule((__vector unsigned short)vs0, \
                  (__vector unsigned short)vs1); \
  tmpo = vec_mulo((__vector unsigned short)vs0, \
                  (__vector unsigned short)vs1); \
  out = (__vector short)vec_perm((__vector unsigned short)tmpe, \
                                 (__vector unsigned short)tmpo, \
                                 shift_pack_index); \
}

void
jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM *divisors,
                        DCTELEM *workspace)
{
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
    corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7,
    recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
    scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
  __vector unsigned int tmpe, tmpo;

  /* Constants */
  __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
#if __BIG_ENDIAN__
  __vector unsigned char shift_pack_index =
    { 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29 };
#else
  __vector unsigned char shift_pack_index =
    { 2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 };
#endif

  row0 = vec_ld(0, workspace);
  row1 = vec_ld(16, workspace);
  row2 = vec_ld(32, workspace);
  row3 = vec_ld(48, workspace);
  row4 = vec_ld(64, workspace);
  row5 = vec_ld(80, workspace);
  row6 = vec_ld(96, workspace);
  row7 = vec_ld(112, workspace);

  /* Branch-less absolute value */
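  /* row0s..row7s are 0 for non-negative coefficients and -1 (all ones) for
   * negative ones; (x ^ mask) - mask then yields |x|.  The same XOR/subtract
   * pair is applied again after the multiplies to restore the sign.
   */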
  row0s = vec_sra(row0, pw_word_bit_m1);
  row1s = vec_sra(row1, pw_word_bit_m1);
  row2s = vec_sra(row2, pw_word_bit_m1);
  row3s = vec_sra(row3, pw_word_bit_m1);
  row4s = vec_sra(row4, pw_word_bit_m1);
  row5s = vec_sra(row5, pw_word_bit_m1);
  row6s = vec_sra(row6, pw_word_bit_m1);
  row7s = vec_sra(row7, pw_word_bit_m1);
  row0 = vec_xor(row0, row0s);
  row1 = vec_xor(row1, row1s);
  row2 = vec_xor(row2, row2s);
  row3 = vec_xor(row3, row3s);
  row4 = vec_xor(row4, row4s);
  row5 = vec_xor(row5, row5s);
  row6 = vec_xor(row6, row6s);
  row7 = vec_xor(row7, row7s);
  row0 = vec_sub(row0, row0s);
  row1 = vec_sub(row1, row1s);
  row2 = vec_sub(row2, row2s);
  row3 = vec_sub(row3, row3s);
  row4 = vec_sub(row4, row4s);
  row5 = vec_sub(row5, row5s);
  row6 = vec_sub(row6, row6s);
  row7 = vec_sub(row7, row7s);

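  /* The divisors table consists of four consecutive DCTSIZE2-element groups:
   * reciprocals, correction terms, scale factors, and shift counts (see
   * compute_reciprocal() in jcdctmgr.c).  vec_ld() takes byte offsets, so
   * DCTSIZE2 * 2 below addresses the correction group and DCTSIZE2 * 4 the
   * scale group; the shift group is not used here.
   */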
  corr0 = vec_ld(DCTSIZE2 * 2, divisors);
  corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors);
  corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors);
  corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors);
  corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors);
  corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors);
  corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors);
  corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors);

  row0 = vec_add(row0, corr0);
  row1 = vec_add(row1, corr1);
  row2 = vec_add(row2, corr2);
  row3 = vec_add(row3, corr3);
  row4 = vec_add(row4, corr4);
  row5 = vec_add(row5, corr5);
  row6 = vec_add(row6, corr6);
  row7 = vec_add(row7, corr7);

  recip0 = vec_ld(0, divisors);
  recip1 = vec_ld(16, divisors);
  recip2 = vec_ld(32, divisors);
  recip3 = vec_ld(48, divisors);
  recip4 = vec_ld(64, divisors);
  recip5 = vec_ld(80, divisors);
  recip6 = vec_ld(96, divisors);
  recip7 = vec_ld(112, divisors);

  MULTIPLY(row0, recip0, row0);
  MULTIPLY(row1, recip1, row1);
  MULTIPLY(row2, recip2, row2);
  MULTIPLY(row3, recip3, row3);
  MULTIPLY(row4, recip4, row4);
  MULTIPLY(row5, recip5, row5);
  MULTIPLY(row6, recip6, row6);
  MULTIPLY(row7, recip7, row7);

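  /* The reciprocal multiply above only shifts the product right by 16 bits;
   * multiplying by the pre-computed scale factor and again keeping the high
   * halves applies the remaining right shift that the reciprocal requires.
   */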
  scale0 = vec_ld(DCTSIZE2 * 4, divisors);
  scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
  scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
  scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
  scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
  scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
  scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
  scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);

  MULTIPLY(row0, scale0, row0);
  MULTIPLY(row1, scale1, row1);
  MULTIPLY(row2, scale2, row2);
  MULTIPLY(row3, scale3, row3);
  MULTIPLY(row4, scale4, row4);
  MULTIPLY(row5, scale5, row5);
  MULTIPLY(row6, scale6, row6);
  MULTIPLY(row7, scale7, row7);

  row0 = vec_xor(row0, row0s);
  row1 = vec_xor(row1, row1s);
  row2 = vec_xor(row2, row2s);
  row3 = vec_xor(row3, row3s);
  row4 = vec_xor(row4, row4s);
  row5 = vec_xor(row5, row5s);
  row6 = vec_xor(row6, row6s);
  row7 = vec_xor(row7, row7s);
  row0 = vec_sub(row0, row0s);
  row1 = vec_sub(row1, row1s);
  row2 = vec_sub(row2, row2s);
  row3 = vec_sub(row3, row3s);
  row4 = vec_sub(row4, row4s);
  row5 = vec_sub(row5, row5s);
  row6 = vec_sub(row6, row6s);
  row7 = vec_sub(row7, row7s);

  vec_st(row0, 0, coef_block);
  vec_st(row1, 16, coef_block);
  vec_st(row2, 32, coef_block);
  vec_st(row3, 48, coef_block);
  vec_st(row4, 64, coef_block);
  vec_st(row5, 80, coef_block);
  vec_st(row6, 96, coef_block);
  vec_st(row7, 112, coef_block);
}