1 /*
2 * AltiVec optimizations for libjpeg-turbo
3 *
4 * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
5 *
6 * This software is provided 'as-is', without any express or implied
7 * warranty. In no event will the authors be held liable for any damages
8 * arising from the use of this software.
9 *
10 * Permission is granted to anyone to use this software for any purpose,
11 * including commercial applications, and to alter it and redistribute it
12 * freely, subject to the following restrictions:
13 *
14 * 1. The origin of this software must not be misrepresented; you must not
15 * claim that you wrote the original software. If you use this software
16 * in a product, an acknowledgment in the product documentation would be
17 * appreciated but is not required.
18 * 2. Altered source versions must be plainly marked as such, and must not be
19 * misrepresented as being the original software.
20 * 3. This notice may not be removed or altered from any source distribution.
21 */
22
23 /* CHROMA UPSAMPLING */
24
25 #include "jsimd_altivec.h"
26
27
/*
 * Fancy (triangular-filter) 2:1 horizontal chroma upsampling.
 *
 * Each input sample s[i] produces two output samples:
 *   out[2i]   = (3 * s[i] + s[i-1] + 1) / 4
 *   out[2i+1] = (3 * s[i] + s[i+1] + 2) / 4
 * with the first/last column duplicated at the row edges.  16 input samples
 * (32 output samples) are processed per loop iteration.
 *
 * max_v_samp_factor = number of rows to process
 * downsampled_width = width of each input row, in samples
 * input_data        = input sample rows
 * *output_data_ptr  = output sample rows (each twice as wide as the input)
 */
void jsimd_h2v1_fancy_upsample_altivec(int max_v_samp_factor,
                                       JDIMENSION downsampled_width,
                                       JSAMPARRAY input_data,
                                       JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr;
  int inrow, incol;

  __vector unsigned char this0, last0, p_last0, next0 = { 0 }, p_next0,
    out;
  __vector short this0e, this0o, this0l, this0h, last0l, last0h,
    next0l, next0h, outle, outhe, outlo, outho;

  /* Constants */
  /* NOTE(review): pb_zero is not referenced directly in this body;
   * presumably the VEC_UNPACKHU/VEC_UNPACKLU macros in jsimd_altivec.h
   * expand to it -- confirm against that header.
   */
  __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
    /* vec_perm pattern: rotate right one byte, duplicating byte 0 into the
     * first slot (edge handling for the first chunk of a row).
     */
    last_index_col0 =
      { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
    /* vec_perm pattern: rotate right one byte across a two-vector pair. */
    last_index =
      { 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 },
    /* vec_perm pattern: rotate left one byte across a two-vector pair. */
    next_index =
      { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 },
    /* vec_perm pattern: rotate left one byte, duplicating the last byte
     * (edge handling for the final chunk of a row).
     */
    next_index_lastcol =
      { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 },
#if __BIG_ENDIAN__
    /* vec_perm pattern: take the low byte of each 16-bit lane, interleaving
     * the even-output and odd-output vectors (endian-dependent byte order).
     */
    merge_pack_index =
      { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 };
#else
    merge_pack_index =
      { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 };
#endif
  __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };

  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr = output_data[inrow];

    /* If the row length isn't a multiple of 16, duplicate the last sample
     * so the final partial vector sees a valid "next" neighbor.
     */
    if (downsampled_width & 15)
      inptr[downsampled_width] = inptr[downsampled_width - 1];

    /* Prime the pipeline: for the first chunk, "last" of column 0 is
     * column 0 itself.
     */
    this0 = vec_ld(0, inptr);
    p_last0 = vec_perm(this0, this0, last_index_col0);
    last0 = this0;

    for (incol = downsampled_width; incol > 0;
         incol -= 16, inptr += 16, outptr += 32) {

      if (downsampled_width - incol > 0) {
        /* Not the first chunk: "last" spans the previous and current
         * vectors.
         */
        p_last0 = vec_perm(last0, this0, last_index);
        last0 = this0;
      }

      if (incol <= 16)
        /* Final chunk: "next" duplicates the last column. */
        p_next0 = vec_perm(this0, this0, next_index_lastcol);
      else {
        next0 = vec_ld(16, inptr);
        p_next0 = vec_perm(this0, next0, next_index);
      }

      /* 3 * this, widened to 16 bits: even/odd lane products, then merged
       * back into low/high halves in original sample order.
       */
      this0e = (__vector short)vec_mule(this0, pb_three);
      this0o = (__vector short)vec_mulo(this0, pb_three);
      this0l = vec_mergeh(this0e, this0o);
      this0h = vec_mergel(this0e, this0o);

      /* Widen neighbors to 16 bits and pre-add the rounding constants:
       * +1 for the even (left) outputs, +2 for the odd (right) outputs.
       */
      last0l = (__vector short)VEC_UNPACKHU(p_last0);
      last0h = (__vector short)VEC_UNPACKLU(p_last0);
      last0l = vec_add(last0l, pw_one);

      next0l = (__vector short)VEC_UNPACKHU(p_next0);
      next0h = (__vector short)VEC_UNPACKLU(p_next0);
      next0l = vec_add(next0l, pw_two);

      /* Even outputs: (3 * this + last + 1) >> 2
       * Odd outputs:  (3 * this + next + 2) >> 2
       */
      outle = vec_add(this0l, last0l);
      outlo = vec_add(this0l, next0l);
      outle = vec_sr(outle, (__vector unsigned short)pw_two);
      outlo = vec_sr(outlo, (__vector unsigned short)pw_two);

      /* Re-pack the 16-bit even/odd results into interleaved bytes. */
      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr);

      if (incol > 8) {
        /* More than 8 input columns remain, so the high half also
         * produces real output.
         */
        last0h = vec_add(last0h, pw_one);
        next0h = vec_add(next0h, pw_two);

        outhe = vec_add(this0h, last0h);
        outho = vec_add(this0h, next0h);
        outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
        outho = vec_sr(outho, (__vector unsigned short)pw_two);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr);
      }

      /* The current "next" becomes the next iteration's "this". */
      this0 = next0;
    }
  }
}
127
128
/*
 * Fancy (triangular-filter) 2:2 chroma upsampling.
 *
 * Each input sample expands to a 2x2 block of output samples.  First a
 * vertically-weighted 16-bit column sum is formed for each output row:
 *   colsum = 3 * this_row + neighbor_row
 * (neighbor_row is the row above for the upper output row, the row below
 * for the lower output row).  Then horizontally:
 *   out[2i]   = (3 * colsum[i] + colsum[i-1] + 8) >> 4
 *   out[2i+1] = (3 * colsum[i] + colsum[i+1] + 7) >> 4
 * with edge columns duplicated.  16 input columns (32 output columns, two
 * output rows) are processed per loop iteration.
 *
 * NOTE(review): inptr_1/inptr1 read input_data[inrow - 1] and
 * input_data[inrow + 1]; presumably the caller supplies context rows above
 * and below the image proper -- confirm against the upsample dispatcher.
 */
void jsimd_h2v2_fancy_upsample_altivec(int max_v_samp_factor,
                                       JDIMENSION downsampled_width,
                                       JSAMPARRAY input_data,
                                       JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  /* _1 suffix = row above, 1 suffix = row below, 0 = current row. */
  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
  int inrow, outrow, incol;

  __vector unsigned char this_1, this0, this1, out;
  __vector short this_1l, this_1h, this0l, this0h, this1l, this1h,
    lastcolsum_1h, lastcolsum1h,
    p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
    thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
    nextcolsum_1l = { 0 }, nextcolsum_1h = { 0 },
    nextcolsum1l = { 0 }, nextcolsum1h = { 0 },
    p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
    tmpl, tmph, outle, outhe, outlo, outho;

  /* Constants */
  /* The perm patterns below shift by two bytes (one 16-bit column sum),
   * unlike the one-byte shifts used in the h2v1 routine.
   */
  __vector unsigned char pb_zero = { __16X(0) },
    /* Rotate right one 16-bit element, duplicating element 0 (edge
     * handling for the first chunk of a row).
     */
    last_index_col0 =
      { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
    /* Rotate right one 16-bit element across a two-vector pair. */
    last_index =
      { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 },
    /* Rotate left one 16-bit element across a two-vector pair. */
    next_index =
      { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 },
    /* Rotate left one 16-bit element, duplicating the last element (edge
     * handling for the final chunk of a row).
     */
    next_index_lastcol =
      { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15 },
#if __BIG_ENDIAN__
    /* Take the low byte of each 16-bit lane, interleaving the even-output
     * and odd-output vectors (endian-dependent byte order).
     */
    merge_pack_index =
      { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 };
#else
    merge_pack_index =
      { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 };
#endif
  __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
    pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
  __vector unsigned short pw_four = { __8X(4) };

  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {

    inptr_1 = input_data[inrow - 1];
    inptr0 = input_data[inrow];
    inptr1 = input_data[inrow + 1];
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    /* If the row length isn't a multiple of 16, duplicate the last sample
     * of each context row so partial vectors see valid neighbors.
     */
    if (downsampled_width & 15) {
      inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
      inptr0[downsampled_width] = inptr0[downsampled_width - 1];
      inptr1[downsampled_width] = inptr1[downsampled_width - 1];
    }

    /* Prime the pipeline: 3 * current row, widened to 16 bits. */
    this0 = vec_ld(0, inptr0);
    this0l = (__vector short)VEC_UNPACKHU(this0);
    this0h = (__vector short)VEC_UNPACKLU(this0);
    this0l = vec_mladd(this0l, pw_three, pw_zero);
    this0h = vec_mladd(this0h, pw_three, pw_zero);

    /* Column sums for the upper output row: 3 * this + row above.
     * For column 0, "last" is column 0 itself.
     */
    this_1 = vec_ld(0, inptr_1);
    this_1l = (__vector short)VEC_UNPACKHU(this_1);
    this_1h = (__vector short)VEC_UNPACKLU(this_1);
    thiscolsum_1l = vec_add(this0l, this_1l);
    thiscolsum_1h = vec_add(this0h, this_1h);
    lastcolsum_1h = thiscolsum_1h;
    p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0);
    p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);

    /* Column sums for the lower output row: 3 * this + row below. */
    this1 = vec_ld(0, inptr1);
    this1l = (__vector short)VEC_UNPACKHU(this1);
    this1h = (__vector short)VEC_UNPACKLU(this1);
    thiscolsum1l = vec_add(this0l, this1l);
    thiscolsum1h = vec_add(this0h, this1h);
    lastcolsum1h = thiscolsum1h;
    p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);
    p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);

    for (incol = downsampled_width; incol > 0;
         incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
         outptr0 += 32, outptr1 += 32) {

      if (downsampled_width - incol > 0) {
        /* Not the first chunk: "last" column sums span the previous and
         * current sum vectors.
         */
        p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);
        p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
        p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);
        p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
        lastcolsum_1h = thiscolsum_1h; lastcolsum1h = thiscolsum1h;
      }

      if (incol <= 16) {
        /* Final chunk: "next" duplicates the last column sum. */
        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
        p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,
                                   next_index_lastcol);
        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
        p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h,
                                  next_index_lastcol);
      } else {
        /* Compute the next chunk's column sums so "next" can borrow their
         * first column.
         */
        this0 = vec_ld(16, inptr0);
        this0l = (__vector short)VEC_UNPACKHU(this0);
        this0h = (__vector short)VEC_UNPACKLU(this0);
        this0l = vec_mladd(this0l, pw_three, pw_zero);
        this0h = vec_mladd(this0h, pw_three, pw_zero);

        this_1 = vec_ld(16, inptr_1);
        this_1l = (__vector short)VEC_UNPACKHU(this_1);
        this_1h = (__vector short)VEC_UNPACKLU(this_1);
        nextcolsum_1l = vec_add(this0l, this_1l);
        nextcolsum_1h = vec_add(this0h, this_1h);
        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
        p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);

        this1 = vec_ld(16, inptr1);
        this1l = (__vector short)VEC_UNPACKHU(this1);
        this1h = (__vector short)VEC_UNPACKLU(this1);
        nextcolsum1l = vec_add(this0l, this1l);
        nextcolsum1h = vec_add(this0h, this1h);
        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
        p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index);
      }

      /* Process the upper row */

      /* Even outputs: (3 * colsum + last colsum + 8) >> 4
       * Odd outputs:  (3 * colsum + next colsum + 7) >> 4
       */
      tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero);
      outle = vec_add(tmpl, p_lastcolsum_1l);
      outle = vec_add(outle, pw_eight);
      outle = vec_sr(outle, pw_four);

      outlo = vec_add(tmpl, p_nextcolsum_1l);
      outlo = vec_add(outlo, pw_seven);
      outlo = vec_sr(outlo, pw_four);

      /* Re-pack the 16-bit even/odd results into interleaved bytes. */
      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr0);

      if (incol > 8) {
        /* More than 8 input columns remain: the high half also produces
         * real output.
         */
        tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
        outhe = vec_add(tmph, p_lastcolsum_1h);
        outhe = vec_add(outhe, pw_eight);
        outhe = vec_sr(outhe, pw_four);

        outho = vec_add(tmph, p_nextcolsum_1h);
        outho = vec_add(outho, pw_seven);
        outho = vec_sr(outho, pw_four);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr0);
      }

      /* Process the lower row */

      tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);
      outle = vec_add(tmpl, p_lastcolsum1l);
      outle = vec_add(outle, pw_eight);
      outle = vec_sr(outle, pw_four);

      outlo = vec_add(tmpl, p_nextcolsum1l);
      outlo = vec_add(outlo, pw_seven);
      outlo = vec_sr(outlo, pw_four);

      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr1);

      if (incol > 8) {
        tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
        outhe = vec_add(tmph, p_lastcolsum1h);
        outhe = vec_add(outhe, pw_eight);
        outhe = vec_sr(outhe, pw_four);

        outho = vec_add(tmph, p_nextcolsum1h);
        outho = vec_add(outho, pw_seven);
        outho = vec_sr(outho, pw_four);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr1);
      }

      /* The current "next" sums become the next iteration's "this". */
      thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h;
      thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h;
    }
  }
}
315
316
317 /* These are rarely used (mainly just for decompressing YCCK images) */
318
/*
 * Plain (non-filtering) 2:1 horizontal upsampling: every input sample is
 * simply duplicated into two adjacent output samples.  32 input samples
 * (64 output samples) are handled per loop iteration; output_width is
 * rounded up so partial chunks are still written.
 *
 * max_v_samp_factor = number of rows to process
 * output_width      = width of each output row, in samples
 * input_data        = input sample rows
 * *output_data_ptr  = output sample rows
 */
void jsimd_h2v1_upsample_altivec(int max_v_samp_factor,
                                 JDIMENSION output_width,
                                 JSAMPARRAY input_data,
                                 JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr;
  int inrow, incol;

  __vector unsigned char samples, dup_lo, dup_hi;

  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr = output_data[inrow];

    incol = (output_width + 31) & (~31);
    while (incol > 0) {
      /* Merging a vector with itself duplicates each byte, doubling the
       * first 16 samples into 32 output samples.
       */
      samples = vec_ld(0, inptr);
      dup_lo = vec_mergeh(samples, samples);
      dup_hi = vec_mergel(samples, samples);

      vec_st(dup_lo, 0, outptr);
      vec_st(dup_hi, 16, outptr);

      if (incol > 32) {
        /* Second 16 input samples of this 32-sample chunk. */
        samples = vec_ld(16, inptr);
        dup_lo = vec_mergeh(samples, samples);
        dup_hi = vec_mergel(samples, samples);

        vec_st(dup_lo, 32, outptr);
        vec_st(dup_hi, 48, outptr);
      }

      incol -= 64;  inptr += 32;  outptr += 64;
    }
  }
}
355
356
/*
 * Plain (non-filtering) 2:2 upsampling: every input sample is replicated
 * into a 2x2 block of output samples.  Each input row is doubled
 * horizontally and then stored to two consecutive output rows; 32 input
 * samples are handled per loop iteration, with output_width rounded up so
 * partial chunks are still written.
 *
 * max_v_samp_factor = number of output rows to produce
 * output_width      = width of each output row, in samples
 * input_data        = input sample rows
 * *output_data_ptr  = output sample rows
 */
void jsimd_h2v2_upsample_altivec(int max_v_samp_factor,
                                 JDIMENSION output_width,
                                 JSAMPARRAY input_data,
                                 JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr0, outptr1;
  int inrow, outrow, incol;

  __vector unsigned char samples, dup_lo, dup_hi;

  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {

    /* One input row feeds two identical output rows. */
    inptr = input_data[inrow];
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    incol = (output_width + 31) & (~31);
    while (incol > 0) {
      /* Merging a vector with itself duplicates each byte, doubling the
       * first 16 samples; the result goes to both output rows.
       */
      samples = vec_ld(0, inptr);
      dup_lo = vec_mergeh(samples, samples);
      dup_hi = vec_mergel(samples, samples);

      vec_st(dup_lo, 0, outptr0);
      vec_st(dup_lo, 0, outptr1);

      vec_st(dup_hi, 16, outptr0);
      vec_st(dup_hi, 16, outptr1);

      if (incol > 32) {
        /* Second 16 input samples of this 32-sample chunk. */
        samples = vec_ld(16, inptr);
        dup_lo = vec_mergeh(samples, samples);
        dup_hi = vec_mergel(samples, samples);

        vec_st(dup_lo, 32, outptr0);
        vec_st(dup_lo, 32, outptr1);

        vec_st(dup_hi, 48, outptr0);
        vec_st(dup_hi, 48, outptr1);
      }

      incol -= 64;  inptr += 32;  outptr0 += 64;  outptr1 += 64;
    }
  }
}
401