1 // qcms
2 // Copyright (C) 2009 Mozilla Foundation
3 //
4 // Permission is hereby granted, free of charge, to any person obtaining
5 // a copy of this software and associated documentation files (the "Software"),
6 // to deal in the Software without restriction, including without limitation
7 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 // and/or sell copies of the Software, and to permit persons to whom the Software
9 // is furnished to do so, subject to the following conditions:
10 //
11 // The above copyright notice and this permission notice shall be included in
12 // all copies or substantial portions of the Software.
13 //
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
16 // THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21
22 #include <emmintrin.h>
23
24 #include "qcmsint.h"
25
26 /* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
27 #define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE)
28 #define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE )
29 static const ALIGN float floatScaleX4[4] =
30 { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
31 static const ALIGN float clampMaxValueX4[4] =
32 { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
33
/*
 * Transform `length` packed 3-byte pixels from `src` to `dest` using SSE2.
 *
 * For every pixel the three source bytes index the per-channel input gamma
 * tables, the resulting floats are multiplied by rows 0..2 of
 * transform->matrix and summed, the sums are clamped to [0, CLAMPMAXVAL],
 * scaled by FLOATSCALE, converted to integers, and finally used as indices
 * into the per-channel output tables.
 *
 * transform     - supplies matrix, input_gamma_table_{r,g,b} and
 *                 output_table_{r,g,b}; the matrix rows are read with aligned
 *                 loads and must therefore be 16-byte aligned.
 * src           - source pixels, 3 bytes each; bytes 0/1/2 index the r/g/b
 *                 input gamma tables respectively.
 * dest          - destination pixels, 3 bytes each.
 * length        - number of pixels; 0 is a no-op.
 * output_format - .r and .b give the byte offsets inside a destination pixel
 *                 for the first and third computed channels; the second is
 *                 always written at offset 1.
 *
 * The loop is software-pipelined: the gamma-table loads for pixel N+1 are
 * issued while the indices of pixel N are still being stored, which is why
 * one pixel is peeled out of the loop and handled at the end.
 */
void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
                                          unsigned char *src,
                                          unsigned char *dest,
                                          size_t length,
                                          qcms_format_type output_format)
{
    /* Same type as `length`: a narrower counter would wrap before reaching
     * a length above UINT_MAX and the loop would never terminate. */
    size_t i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* Carve a 16-byte aligned, 16-byte wide scratch slot out of input_back
     * by hand, so the aligned store below is valid whatever the incoming
     * stack alignment is (alignment attributes on stack variables were not
     * reliable on the compilers this code targets; see gcc bug #16660).
     * Rounding &input_back[16] down to a multiple of 16 always lands inside
     * the 32-byte array with 16 bytes to spare. */
    uint32_t *output =
        (uint32_t *)(((uintptr_t)&input_back[16]) & ~(uintptr_t)0xf);

    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];

    /* input matrix values never change */
    const __m128 mat0 = _mm_load_ps(mat[0]);
    const __m128 mat1 = _mm_load_ps(mat[1]);
    const __m128 mat2 = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max = _mm_load_ps(clampMaxValueX4);
    const __m128 min = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);

    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;
    const int r_out = output_format.r;
    const int b_out = output_format.b;

    /* nothing to do for an empty run */
    if (!length)
        return;

    /* one pixel is handled outside of the loop */
    length--;

    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    src += 3;

    /* transform all but final pixel */
    for (i = 0; i < length; i++)
    {
        /* broadcast each gamma value to all four lanes */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        /* gamma * matrix */
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);

        /* Sum the three products and clamp to [0, CLAMPMAXVAL].
         * Operand order of the max matters: MAXPS yields its second operand
         * when either input is NaN, so with the constant second a NaN lane
         * becomes 0 instead of surviving the clamp and turning into
         * 0x80000000 in the integer conversion below. */
        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r = _mm_max_ps(vec_r, min);
        vec_r = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);

        /* store calc'd output tables indices */
        _mm_store_si128((__m128i *)output, _mm_cvtps_epi32(result));

        /* load for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 3;

        /* use calc'd indices to output RGB values (lane 3 is unused) */
        dest[r_out] = otdata_r[output[0]];
        dest[1]     = otdata_g[output[1]];
        dest[b_out] = otdata_b[output[2]];
        dest += 3;
    }

    /* handle final (maybe only) pixel: same math, no look-ahead load */
    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r = _mm_max_ps(vec_r, min);  /* NaN -> 0, see loop body */
    vec_r = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);

    _mm_store_si128((__m128i *)output, _mm_cvtps_epi32(result));

    dest[r_out] = otdata_r[output[0]];
    dest[1]     = otdata_g[output[1]];
    dest[b_out] = otdata_b[output[2]];
}
147
/*
 * Transform `length` packed 4-byte pixels from `src` to `dest` using SSE2.
 *
 * For every pixel the first three source bytes index the per-channel input
 * gamma tables, the resulting floats are multiplied by rows 0..2 of
 * transform->matrix and summed, the sums are clamped to [0, CLAMPMAXVAL],
 * scaled by FLOATSCALE, converted to integers, and finally used as indices
 * into the per-channel output tables.  The fourth byte (alpha) is copied
 * from src[3] to dest[3] unchanged.
 *
 * transform     - supplies matrix, input_gamma_table_{r,g,b} and
 *                 output_table_{r,g,b}; the matrix rows are read with aligned
 *                 loads and must therefore be 16-byte aligned.
 * src           - source pixels, 4 bytes each; bytes 0/1/2 index the r/g/b
 *                 input gamma tables respectively, byte 3 is alpha.
 * dest          - destination pixels, 4 bytes each.
 * length        - number of pixels; 0 is a no-op.
 * output_format - .r and .b give the byte offsets inside a destination pixel
 *                 for the first and third computed channels; the second is
 *                 always written at offset 1 and alpha at offset 3.
 *
 * The loop is software-pipelined: the gamma-table loads and the alpha byte
 * for pixel N+1 are fetched while pixel N is still being finished, which is
 * why one pixel is peeled out of the loop and handled at the end.
 */
void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length,
                                           qcms_format_type output_format)
{
    /* Same type as `length`: a narrower counter would wrap before reaching
     * a length above UINT_MAX and the loop would never terminate. */
    size_t i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* Carve a 16-byte aligned, 16-byte wide scratch slot out of input_back
     * by hand, so the aligned store below is valid whatever the incoming
     * stack alignment is (alignment attributes on stack variables were not
     * reliable on the compilers this code targets; see gcc bug #16660).
     * Rounding &input_back[16] down to a multiple of 16 always lands inside
     * the 32-byte array with 16 bytes to spare. */
    uint32_t *output =
        (uint32_t *)(((uintptr_t)&input_back[16]) & ~(uintptr_t)0xf);

    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];

    /* input matrix values never change */
    const __m128 mat0 = _mm_load_ps(mat[0]);
    const __m128 mat1 = _mm_load_ps(mat[1]);
    const __m128 mat2 = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max = _mm_load_ps(clampMaxValueX4);
    const __m128 min = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);

    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;
    const int r_out = output_format.r;
    const int b_out = output_format.b;
    unsigned char alpha;

    /* nothing to do for an empty run */
    if (!length)
        return;

    /* one pixel is handled outside of the loop */
    length--;

    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    alpha = src[3];
    src += 4;

    /* transform all but final pixel */
    for (i = 0; i < length; i++)
    {
        /* broadcast each gamma value to all four lanes */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        /* gamma * matrix */
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);

        /* store alpha for this pixel; load alpha for next
         * (src already points at the next pixel here) */
        dest[3] = alpha;
        alpha = src[3];

        /* Sum the three products and clamp to [0, CLAMPMAXVAL].
         * Operand order of the max matters: MAXPS yields its second operand
         * when either input is NaN, so with the constant second a NaN lane
         * becomes 0 instead of surviving the clamp and turning into
         * 0x80000000 in the integer conversion below. */
        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r = _mm_max_ps(vec_r, min);
        vec_r = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);

        /* store calc'd output tables indices */
        _mm_store_si128((__m128i *)output, _mm_cvtps_epi32(result));

        /* load gamma values for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 4;

        /* use calc'd indices to output RGB values (lane 3 is unused) */
        dest[r_out] = otdata_r[output[0]];
        dest[1]     = otdata_g[output[1]];
        dest[b_out] = otdata_b[output[2]];
        dest += 4;
    }

    /* handle final (maybe only) pixel: same math, no look-ahead load */
    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    dest[3] = alpha;

    vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r = _mm_max_ps(vec_r, min);  /* NaN -> 0, see loop body */
    vec_r = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);

    _mm_store_si128((__m128i *)output, _mm_cvtps_epi32(result));

    dest[r_out] = otdata_r[output[0]];
    dest[1]     = otdata_g[output[1]];
    dest[b_out] = otdata_b[output[2]];
}
269