// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 variant of methods for lossless decoder
//
// Author: Skal (pascal.massimino@gmail.com)

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_SSE2)

#include "src/dsp/common_sse2.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
#include <emmintrin.h>

//------------------------------------------------------------------------------
// Predictor Transform

static WEBP_INLINE uint32_t ClampedAddSubtractFull_SSE2(uint32_t c0,
                                                        uint32_t c1,
                                                        uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c0), zero);
  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c1), zero);
  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c2), zero);
  const __m128i V1 = _mm_add_epi16(C0, C1);
  const __m128i V2 = _mm_sub_epi16(V1, C2);
  const __m128i b = _mm_packus_epi16(V2, V2);
  return (uint32_t)_mm_cvtsi128_si32(b);
}

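// ClampedAddSubtractHalf: per channel, computes clamp(avg + (avg - c2) / 2)
// with avg = (c0 + c1) >> 1. The (B0 > A0) comparison mask is -1 exactly when
// (avg - c2) is negative; subtracting it before the arithmetic shift makes the
// halving round toward zero, matching the integer division used by the C
// reference implementation.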
static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0,
                                                        uint32_t c1,
                                                        uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c0), zero);
  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c1), zero);
  const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c2), zero);
  const __m128i avg = _mm_add_epi16(C1, C0);
  const __m128i A0 = _mm_srli_epi16(avg, 1);
  const __m128i A1 = _mm_sub_epi16(A0, B0);
  const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
  const __m128i A2 = _mm_sub_epi16(A1, BgtA);
  const __m128i A3 = _mm_srai_epi16(A2, 1);
  const __m128i A4 = _mm_add_epi16(A0, A3);
  const __m128i A5 = _mm_packus_epi16(A4, A4);
  return (uint32_t)_mm_cvtsi128_si32(A5);
}

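// Select predictor: picks between the top pixel 'a' and the left pixel 'b',
// whichever is closer to the gradient estimate (top + left - top_left). Since
// |estimate - top| == |left - top_left| and |estimate - left| == |top - top_left|,
// only the absolute differences against 'c' (= top_left) are needed.
// Per-byte |x - y| is built as (x -sat y) | (y -sat x) with saturating
// subtraction, avoiding a widening step before the comparison.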
static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {
  int pa_minus_pb;
  const __m128i zero = _mm_setzero_si128();
  const __m128i A0 = _mm_cvtsi32_si128((int)a);
  const __m128i B0 = _mm_cvtsi32_si128((int)b);
  const __m128i C0 = _mm_cvtsi32_si128((int)c);
  const __m128i AC0 = _mm_subs_epu8(A0, C0);
  const __m128i CA0 = _mm_subs_epu8(C0, A0);
  const __m128i BC0 = _mm_subs_epu8(B0, C0);
  const __m128i CB0 = _mm_subs_epu8(C0, B0);
  const __m128i AC = _mm_or_si128(AC0, CA0);
  const __m128i BC = _mm_or_si128(BC0, CB0);
  const __m128i pa = _mm_unpacklo_epi8(AC, zero);  // |a - c|
  const __m128i pb = _mm_unpacklo_epi8(BC, zero);  // |b - c|
  const __m128i diff = _mm_sub_epi16(pb, pa);
  {
    int16_t out[8];
    _mm_storeu_si128((__m128i*)out, diff);
    pa_minus_pb = out[0] + out[1] + out[2] + out[3];
  }
  return (pa_minus_pb <= 0) ? a : b;
}

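// Note: _mm_avg_epu8 computes the rounded-up average (a + b + 1) >> 1;
// subtracting (a ^ b) & 1 turns it into the truncating average required by
// the predictor definitions.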
static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
                                       const __m128i* const a1,
                                       __m128i* const avg) {
  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
  const __m128i ones = _mm_set1_epi8(1);
  const __m128i avg1 = _mm_avg_epu8(*a0, *a1);
  const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);
  *avg = _mm_sub_epi8(avg1, one);
}

static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0,
                                             const uint32_t a1,
                                             __m128i* const avg) {
  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
  const __m128i ones = _mm_set1_epi8(1);
  const __m128i A0 = _mm_cvtsi32_si128((int)a0);
  const __m128i A1 = _mm_cvtsi32_si128((int)a1);
  const __m128i avg1 = _mm_avg_epu8(A0, A1);
  const __m128i one = _mm_and_si128(_mm_xor_si128(A0, A1), ones);
  *avg = _mm_sub_epi8(avg1, one);
}

static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a0), zero);
  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a1), zero);
  const __m128i sum = _mm_add_epi16(A1, A0);
  return _mm_srli_epi16(sum, 1);
}

static WEBP_INLINE uint32_t Average2_SSE2(uint32_t a0, uint32_t a1) {
  __m128i output;
  Average2_uint32_SSE2(a0, a1, &output);
  return (uint32_t)_mm_cvtsi128_si32(output);
}

static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1,
                                          uint32_t a2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i avg1 = Average2_uint32_16_SSE2(a0, a2);
  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a1), zero);
  const __m128i sum = _mm_add_epi16(avg1, A1);
  const __m128i avg2 = _mm_srli_epi16(sum, 1);
  const __m128i A2 = _mm_packus_epi16(avg2, avg2);
  return (uint32_t)_mm_cvtsi128_si32(A2);
}

static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
                                          uint32_t a2, uint32_t a3) {
  const __m128i avg1 = Average2_uint32_16_SSE2(a0, a1);
  const __m128i avg2 = Average2_uint32_16_SSE2(a2, a3);
  const __m128i sum = _mm_add_epi16(avg2, avg1);
  const __m128i avg3 = _mm_srli_epi16(sum, 1);
  const __m128i A0 = _mm_packus_epi16(avg3, avg3);
  return (uint32_t)_mm_cvtsi128_si32(A0);
}

static uint32_t Predictor5_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  const uint32_t pred = Average3_SSE2(*left, top[0], top[1]);
  return pred;
}
static uint32_t Predictor6_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  const uint32_t pred = Average2_SSE2(*left, top[-1]);
  return pred;
}
static uint32_t Predictor7_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  const uint32_t pred = Average2_SSE2(*left, top[0]);
  return pred;
}
static uint32_t Predictor8_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  const uint32_t pred = Average2_SSE2(top[-1], top[0]);
  (void)left;
  return pred;
}
static uint32_t Predictor9_SSE2(const uint32_t* const left,
                                const uint32_t* const top) {
  const uint32_t pred = Average2_SSE2(top[0], top[1]);
  (void)left;
  return pred;
}
static uint32_t Predictor10_SSE2(const uint32_t* const left,
                                 const uint32_t* const top) {
  const uint32_t pred = Average4_SSE2(*left, top[-1], top[0], top[1]);
  return pred;
}
static uint32_t Predictor11_SSE2(const uint32_t* const left,
                                 const uint32_t* const top) {
  const uint32_t pred = Select_SSE2(top[0], *left, top[-1]);
  return pred;
}
static uint32_t Predictor12_SSE2(const uint32_t* const left,
                                 const uint32_t* const top) {
  const uint32_t pred = ClampedAddSubtractFull_SSE2(*left, top[0], top[-1]);
  return pred;
}
static uint32_t Predictor13_SSE2(const uint32_t* const left,
                                 const uint32_t* const top) {
  const uint32_t pred = ClampedAddSubtractHalf_SSE2(*left, top[0], top[-1]);
  return pred;
}

// Batch versions of those functions.

// Predictor0: ARGB_BLACK.
static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  const __m128i black = _mm_set1_epi32((int)ARGB_BLACK);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    const __m128i res = _mm_add_epi8(src, black);
    _mm_storeu_si128((__m128i*)&out[i], res);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[0](in + i, NULL, num_pixels - i, out + i);
  }
  (void)upper;
}

// Predictor1: left.
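// Within a block of 4 pixels, each output depends on the previous output, so
// the per-byte sums are propagated with two shift+add steps (a log2(4)-step
// prefix sum); the last pixel of the previous block, broadcast to all four
// lanes, is then added.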
static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  __m128i prev = _mm_set1_epi32((int)out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    // a | b | c | d
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    // 0 | a | b | c
    const __m128i shift0 = _mm_slli_si128(src, 4);
    // a | a + b | b + c | c + d
    const __m128i sum0 = _mm_add_epi8(src, shift0);
    // 0 | 0 | a | a + b
    const __m128i shift1 = _mm_slli_si128(sum0, 8);
    // a | a + b | a + b + c | a + b + c + d
    const __m128i sum1 = _mm_add_epi8(sum0, shift1);
    const __m128i res = _mm_add_epi8(sum1, prev);
    _mm_storeu_si128((__m128i*)&out[i], res);
    // replicate prev output on the four lanes
    prev = _mm_shuffle_epi32(res, (3 << 0) | (3 << 2) | (3 << 4) | (3 << 6));
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);
  }
}

// Macro that adds 32-bit integers from IN using mod 256 arithmetic
// per 8-bit channel.
#define GENERATE_PREDICTOR_1(X, IN)                                             \
  static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
                                     int num_pixels,                            \
                                     uint32_t* WEBP_RESTRICT out) {             \
    int i;                                                                       \
    for (i = 0; i + 4 <= num_pixels; i += 4) {                                   \
      const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);               \
      const __m128i other = _mm_loadu_si128((const __m128i*)&(IN));              \
      const __m128i res = _mm_add_epi8(src, other);                              \
      _mm_storeu_si128((__m128i*)&out[i], res);                                  \
    }                                                                            \
    if (i != num_pixels) {                                                       \
      VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);      \
    }                                                                            \
  }

// Predictor2: Top.
GENERATE_PREDICTOR_1(2, upper[i])
// Predictor3: Top-right.
GENERATE_PREDICTOR_1(3, upper[i + 1])
// Predictor4: Top-left.
GENERATE_PREDICTOR_1(4, upper[i - 1])
#undef GENERATE_PREDICTOR_1

// Due to averages with integers, values cannot be accumulated in parallel for
// predictors 5 to 7.
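// GENERATE_PREDICTOR_ADD (from lossless_common.h) expands to a plain scalar
// loop: each output pixel is predicted from the previous output pixel and the
// upper row, and the residual is added per channel.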
GENERATE_PREDICTOR_ADD(Predictor5_SSE2, PredictorAdd5_SSE2)
GENERATE_PREDICTOR_ADD(Predictor6_SSE2, PredictorAdd6_SSE2)
GENERATE_PREDICTOR_ADD(Predictor7_SSE2, PredictorAdd7_SSE2)

#define GENERATE_PREDICTOR_2(X, IN)                                             \
  static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
                                     int num_pixels,                            \
                                     uint32_t* WEBP_RESTRICT out) {             \
    int i;                                                                       \
    for (i = 0; i + 4 <= num_pixels; i += 4) {                                   \
      const __m128i Tother = _mm_loadu_si128((const __m128i*)&(IN));             \
      const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);              \
      const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);               \
      __m128i avg, res;                                                          \
      Average2_m128i(&T, &Tother, &avg);                                         \
      res = _mm_add_epi8(avg, src);                                              \
      _mm_storeu_si128((__m128i*)&out[i], res);                                  \
    }                                                                            \
    if (i != num_pixels) {                                                       \
      VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);      \
    }                                                                            \
  }
// Predictor8: average TL T.
GENERATE_PREDICTOR_2(8, upper[i - 1])
// Predictor9: average T TR.
GENERATE_PREDICTOR_2(9, upper[i + 1])
#undef GENERATE_PREDICTOR_2

// Predictor10: average of (average of (L,TL), average of (T, TR)).
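// The left operand L is the pixel just produced, so the four lanes cannot be
// computed at once: the T/TR average is precomputed for all four pixels, then
// DO_PRED10/DO_PRED10_SHIFT walk the lanes one by one, each time averaging
// with the freshly decoded left pixel.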
#define DO_PRED10(OUT) do {                          \
  __m128i avgLTL, avg;                               \
  Average2_m128i(&L, &TL, &avgLTL);                  \
  Average2_m128i(&avgTTR, &avgLTL, &avg);            \
  L = _mm_add_epi8(avg, src);                        \
  out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L);   \
} while (0)

#define DO_PRED10_SHIFT do {                                   \
  /* Rotate the pre-computed values for the next iteration.*/  \
  avgTTR = _mm_srli_si128(avgTTR, 4);                          \
  TL = _mm_srli_si128(TL, 4);                                  \
  src = _mm_srli_si128(src, 4);                                \
} while (0)

static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  __m128i L = _mm_cvtsi32_si128((int)out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
    __m128i avgTTR;
    Average2_m128i(&T, &TR, &avgTTR);
    DO_PRED10(0);
    DO_PRED10_SHIFT;
    DO_PRED10(1);
    DO_PRED10_SHIFT;
    DO_PRED10(2);
    DO_PRED10_SHIFT;
    DO_PRED10(3);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED10
#undef DO_PRED10_SHIFT

// Predictor11: select.
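// Predictor11 compares pa = sum |T - TL| against pb = sum |L - TL| and picks
// T or L accordingly. pa is precomputed for all four pixels with _mm_sad_epu8,
// but pb depends on the previous output pixel, so DO_PRED11 resolves the
// selection one lane at a time.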
#define DO_PRED11(OUT) do {                                               \
  const __m128i L_lo = _mm_unpacklo_epi32(L, T);                          \
  const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);                        \
  const __m128i pb = _mm_sad_epu8(L_lo, TL_lo);  /* pb = sum |L-TL|*/     \
  const __m128i mask = _mm_cmpgt_epi32(pb, pa);                           \
  const __m128i A = _mm_and_si128(mask, L);                               \
  const __m128i B = _mm_andnot_si128(mask, T);                            \
  const __m128i pred = _mm_or_si128(A, B);  /* pred = (pb > pa)? L : T*/  \
  L = _mm_add_epi8(src, pred);                                            \
  out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L);                        \
} while (0)

#define DO_PRED11_SHIFT do {                                  \
  /* Shift the pre-computed value for the next iteration.*/   \
  T = _mm_srli_si128(T, 4);                                   \
  TL = _mm_srli_si128(TL, 4);                                 \
  src = _mm_srli_si128(src, 4);                               \
  pa = _mm_srli_si128(pa, 4);                                 \
} while (0)

static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  __m128i pa;
  __m128i L = _mm_cvtsi32_si128((int)out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    {
      // We can unpack with any value on the upper 32 bits, provided it's the
      // same on both operands (so that their sum of abs diff is zero). Here we
      // use T.
      const __m128i T_lo = _mm_unpacklo_epi32(T, T);
      const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);
      const __m128i T_hi = _mm_unpackhi_epi32(T, T);
      const __m128i TL_hi = _mm_unpackhi_epi32(TL, T);
      const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo);
      const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi);
      pa = _mm_packs_epi32(s_lo, s_hi);  // pa = sum |T-TL|
    }
    DO_PRED11(0);
    DO_PRED11_SHIFT;
    DO_PRED11(1);
    DO_PRED11_SHIFT;
    DO_PRED11(2);
    DO_PRED11_SHIFT;
    DO_PRED11(3);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED11
#undef DO_PRED11_SHIFT

// Predictor12: ClampedAddSubtractFull.
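// T - TL is precomputed as signed 16-bit differences (low and high pixel
// pairs); each lane then adds the running 16-bit left pixel L, packs with
// unsigned saturation to get the clamped prediction, and finally adds the
// residual.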
#define DO_PRED12(DIFF, LANE, OUT) do {               \
  const __m128i all = _mm_add_epi16(L, (DIFF));       \
  const __m128i alls = _mm_packus_epi16(all, all);    \
  const __m128i res = _mm_add_epi8(src, alls);        \
  out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(res);  \
  L = _mm_unpacklo_epi8(res, zero);                   \
} while (0)

#define DO_PRED12_SHIFT(DIFF, LANE) do {                     \
  /* Shift the pre-computed value for the next iteration.*/  \
  if ((LANE) == 0) (DIFF) = _mm_srli_si128((DIFF), 8);       \
  src = _mm_srli_si128(src, 4);                              \
} while (0)

static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  const __m128i L8 = _mm_cvtsi32_si128((int)out[-1]);
  __m128i L = _mm_unpacklo_epi8(L8, zero);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    // Load 4 pixels at a time.
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
    const __m128i T_hi = _mm_unpackhi_epi8(T, zero);
    const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
    const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);
    __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
    __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
    DO_PRED12(diff_lo, 0, 0);
    DO_PRED12_SHIFT(diff_lo, 0);
    DO_PRED12(diff_lo, 1, 1);
    DO_PRED12_SHIFT(diff_lo, 1);
    DO_PRED12(diff_hi, 0, 2);
    DO_PRED12_SHIFT(diff_hi, 0);
    DO_PRED12(diff_hi, 1, 3);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED12
#undef DO_PRED12_SHIFT

// Due to averages with integers, values cannot be accumulated in parallel for
// predictor 13.
GENERATE_PREDICTOR_ADD(Predictor13_SSE2, PredictorAdd13_SSE2)

//------------------------------------------------------------------------------
// Subtract-Green Transform

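// Adding green back: the green value is copied into the red and blue byte
// positions and added mod 256, undoing the subtract-green transform.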
static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels,
                                      uint32_t* dst) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]);  // argb
    const __m128i A = _mm_srli_epi16(in, 8);  // 0 a 0 g
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // 0g0g
    const __m128i out = _mm_add_epi8(in, C);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // fallthrough and finish off with plain-C
  if (i != num_pixels) {
    VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i);
  }
}

//------------------------------------------------------------------------------
// Color Transform

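// Inverse color transform (see the lossless bitstream spec): per pixel,
//   red  += ColorTransformDelta(green_to_red_, green)
//   blue += ColorTransformDelta(green_to_blue_, green)
//           + ColorTransformDelta(red_to_blue_, new red)
// all mod 256, with ColorTransformDelta(t, c) = ((int8_t)t * (int8_t)c) >> 5.
// The multipliers are sign-extended and pre-shifted so that _mm_mulhi_epi16
// directly produces each delta.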
static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
                                       const uint32_t* const src,
                                       int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X) (((int16_t)(m->X << 8)) >> 5)  // sign-extend
#define MK_CST_16(HI, LO) \
  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
  const __m128i mults_rb = MK_CST_16(CST(green_to_red_), CST(green_to_blue_));
  const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0);
#undef MK_CST_16
#undef CST
  const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00);  // alpha-green masks
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]);  // argb
    const __m128i A = _mm_and_si128(in, mask_ag);     // a 0 g 0
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
    const __m128i D = _mm_mulhi_epi16(C, mults_rb);   // x dr x db1
    const __m128i E = _mm_add_epi8(in, D);            // x r' x b'
    const __m128i F = _mm_slli_epi16(E, 8);           // r' 0 b' 0
    const __m128i G = _mm_mulhi_epi16(F, mults_b2);   // x db2 0 0
    const __m128i H = _mm_srli_epi32(G, 8);           // 0 x db2 0
    const __m128i I = _mm_add_epi8(H, F);             // r' x b'' 0
    const __m128i J = _mm_srli_epi16(I, 8);           // 0 r' 0 b''
    const __m128i out = _mm_or_si128(J, A);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // Fall-back to C-version for left-overs.
  if (i != num_pixels) {
    VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
  }
}

//------------------------------------------------------------------------------
// Color-space conversion functions

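// Converts 32 pixels per iteration: the BGRA words are first split into
// per-channel planes (VP8L32bToPlanar_SSE2), then the R, G and B planes are
// repacked into 24-bit triplets (VP8PlanarTo24b_SSE2).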
static void ConvertBGRAToRGB_SSE2(const uint32_t* WEBP_RESTRICT src,
                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;

  while (num_pixels >= 32) {
    // Load the BGRA buffers.
    __m128i in0 = _mm_loadu_si128(in + 0);
    __m128i in1 = _mm_loadu_si128(in + 1);
    __m128i in2 = _mm_loadu_si128(in + 2);
    __m128i in3 = _mm_loadu_si128(in + 3);
    __m128i in4 = _mm_loadu_si128(in + 4);
    __m128i in5 = _mm_loadu_si128(in + 5);
    __m128i in6 = _mm_loadu_si128(in + 6);
    __m128i in7 = _mm_loadu_si128(in + 7);
    VP8L32bToPlanar_SSE2(&in0, &in1, &in2, &in3);
    VP8L32bToPlanar_SSE2(&in4, &in5, &in6, &in7);
    // At this point, in1/in5 contain red only, in2/in6 green only ...
    // Pack the colors in 24b RGB.
    VP8PlanarTo24b_SSE2(&in1, &in5, &in2, &in6, &in3, &in7);
    _mm_storeu_si128(out + 0, in1);
    _mm_storeu_si128(out + 1, in5);
    _mm_storeu_si128(out + 2, in2);
    _mm_storeu_si128(out + 3, in6);
    _mm_storeu_si128(out + 4, in3);
    _mm_storeu_si128(out + 5, in7);
    in += 8;
    out += 6;
    num_pixels -= 32;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

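// RGBA output only needs the R and B bytes swapped within each pixel; G and A
// stay in place. The swap is done by masking out R/B, exchanging them with
// 16-bit shuffles, and OR-ing the G/A bytes back in.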
static void ConvertBGRAToRGBA_SSE2(const uint32_t* WEBP_RESTRICT src,
                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ff);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i A1 = _mm_loadu_si128(in++);
    const __m128i A2 = _mm_loadu_si128(in++);
    const __m128i B1 = _mm_and_si128(A1, red_blue_mask);     // R 0 B 0
    const __m128i B2 = _mm_and_si128(A2, red_blue_mask);     // R 0 B 0
    const __m128i C1 = _mm_andnot_si128(red_blue_mask, A1);  // 0 G 0 A
    const __m128i C2 = _mm_andnot_si128(red_blue_mask, A2);  // 0 G 0 A
    const __m128i D1 = _mm_shufflelo_epi16(B1, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i D2 = _mm_shufflelo_epi16(B2, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i E1 = _mm_shufflehi_epi16(D1, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i E2 = _mm_shufflehi_epi16(D2, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i F1 = _mm_or_si128(E1, C1);
    const __m128i F2 = _mm_or_si128(E2, C2);
    _mm_storeu_si128(out++, F1);
    _mm_storeu_si128(out++, F2);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

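// The unpack cascade below transposes 8 BGRA pixels into planar halves
// (b0..b7|g0..g7 and r0..r7|a0..a7); the top nibble of each channel is then
// masked/shifted into place to build the 4444 output.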
static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* WEBP_RESTRICT src,
                                       int num_pixels,
                                       uint8_t* WEBP_RESTRICT dst) {
  const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
  const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);  // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);  // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);      // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);      // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);      // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);      // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);     // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);     // r0...r7 | b0...b7
    const __m128i ga1 = _mm_srli_epi16(ga0, 4);           // g0-|g1-|...|a6-|a7-
    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf0);    // -r0|-r1|...|-b6|-b7
    const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f);    // g0-|g1-|...|a6-|a7-
    const __m128i rgba0 = _mm_or_si128(ga2, rb1);         // rg0..rg7 | ba0..ba7
    const __m128i rgba1 = _mm_srli_si128(rgba0, 8);       // ba0..ba7 | 0
#if (WEBP_SWAP_16BIT_CSP == 1)
    const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0);  // barg0...barg7
#else
    const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1);  // rgba0...rgba7
#endif
    _mm_storeu_si128(out++, rgba);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

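// RGB565 packing: after the same planar transpose as above, each pixel becomes
// two bytes, R[7:3]|G[7:5] followed by G[4:2]|B[7:3].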
static void ConvertBGRAToRGB565_SSE2(const uint32_t* WEBP_RESTRICT src,
                                     int num_pixels,
                                     uint8_t* WEBP_RESTRICT dst) {
  const __m128i mask_0xe0 = _mm_set1_epi8((char)0xe0);
  const __m128i mask_0xf8 = _mm_set1_epi8((char)0xf8);
  const __m128i mask_0x07 = _mm_set1_epi8(0x07);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);  // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);  // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);      // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);      // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);      // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);      // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);     // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);     // r0...r7 | b0...b7
    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf8);    // -r0..-r7|-b0..-b7
    const __m128i g_lo1 = _mm_srli_epi16(ga0, 5);
    const __m128i g_lo2 = _mm_and_si128(g_lo1, mask_0x07);  // g0-...g7-|xx (3b)
    const __m128i g_hi1 = _mm_slli_epi16(ga0, 3);
    const __m128i g_hi2 = _mm_and_si128(g_hi1, mask_0xe0);  // -g0...-g7|xx (3b)
    const __m128i b0 = _mm_srli_si128(rb1, 8);               // -b0...-b7|0
    const __m128i rg1 = _mm_or_si128(rb1, g_lo2);            // gr0...gr7|xx
    const __m128i b1 = _mm_srli_epi16(b0, 3);
    const __m128i gb1 = _mm_or_si128(b1, g_hi2);             // bg0...bg7|xx
#if (WEBP_SWAP_16BIT_CSP == 1)
    const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1);        // rggb0...rggb7
#else
    const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1);        // bgrb0...bgrb7
#endif
    _mm_storeu_si128(out++, rgba);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

static void ConvertBGRAToBGR_SSE2(const uint32_t* WEBP_RESTRICT src,
                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
  const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
  const __m128i* in = (const __m128i*)src;
  const uint8_t* const end = dst + num_pixels * 3;
  // the last storel_epi64 below writes 8 bytes starting at offset 18
  while (dst + 26 <= end) {
    const __m128i bgra0 = _mm_loadu_si128(in++);  // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);  // bgra4|bgra5|bgra6|bgra7
    const __m128i a0l = _mm_and_si128(bgra0, mask_l);  // bgr0|0|bgr0|0
    const __m128i a4l = _mm_and_si128(bgra4, mask_l);  // bgr0|0|bgr0|0
    const __m128i a0h = _mm_and_si128(bgra0, mask_h);  // 0|bgr0|0|bgr0
    const __m128i a4h = _mm_and_si128(bgra4, mask_h);  // 0|bgr0|0|bgr0
    const __m128i b0h = _mm_srli_epi64(a0h, 8);        // 000b|gr00|000b|gr00
    const __m128i b4h = _mm_srli_epi64(a4h, 8);        // 000b|gr00|000b|gr00
    const __m128i c0 = _mm_or_si128(a0l, b0h);         // rgbrgb00|rgbrgb00
    const __m128i c4 = _mm_or_si128(a4l, b4h);         // rgbrgb00|rgbrgb00
    const __m128i c2 = _mm_srli_si128(c0, 8);
    const __m128i c6 = _mm_srli_si128(c4, 8);
    _mm_storel_epi64((__m128i*)(dst + 0), c0);
    _mm_storel_epi64((__m128i*)(dst + 6), c2);
    _mm_storel_epi64((__m128i*)(dst + 12), c4);
    _mm_storel_epi64((__m128i*)(dst + 18), c6);
    dst += 24;
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst);
  }
}

//------------------------------------------------------------------------------
// Entry point

extern void VP8LDspInitSSE2(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {
  VP8LPredictors[5] = Predictor5_SSE2;
  VP8LPredictors[6] = Predictor6_SSE2;
  VP8LPredictors[7] = Predictor7_SSE2;
  VP8LPredictors[8] = Predictor8_SSE2;
  VP8LPredictors[9] = Predictor9_SSE2;
  VP8LPredictors[10] = Predictor10_SSE2;
  VP8LPredictors[11] = Predictor11_SSE2;
  VP8LPredictors[12] = Predictor12_SSE2;
  VP8LPredictors[13] = Predictor13_SSE2;

  VP8LPredictorsAdd[0] = PredictorAdd0_SSE2;
  VP8LPredictorsAdd[1] = PredictorAdd1_SSE2;
  VP8LPredictorsAdd[2] = PredictorAdd2_SSE2;
  VP8LPredictorsAdd[3] = PredictorAdd3_SSE2;
  VP8LPredictorsAdd[4] = PredictorAdd4_SSE2;
  VP8LPredictorsAdd[5] = PredictorAdd5_SSE2;
  VP8LPredictorsAdd[6] = PredictorAdd6_SSE2;
  VP8LPredictorsAdd[7] = PredictorAdd7_SSE2;
  VP8LPredictorsAdd[8] = PredictorAdd8_SSE2;
  VP8LPredictorsAdd[9] = PredictorAdd9_SSE2;
  VP8LPredictorsAdd[10] = PredictorAdd10_SSE2;
  VP8LPredictorsAdd[11] = PredictorAdd11_SSE2;
  VP8LPredictorsAdd[12] = PredictorAdd12_SSE2;
  VP8LPredictorsAdd[13] = PredictorAdd13_SSE2;

  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_SSE2;
  VP8LTransformColorInverse = TransformColorInverse_SSE2;

  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE2;
  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_SSE2;
  VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_SSE2;
  VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_SSE2;
  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE2;
}

#else  // !WEBP_USE_SSE2

WEBP_DSP_INIT_STUB(VP8LDspInitSSE2)

#endif  // WEBP_USE_SSE2