// Copyright 2017 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 version of distortion calculation
//
// Author: Skal (pascal.massimino@gmail.com)

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_SSE2)

#include <assert.h>
#include <emmintrin.h>

#include "src/dsp/common_sse2.h"

#if !defined(WEBP_DISABLE_STATS)

// Helper function: accumulates the squared differences of 16 byte pairs
// into four 32-bit partial sums.
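// Note on the abs computation below: _mm_subs_epu8() is a *saturating*
// unsigned subtraction, i.e. per byte it yields max(a - b, 0). At least one
// of (a - b) and (b - a) saturates to zero in every lane, so OR-ing the two
// results gives |a - b| exactly. Scalar equivalent for one byte pair:
//   abs_diff = (a > b) ? (a - b) : (b - a);
// _mm_madd_epi16(C, C) then squares each 16-bit lane and adds adjacent
// pairs, producing 32-bit partial sums of squares directly.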
static WEBP_INLINE void SubtractAndSquare_SSE2(const __m128i a, const __m128i b,
                                               __m128i* const sum) {
  // take abs(a-b) in 8b
  const __m128i a_b = _mm_subs_epu8(a, b);
  const __m128i b_a = _mm_subs_epu8(b, a);
  const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
  // zero-extend to 16b
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
  const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
  // multiply with self
  const __m128i sum1 = _mm_madd_epi16(C0, C0);
  const __m128i sum2 = _mm_madd_epi16(C1, C1);
  *sum = _mm_add_epi32(sum1, sum2);
}

//------------------------------------------------------------------------------
// SSIM / PSNR entry point

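// Note: the loop below is software-pipelined by hand: the loads of the next
// 16-byte chunk are issued before SubtractAndSquare_SSE2() is applied to the
// current one, so each iteration processes 32 bytes and load latency is
// partially hidden.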
static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
                                   const uint8_t* src2, int len) {
  int i = 0;
  uint32_t sse2 = 0;
  if (len >= 16) {
    const int limit = len - 32;   // keeps both 16-byte loads below in bounds
    int32_t tmp[4];
    __m128i sum1;
    __m128i sum = _mm_setzero_si128();
    __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
    __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
    i += 16;
    while (i <= limit) {
      const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
      const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
      __m128i sum2;
      i += 16;
      SubtractAndSquare_SSE2(a0, b0, &sum1);
      sum = _mm_add_epi32(sum, sum1);
      a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
      b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
      i += 16;
      SubtractAndSquare_SSE2(a1, b1, &sum2);
      sum = _mm_add_epi32(sum, sum2);
    }
    SubtractAndSquare_SSE2(a0, b0, &sum1);
    sum = _mm_add_epi32(sum, sum1);
    // horizontal reduction of the four 32-bit partial sums
    _mm_storeu_si128((__m128i*)tmp, sum);
    sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
  }

  // handle any leftover bytes with plain scalar code
  for (; i < len; ++i) {
    const int32_t diff = src1[i] - src2[i];
    sse2 += diff * diff;
  }
  return sse2;
}
#endif  // !defined(WEBP_DISABLE_STATS)

#if !defined(WEBP_REDUCE_SIZE)

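// The two helpers below reduce a vector of lane-wise accumulators to a single
// scalar: the 16-bit version folds the upper half onto the lower one and sums
// the low four lanes via a store to memory; the 32-bit version folds twice
// and stays entirely in registers.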
static uint32_t HorizontalAdd16b_SSE2(const __m128i* const m) {
  uint16_t tmp[8];
  const __m128i a = _mm_srli_si128(*m, 8);
  const __m128i b = _mm_add_epi16(*m, a);
  _mm_storeu_si128((__m128i*)tmp, b);
  return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
}

static uint32_t HorizontalAdd32b_SSE2(const __m128i* const m) {
  const __m128i a = _mm_srli_si128(*m, 8);
  const __m128i b = _mm_add_epi32(*m, a);
  const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));
  return (uint32_t)_mm_cvtsi128_si32(c);
}

// Triangular 7-tap filter for the SSIM window; the trailing 0 merely pads
// the array to a full 8-lane vector.
static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };

#define ACCUMULATE_ROW(WEIGHT) do {                         \
  /* compute row weight (Wx * Wy) */                        \
  const __m128i Wy = _mm_set1_epi16((WEIGHT));              \
  const __m128i W = _mm_mullo_epi16(Wx, Wy);                \
  /* load 8 bytes (only 7 carry a non-zero weight) */       \
  const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
  const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
  /* convert to 16b and multiply by weight */               \
  const __m128i a1 = _mm_unpacklo_epi8(a0, zero);           \
  const __m128i b1 = _mm_unpacklo_epi8(b0, zero);           \
  const __m128i wa1 = _mm_mullo_epi16(a1, W);               \
  const __m128i wb1 = _mm_mullo_epi16(b1, W);               \
  /* accumulate */                                          \
  xm = _mm_add_epi16(xm, wa1);                              \
  ym = _mm_add_epi16(ym, wb1);                              \
  xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1));        \
  xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1));        \
  yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1));        \
  src1 += stride1;                                          \
  src2 += stride2;                                          \
} while (0)

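// SSIMGet_SSE2() gathers the weighted moments (x, y, x*x, x*y, y*y) of a
// 7x7 window in SIMD lanes, then reduces them for VP8SSIMFromStats().
// Keeping xm/ym in 16-bit lanes is safe: the total weight per lane is at
// most Wx * sum(Wy) = 4 * 16 = 64, and 64 * 255 = 16320 < 65536.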
static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
                           const uint8_t* src2, int stride2) {
  VP8DistoStats stats;
  const __m128i zero = _mm_setzero_si128();
  __m128i xm = zero, ym = zero;                // 16b accums
  __m128i xxm = zero, yym = zero, xym = zero;  // 32b accums
  const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
  assert(2 * VP8_SSIM_KERNEL + 1 == 7);
  ACCUMULATE_ROW(1);
  ACCUMULATE_ROW(2);
  ACCUMULATE_ROW(3);
  ACCUMULATE_ROW(4);
  ACCUMULATE_ROW(3);
  ACCUMULATE_ROW(2);
  ACCUMULATE_ROW(1);
  stats.xm = HorizontalAdd16b_SSE2(&xm);
  stats.ym = HorizontalAdd16b_SSE2(&ym);
  stats.xxm = HorizontalAdd32b_SSE2(&xxm);
  stats.xym = HorizontalAdd32b_SSE2(&xym);
  stats.yym = HorizontalAdd32b_SSE2(&yym);
  return VP8SSIMFromStats(&stats);
}

#endif  // !defined(WEBP_REDUCE_SIZE)

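// Installs the SSE2 implementations into the shared function pointers.
// The generic dispatch code in dsp.h is expected to call this once when
// SSE2 support has been detected at runtime.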
extern void VP8SSIMDspInitSSE2(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
#if !defined(WEBP_DISABLE_STATS)
  VP8AccumulateSSE = AccumulateSSE_SSE2;
#endif
#if !defined(WEBP_REDUCE_SIZE)
  VP8SSIMGet = SSIMGet_SSE2;
#endif
}

#else  // !WEBP_USE_SSE2

WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)

#endif  // WEBP_USE_SSE2