1 /*
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "../util/ssim.h" // NOLINT
12
13 #include <string.h>
14
15 #ifdef __cplusplus
16 extern "C" {
17 #endif
18
19 typedef unsigned int uint32; // NOLINT
20 typedef unsigned short uint16; // NOLINT
21
22 #if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \
23 (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
24 #define __SSE2__
25 #endif
26 #if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
27 #include <emmintrin.h>
28 #endif
29
30 #ifdef _OPENMP
31 #include <omp.h>
32 #endif
33
// SSIM
// Kernel radius and resulting window size: a 7x7 tap window (KERNEL = 3).
enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 };

// Symmetric Gaussian kernel: K[i] = ~11 * exp(-0.3 * i * i)
// The maximum value (11 x 11) must be less than 128 to avoid sign
// problems during the calls to _mm_mullo_epi16().
static const int K[KERNEL_SIZE] = {
  1, 3, 7, 11, 7, 3, 1  // ~11 * exp(-0.3 * i * i)
};
// Inverse normalization constants for the full-kernel path. kiW[0] is used
// for a fully interior window; kiW[k] (k >= 1) is used near the right border
// when the rightmost k kernel columns fall into zero padding and must not
// count toward the total weight (see CalcSSIM()'s scratch-buffer loop).
static const double kiW[KERNEL + 1 + 1] = {
  1. / 1089.,  // 1 / sum(i:0..6, j..6) K[i]*K[j]
  1. / 1089.,  // 1 / sum(i:0..6, j..6) K[i]*K[j]
  1. / 1056.,  // 1 / sum(i:0..5, j..6) K[i]*K[j]
  1. / 957.,   // 1 / sum(i:0..4, j..6) K[i]*K[j]
  1. / 726.,   // 1 / sum(i:0..3, j..6) K[i]*K[j]
};
50
#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)

#define PWEIGHT(A, B) static_cast<uint16>(K[(A)] * K[(B)])  // weight product
#define MAKE_WEIGHT(L) \
  { { { PWEIGHT(L, 0), PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), \
        PWEIGHT(L, 4), PWEIGHT(L, 5), PWEIGHT(L, 6), 0 } } }

// We need this union trick to be able to initialize constant static __m128i
// values. We can't call _mm_set_epi16() for static compile-time initialization.
// W<L> holds the 7 row-L weight products K[L]*K[0..6] as uint16 lanes; the
// 8th lane is zero so the extra pixel of each 8-pixel load contributes nothing.
static const struct {
  union {
    uint16 i16_[8];
    __m128i m_;
  } values_;
} W0 = MAKE_WEIGHT(0),
  W1 = MAKE_WEIGHT(1),
  W2 = MAKE_WEIGHT(2),
  W3 = MAKE_WEIGHT(3);
// ... the rest is symmetric.
#undef MAKE_WEIGHT
#undef PWEIGHT
#endif
73
74 // Common final expression for SSIM, once the weighted sums are known.
// Combine the weighted sums of one window into the final SSIM score.
// iw is the inverse of the total kernel weight; xm/ym are the weighted pixel
// sums of the two images, and xxm/xym/yym the weighted sums of products.
static double FinalizeSSIM(double iw, double xm, double ym,
                           double xxm, double xym, double yym) {
  const double mean_x = xm * iw;
  const double mean_y = ym * iw;
  double var_x = xxm * iw - mean_x * mean_x;
  double var_y = yym * iw - mean_y * mean_y;
  // Rounding can push the variances slightly negative; clamp them to zero.
  if (var_x < 0.) var_x = 0.;
  if (var_y < 0.) var_y = 0.;
  const double cov_xy = xym * iw - mean_x * mean_y;
  const double sxsy = sqrt(var_x * var_y);
  // Stabilizing constants from the SSIM formula, scaled by the 8-bit dynamic
  // range (255).
  static const double C11 = (0.01 * 0.01) * (255 * 255);
  static const double C22 = (0.03 * 0.03) * (255 * 255);
  static const double C33 = (0.015 * 0.015) * (255 * 255);
  const double luminance =
      (2. * mean_x * mean_y + C11) / (mean_x * mean_x + mean_y * mean_y + C11);
  const double contrast = (2. * sxsy + C22) / (var_x + var_y + C22);
  const double structure = (cov_xy + C33) / (sxsy + C33);
  return luminance * contrast * structure;
}
94
95 // GetSSIM() does clipping. GetSSIMFullKernel() does not
96
97 // TODO(skal): use summed tables?
98 // Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1)
99 // with a diff of 255, squared. The maximum error is thus 0x4388241,
100 // which fits into 32 bits integers.
GetSSIM(const uint8 * org,const uint8 * rec,int xo,int yo,int W,int H,int stride)101 double GetSSIM(const uint8 *org, const uint8 *rec,
102 int xo, int yo, int W, int H, int stride) {
103 uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
104 org += (yo - KERNEL) * stride;
105 org += (xo - KERNEL);
106 rec += (yo - KERNEL) * stride;
107 rec += (xo - KERNEL);
108 for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) {
109 if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) continue;
110 const int Wy = K[y_];
111 for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) {
112 const int Wxy = Wy * K[x_];
113 if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) {
114 const int org_x = org[x_];
115 const int rec_x = rec[x_];
116 ws += Wxy;
117 xm += Wxy * org_x;
118 ym += Wxy * rec_x;
119 xxm += Wxy * org_x * org_x;
120 xym += Wxy * org_x * rec_x;
121 yym += Wxy * rec_x * rec_x;
122 }
123 }
124 }
125 return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);
126 }
127
// Compute SSIM for the 7x7 window centered at (xo, yo). Unlike GetSSIM(),
// this performs no boundary clipping: the caller must guarantee the whole
// kernel is readable in both images (CalcSSIM() zero-pads a scratch buffer
// for the right border). area_weight is the precomputed inverse of the sum
// of the kernel weights that lie inside the image (see kiW[] above).
double GetSSIMFullKernel(const uint8 *org, const uint8 *rec,
                         int xo, int yo, int stride,
                         double area_weight) {
  uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;

#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__)

  // Plain C path. The kernel is symmetric, so each weight is applied to the
  // four mirrored off-axis points at once, then to the four axis points, and
  // finally to the center pixel.
  org += yo * stride + xo;
  rec += yo * stride + xo;
  for (int y = 1; y <= KERNEL; y++) {
    const int dy1 = y * stride;
    // NOTE(review): dy2 is identical to dy1 — both images share one stride.
    const int dy2 = y * stride;
    const int Wy = K[KERNEL + y];

    for (int x = 1; x <= KERNEL; x++) {
      // Compute the contributions of upper-left (ul), upper-right (ur)
      // lower-left (ll) and lower-right (lr) points (see the diagram below).
      // Symmetric Kernel will have same weight on those points.
      // - - - - - - -
      // - ul - - - ur -
      // - - - - - - -
      // - - - 0 - - -
      // - - - - - - -
      // - ll - - - lr -
      // - - - - - - -
      const int Wxy = Wy * K[KERNEL + x];
      const int ul1 = org[-dy1 - x];
      const int ur1 = org[-dy1 + x];
      const int ll1 = org[dy1 - x];
      const int lr1 = org[dy1 + x];

      const int ul2 = rec[-dy2 - x];
      const int ur2 = rec[-dy2 + x];
      const int ll2 = rec[dy2 - x];
      const int lr2 = rec[dy2 + x];

      xm += Wxy * (ul1 + ur1 + ll1 + lr1);
      ym += Wxy * (ul2 + ur2 + ll2 + lr2);
      xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1);
      xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2);
      yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2);
    }

    // Compute the contributions of up (u), down (d), left (l) and right (r)
    // points across the main axes (see the diagram below).
    // Symmetric Kernel will have same weight on those points.
    // - - - - - - -
    // - - - u - - -
    // - - - - - - -
    // - l - 0 - r -
    // - - - - - - -
    // - - - d - - -
    // - - - - - - -
    const int Wxy = Wy * K[KERNEL];
    const int u1 = org[-dy1];
    const int d1 = org[dy1];
    const int l1 = org[-y];
    const int r1 = org[y];

    const int u2 = rec[-dy2];
    const int d2 = rec[dy2];
    const int l2 = rec[-y];
    const int r2 = rec[y];

    xm += Wxy * (u1 + d1 + l1 + r1);
    ym += Wxy * (u2 + d2 + l2 + r2);
    xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1);
    xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2);
    yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2);
  }

  // Lastly the contribution of (x0, y0) point.
  const int Wxy = K[KERNEL] * K[KERNEL];
  const int s1 = org[0];
  const int s2 = rec[0];

  xm += Wxy * s1;
  ym += Wxy * s2;
  xxm += Wxy * s1 * s1;
  xym += Wxy * s1 * s2;
  yym += Wxy * s2 * s2;

#else  // __SSE2__

  // SSE2 path: process each of the 7 kernel rows as one 8-pixel vector.
  // The 8th weight lane is zero (see MAKE_WEIGHT), so the extra pixel read
  // does not contribute to the sums.
  org += (yo - KERNEL) * stride + (xo - KERNEL);
  rec += (yo - KERNEL) * stride + (xo - KERNEL);

  const __m128i zero = _mm_setzero_si128();
  __m128i x = zero;
  __m128i y = zero;
  __m128i xx = zero;
  __m128i xy = zero;
  __m128i yy = zero;

// Read 8 pixels at line #L, and convert to 16bit, perform weighting
// and acccumulate.
#define LOAD_LINE_PAIR(L, WEIGHT) do { \
    const __m128i v0 = \
        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L) * stride)); \
    const __m128i v1 = \
        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L) * stride)); \
    const __m128i w0 = _mm_unpacklo_epi8(v0, zero); \
    const __m128i w1 = _mm_unpacklo_epi8(v1, zero); \
    const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_); \
    const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_); \
    x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero)); \
    y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero)); \
    x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero)); \
    y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero)); \
    xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0)); \
    xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1)); \
    yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1)); \
} while (0)

// Horizontally add the four 32-bit lanes of vector M into scalar OUT.
#define ADD_AND_STORE_FOUR_EPI32(M, OUT) do { \
    uint32 tmp[4]; \
    _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \
    (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0]; \
} while (0)

  // The kernel is symmetric around row 3, hence the W0..W3..W0 sequence.
  LOAD_LINE_PAIR(0, W0);
  LOAD_LINE_PAIR(1, W1);
  LOAD_LINE_PAIR(2, W2);
  LOAD_LINE_PAIR(3, W3);
  LOAD_LINE_PAIR(4, W2);
  LOAD_LINE_PAIR(5, W1);
  LOAD_LINE_PAIR(6, W0);

  ADD_AND_STORE_FOUR_EPI32(x, xm);
  ADD_AND_STORE_FOUR_EPI32(y, ym);
  ADD_AND_STORE_FOUR_EPI32(xx, xxm);
  ADD_AND_STORE_FOUR_EPI32(xy, xym);
  ADD_AND_STORE_FOUR_EPI32(yy, yym);

#undef LOAD_LINE_PAIR
#undef ADD_AND_STORE_FOUR_EPI32
#endif

  return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym);
}
268
// Return the larger of the two values (used to clamp region start indices).
static int start_max(int x, int y) {
  if (x > y) {
    return x;
  }
  return y;
}
270
// Sum the per-pixel SSIM over the whole image pair. The image is split into
// regions: the top/bottom KERNEL rows and the left border use the clipping
// GetSSIM(); the interior uses the faster GetSSIMFullKernel(); the right
// border is handled via a zero-padded scratch copy so the full-kernel path
// (which reads 8 pixels per row in SSE2) can still be used there.
double CalcSSIM(const uint8 *org, const uint8 *rec,
                const int image_width, const int image_height) {
  double SSIM = 0.;
  // Clamp the kernel radius for degenerate (smaller-than-kernel) images.
  const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL;
  const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL;
  // start_x: first column whose 8-pixel-wide full-kernel read would run past
  // the right edge (handled by the scratch buffer below). start_y: first
  // bottom-border row.
  const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X);
  const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y);
  const int stride = image_width;

  // Top border rows: the kernel is clipped vertically.
  for (int j = 0; j < KERNEL_Y; ++j) {
    for (int i = 0; i < image_width; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
  }

#ifdef _OPENMP
#pragma omp parallel for reduction(+: SSIM)
#endif
  for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) {
    // Left border columns: the kernel is clipped horizontally.
    for (int i = 0; i < KERNEL_X; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
    // Interior: full kernel with the uniform normalizer kiW[0].
    for (int i = KERNEL_X; i < start_x; ++i) {
      SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]);
    }
    if (start_x < image_width) {
      // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we
      // copy the 8 rightmost pixels on a cache area, and pad this area with
      // zeros which won't contribute to the overall SSIM value (but we need
      // to pass the correct normalizing constant!). By using this cache, we can
      // still call GetSSIMFullKernel() instead of the slower GetSSIM().
      // NOTE: we could use similar method for the left-most pixels too.
      const int kScratchWidth = 8;
      const int kScratchStride = kScratchWidth + KERNEL + 1;
      uint8 scratch_org[KERNEL_SIZE * kScratchStride] = { 0 };
      uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = { 0 };

      // Copy the KERNEL_SIZE rows of the 8 rightmost pixels into the
      // zero-initialized scratch buffers.
      for (int k = 0; k < KERNEL_SIZE; ++k) {
        const int offset =
            (j - KERNEL + k) * stride + image_width - kScratchWidth;
        memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth);
        memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth);
      }
      // kiW[k] compensates for the k kernel columns that fall in the padding.
      for (int k = 0; k <= KERNEL_X + 1; ++k) {
        SSIM += GetSSIMFullKernel(scratch_org, scratch_rec,
                                  KERNEL + k, KERNEL, kScratchStride, kiW[k]);
      }
    }
  }

  // Bottom border rows: the kernel is clipped vertically.
  for (int j = start_y; j < image_height; ++j) {
    for (int i = 0; i < image_width; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
  }
  return SSIM;
}
328
// Map an SSIM score onto a logarithmic (dB-like) scale. A perfect score of
// 1.0 maps to +infinity; 0.0 maps to 0.0.
double CalcLSSIM(double ssim) {
  const double dissimilarity = 1.0 - ssim;
  return -10.0 * log10(dissimilarity);
}
332
333 #ifdef __cplusplus
334 } // extern "C"
335 #endif
336
337