/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "../util/ssim.h"  // NOLINT

#include <math.h>  // For sqrt() and log10().
#include <string.h>

#ifdef __cplusplus
extern "C" {
#endif

typedef unsigned int uint32;    // NOLINT
typedef unsigned short uint16;  // NOLINT

#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \
    (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
#define __SSE2__
#endif
#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
#include <emmintrin.h>
#endif

#ifdef _OPENMP
#include <omp.h>
#endif

// SSIM
enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 };

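// The SSIM window is KERNEL_SIZE x KERNEL_SIZE = 7x7 pixels, centered on the
// pixel under evaluation.
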
// Symmetric Gaussian kernel: K[i] = ~11 * exp(-0.3 * i * i)
// The maximum value (11 x 11 = 121) must be less than 128 to avoid sign
// problems during the calls to _mm_mullo_epi16().
static const int K[KERNEL_SIZE] = {
    1, 3, 7, 11, 7, 3, 1  // ~11 * exp(-0.3 * i * i)
};
static const double kiW[KERNEL + 1 + 1] = {
    1. / 1089.,  // 1 / sum(i:0..6, j:0..6) K[i]*K[j]
    1. / 1089.,  // 1 / sum(i:0..6, j:0..6) K[i]*K[j]
    1. / 1056.,  // 1 / sum(i:0..5, j:0..6) K[i]*K[j]
    1. / 957.,   // 1 / sum(i:0..4, j:0..6) K[i]*K[j]
    1. / 726.,   // 1 / sum(i:0..3, j:0..6) K[i]*K[j]
};
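
// Sanity check on the normalizing constants above: sum(K) = 1+3+7+11+7+3+1
// = 33, so the full 7x7 kernel weighs 33 * 33 = 1089. Dropping trailing
// columns of the window:
//   sum(i:0..5) = 32 -> 32 * 33 = 1056
//   sum(i:0..4) = 29 -> 29 * 33 = 957
//   sum(i:0..3) = 22 -> 22 * 33 = 726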

#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)

#define PWEIGHT(A, B) static_cast<uint16>(K[(A)] * K[(B)])  // weight product
#define MAKE_WEIGHT(L)                                              \
  {                                                                 \
    {                                                               \
      {                                                             \
        PWEIGHT(L, 0), PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), \
        PWEIGHT(L, 4), PWEIGHT(L, 5), PWEIGHT(L, 6), 0              \
      }                                                             \
    }                                                               \
  }

// We need this union trick to be able to initialize constant static __m128i
// values. We can't call _mm_set_epi16() for static compile-time initialization.
static const struct {
  union {
    uint16 i16_[8];
    __m128i m_;
  } values_;
} W0 = MAKE_WEIGHT(0), W1 = MAKE_WEIGHT(1), W2 = MAKE_WEIGHT(2),
  W3 = MAKE_WEIGHT(3);
// ... the rest is symmetric.
#undef MAKE_WEIGHT
#undef PWEIGHT
#endif

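// The SSIM index is the product of three comparisons, computed below from the
// kernel-weighted sums (iwx/iwy are the weighted means; sxx, syy and sxy the
// weighted variances and covariance):
//   luminance: l = (2*mu_x*mu_y + C1) / (mu_x^2 + mu_y^2 + C1)
//   contrast:  c = (2*sd_x*sd_y + C2) / (var_x + var_y + C2)
//   structure: s = (cov_xy + C3) / (sd_x*sd_y + C3)
// C1 = (0.01*255)^2 and C2 = (0.03*255)^2 follow the original SSIM paper;
// this code uses C3 = (0.015*255)^2 rather than the more common C3 = C2 / 2.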
// Common final expression for SSIM, once the weighted sums are known.
static double FinalizeSSIM(double iw,
                           double xm,
                           double ym,
                           double xxm,
                           double xym,
                           double yym) {
  const double iwx = xm * iw;
  const double iwy = ym * iw;
  double sxx = xxm * iw - iwx * iwx;
  double syy = yym * iw - iwy * iwy;
  // Small negative values are possible due to rounding; clamp to zero.
  if (sxx < 0.)
    sxx = 0.;
  if (syy < 0.)
    syy = 0.;
  const double sxsy = sqrt(sxx * syy);
  const double sxy = xym * iw - iwx * iwy;
  static const double C11 = (0.01 * 0.01) * (255 * 255);
  static const double C22 = (0.03 * 0.03) * (255 * 255);
  static const double C33 = (0.015 * 0.015) * (255 * 255);
  const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
  const double c = (2. * sxsy + C22) / (sxx + syy + C22);
  const double s = (sxy + C33) / (sxsy + C33);
  return l * c * s;
}

// GetSSIM() does boundary clipping; GetSSIMFullKernel() does not.

// TODO(skal): use summed tables?
// Note: the worst-case accumulation uses the full kernel weight
// 33 * 33 = 1089 (with 33 = 11 + 2 * (7 + 3 + 1)) times a squared value of
// 255: 1089 * 255 * 255 = 0x4388241, which fits in a 32-bit integer.
double GetSSIM(const uint8* org,
               const uint8* rec,
               int xo,
               int yo,
               int W,
               int H,
               int stride) {
  uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
  org += (yo - KERNEL) * stride;
  org += (xo - KERNEL);
  rec += (yo - KERNEL) * stride;
  rec += (xo - KERNEL);
  for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) {
    if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H))
      continue;
    const int Wy = K[y_];
    for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) {
      const int Wxy = Wy * K[x_];
      // Only accumulate taps that fall inside the image; ws tracks the
      // actual total weight so the means stay correctly normalized.
      if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) {
        const int org_x = org[x_];
        const int rec_x = rec[x_];
        ws += Wxy;
        xm += Wxy * org_x;
        ym += Wxy * rec_x;
        xxm += Wxy * org_x * org_x;
        xym += Wxy * org_x * rec_x;
        yym += Wxy * rec_x * rec_x;
      }
    }
  }
  return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);
}

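// Requires the whole 7x7 kernel window to lie inside the image (no clipping);
// in addition, the SSE2 path reads 8 pixels per row starting at
// (xo - KERNEL, yo - KERNEL).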
double GetSSIMFullKernel(const uint8* org,
                         const uint8* rec,
                         int xo,
                         int yo,
                         int stride,
                         double area_weight) {
  uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;

#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__)

  org += yo * stride + xo;
  rec += yo * stride + xo;
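
  // The kernel is symmetric, so the 7x7 window is visited as 3x3 quadruples
  // of diagonal points, 3 quadruples of axis points, and the center pixel:
  // 9 * 4 + 3 * 4 + 1 = 49 taps in total.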
  for (int y = 1; y <= KERNEL; y++) {
    const int dy1 = y * stride;
    const int dy2 = y * stride;
    const int Wy = K[KERNEL + y];

    for (int x = 1; x <= KERNEL; x++) {
      // Compute the contributions of upper-left (ul), upper-right (ur),
      // lower-left (ll) and lower-right (lr) points (see the diagram below).
      // A symmetric kernel has the same weight at those four points.
      // - - - - - - -
      // - ul - - - ur -
      // - - - - - - -
      // - - - 0 - - -
      // - - - - - - -
      // - ll - - - lr -
      // - - - - - - -
      const int Wxy = Wy * K[KERNEL + x];
      const int ul1 = org[-dy1 - x];
      const int ur1 = org[-dy1 + x];
      const int ll1 = org[dy1 - x];
      const int lr1 = org[dy1 + x];

      const int ul2 = rec[-dy2 - x];
      const int ur2 = rec[-dy2 + x];
      const int ll2 = rec[dy2 - x];
      const int lr2 = rec[dy2 + x];

      xm += Wxy * (ul1 + ur1 + ll1 + lr1);
      ym += Wxy * (ul2 + ur2 + ll2 + lr2);
      xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1);
      xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2);
      yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2);
    }

    // Compute the contributions of up (u), down (d), left (l) and right (r)
    // points across the main axes (see the diagram below).
    // A symmetric kernel has the same weight at those four points.
    // - - - - - - -
    // - - - u - - -
    // - - - - - - -
    // - l - 0 - r -
    // - - - - - - -
    // - - - d - - -
    // - - - - - - -
    const int Wxy = Wy * K[KERNEL];
    const int u1 = org[-dy1];
    const int d1 = org[dy1];
    const int l1 = org[-y];
    const int r1 = org[y];

    const int u2 = rec[-dy2];
    const int d2 = rec[dy2];
    const int l2 = rec[-y];
    const int r2 = rec[y];

    xm += Wxy * (u1 + d1 + l1 + r1);
    ym += Wxy * (u2 + d2 + l2 + r2);
    xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1);
    xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2);
    yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2);
  }

  // Lastly, add the contribution of the center point (xo, yo).
  const int Wxy = K[KERNEL] * K[KERNEL];
  const int s1 = org[0];
  const int s2 = rec[0];

  xm += Wxy * s1;
  ym += Wxy * s2;
  xxm += Wxy * s1 * s1;
  xym += Wxy * s1 * s2;
  yym += Wxy * s2 * s2;

#else  // __SSE2__

  org += (yo - KERNEL) * stride + (xo - KERNEL);
  rec += (yo - KERNEL) * stride + (xo - KERNEL);

  const __m128i zero = _mm_setzero_si128();
  __m128i x = zero;
  __m128i y = zero;
  __m128i xx = zero;
  __m128i xy = zero;
  __m128i yy = zero;

// Read 8 pixels at line #L, convert them to 16 bit, apply the line's weights
// and accumulate.
#define LOAD_LINE_PAIR(L, WEIGHT)                                            \
  do {                                                                       \
    const __m128i v0 =                                                       \
        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L)*stride)); \
    const __m128i v1 =                                                       \
        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L)*stride)); \
    const __m128i w0 = _mm_unpacklo_epi8(v0, zero);                          \
    const __m128i w1 = _mm_unpacklo_epi8(v1, zero);                          \
    const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_);            \
    const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_);            \
    x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero));                     \
    y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero));                     \
    x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero));                     \
    y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero));                     \
    xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0));                         \
    xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1));                         \
    yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1));                         \
  } while (0)
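
// Range check: the largest weight is 11 * 11 = 121 and the largest pixel
// value is 255, so ww <= 121 * 255 = 30855 < 32767. This keeps
// _mm_mullo_epi16() free of sign problems, while _mm_madd_epi16() sums the
// adjacent 16x16-bit products into 32-bit lanes without overflow.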

#define ADD_AND_STORE_FOUR_EPI32(M, OUT)                    \
  do {                                                      \
    uint32 tmp[4];                                          \
    _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \
    (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0];              \
  } while (0)

  LOAD_LINE_PAIR(0, W0);
  LOAD_LINE_PAIR(1, W1);
  LOAD_LINE_PAIR(2, W2);
  LOAD_LINE_PAIR(3, W3);
  LOAD_LINE_PAIR(4, W2);  // The weights are symmetric around line 3.
  LOAD_LINE_PAIR(5, W1);
  LOAD_LINE_PAIR(6, W0);

  ADD_AND_STORE_FOUR_EPI32(x, xm);
  ADD_AND_STORE_FOUR_EPI32(y, ym);
  ADD_AND_STORE_FOUR_EPI32(xx, xxm);
  ADD_AND_STORE_FOUR_EPI32(xy, xym);
  ADD_AND_STORE_FOUR_EPI32(yy, yym);

#undef LOAD_LINE_PAIR
#undef ADD_AND_STORE_FOUR_EPI32
#endif  // __SSE2__

  return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym);
}

static int start_max(int x, int y) {
  return (x > y) ? x : y;
}

double CalcSSIM(const uint8* org,
                const uint8* rec,
                const int image_width,
                const int image_height) {
  double SSIM = 0.;
  const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL;
  const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL;
  // Full-kernel processing stops 8 pixels short of the right edge, since the
  // SSE2 kernel reads 8 pixels per row; that border is handled separately.
  const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X);
  const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y);
  const int stride = image_width;

  // Top border rows: use the clipping kernel.
  for (int j = 0; j < KERNEL_Y; ++j) {
    for (int i = 0; i < image_width; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
  }

#ifdef _OPENMP
#pragma omp parallel for reduction(+ : SSIM)
#endif
  for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) {
    // Left border columns: use the clipping kernel.
    for (int i = 0; i < KERNEL_X; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
    // Interior columns: the full kernel fits.
    for (int i = KERNEL_X; i < start_x; ++i) {
      SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]);
    }
    if (start_x < image_width) {
      // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we
      // copy the 8 rightmost pixels into a scratch area and pad it with zeros,
      // which won't contribute to the overall SSIM value (but we need to pass
      // the correct normalizing constant!). With this scratch buffer we can
      // still call GetSSIMFullKernel() instead of the slower GetSSIM().
      // NOTE: we could use a similar method for the left-most pixels too.
      const int kScratchWidth = 8;
      const int kScratchStride = kScratchWidth + KERNEL + 1;
      uint8 scratch_org[KERNEL_SIZE * kScratchStride] = {0};
      uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = {0};

      for (int k = 0; k < KERNEL_SIZE; ++k) {
        const int offset =
            (j - KERNEL + k) * stride + image_width - kScratchWidth;
        memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth);
        memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth);
      }
      for (int k = 0; k <= KERNEL_X + 1; ++k) {
        SSIM += GetSSIMFullKernel(scratch_org, scratch_rec, KERNEL + k, KERNEL,
                                  kScratchStride, kiW[k]);
      }
    }
  }

  // Bottom border rows: use the clipping kernel.
  for (int j = start_y; j < image_height; ++j) {
    for (int i = 0; i < image_width; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
  }
  return SSIM;
}
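
// Example usage (a minimal sketch; assumes two equally sized 8-bit planes
// with stride == width). Note that CalcSSIM() returns the *sum* of the
// per-pixel SSIM values, so normalize by the pixel count before converting:
//
//   const double ssim_sum = CalcSSIM(org_plane, rec_plane, width, height);
//   const double avg_ssim = ssim_sum / (width * height);
//   const double lssim = CalcLSSIM(avg_ssim);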
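// Logarithmic SSIM: maps an average SSIM in [0, 1) to a dB-like scale where
// higher is better; identical images (ssim == 1.0) map to infinity.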
double CalcLSSIM(double ssim) {
  return -10.0 * log10(1.0 - ssim);
}

#ifdef __cplusplus
}  // extern "C"
#endif