/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "../util/ssim.h"  // NOLINT

#include <string.h>

#ifdef __cplusplus
extern "C" {
#endif

typedef unsigned int uint32;     // NOLINT
typedef unsigned short uint16;   // NOLINT

#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \
  (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
#define __SSE2__
#endif
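// Note: MSVC does not predefine __SSE2__; the block above infers SSE2 support
// from the MSVC-specific _M_X64 / _M_IX86_FP macros instead.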
#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
#include <emmintrin.h>
#endif

#ifdef _OPENMP
#include <omp.h>
#endif

// SSIM
enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 };
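// KERNEL_SIZE is 7, i.e. each SSIM sample is computed over a 7x7 window
// centered on the pixel of interest.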

// Symmetric Gaussian kernel:  K[i] = ~11 * exp(-0.3 * i * i)
// The maximum value (11 x 11) must be less than 128 to avoid sign
// problems during the calls to _mm_mullo_epi16().
static const int K[KERNEL_SIZE] = {
  1, 3, 7, 11, 7, 3, 1    // ~11 * exp(-0.3 * i * i)
};
static const double kiW[KERNEL + 1 + 1] = {
  1. / 1089.,   // 1 / sum(i:0..6, j:0..6) K[i]*K[j]
  1. / 1089.,   // 1 / sum(i:0..6, j:0..6) K[i]*K[j]
  1. / 1056.,   // 1 / sum(i:0..5, j:0..6) K[i]*K[j]
  1. / 957.,    // 1 / sum(i:0..4, j:0..6) K[i]*K[j]
  1. / 726.,    // 1 / sum(i:0..3, j:0..6) K[i]*K[j]
};
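// (Worked check: sum(K) = 1 + 3 + 7 + 11 + 7 + 3 + 1 = 33, so the full-window
// weight is 33 * 33 = 1089. Dropping the rightmost 1, 2 or 3 columns leaves
// row sums of 32, 29 and 22, giving 32 * 33 = 1056, 29 * 33 = 957 and
// 22 * 33 = 726 respectively.)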

#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)

#define PWEIGHT(A, B)  static_cast<uint16>(K[(A)] * K[(B)])   // weight product
#define MAKE_WEIGHT(L)                                               \
  { { { PWEIGHT(L, 0), PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3),  \
        PWEIGHT(L, 4), PWEIGHT(L, 5), PWEIGHT(L, 6), 0 } } }

// We need this union trick to be able to initialize constant static __m128i
// values. We can't call _mm_set_epi16() for static compile-time initialization.
static const struct {
  union {
    uint16 i16_[8];
    __m128i m_;
  } values_;
} W0 = MAKE_WEIGHT(0),
  W1 = MAKE_WEIGHT(1),
  W2 = MAKE_WEIGHT(2),
  W3 = MAKE_WEIGHT(3);
  // ... the rest is symmetric.
#undef MAKE_WEIGHT
#undef PWEIGHT
#endif
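// For illustration, W0 holds the same lanes one would otherwise build at
// runtime with (highest lane listed first):
//   _mm_set_epi16(0, PWEIGHT(0, 6), PWEIGHT(0, 5), PWEIGHT(0, 4),
//                 PWEIGHT(0, 3), PWEIGHT(0, 2), PWEIGHT(0, 1), PWEIGHT(0, 0));
// but _mm_set_epi16() is not usable in a static initializer.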

// Common final expression for SSIM, once the weighted sums are known.
static double FinalizeSSIM(double iw, double xm, double ym,
                           double xxm, double xym, double yym) {
  const double iwx = xm * iw;
  const double iwy = ym * iw;
  double sxx = xxm * iw - iwx * iwx;
  double syy = yym * iw - iwy * iwy;
  // small errors are possible, due to rounding. Clamp to zero.
  if (sxx < 0.) sxx = 0.;
  if (syy < 0.) syy = 0.;
  const double sxsy = sqrt(sxx * syy);
  const double sxy = xym * iw - iwx * iwy;
  static const double C11 = (0.01 * 0.01) * (255 * 255);
  static const double C22 = (0.03 * 0.03) * (255 * 255);
  static const double C33 = (0.015 * 0.015) * (255 * 255);
  const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
  const double c = (2. * sxsy      + C22) / (sxx + syy + C22);
  const double s = (sxy + C33) / (sxsy + C33);
  return l * c * s;
}
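// For reference, FinalizeSSIM() evaluates the usual three-term SSIM product:
//   l = (2*ux*uy + C1) / (ux^2 + uy^2 + C1)   // luminance
//   c = (2*sx*sy + C2) / (sx^2 + sy^2 + C2)   // contrast
//   s = (sxy + C3)     / (sx*sy + C3)         // structure
// where ux, uy are the weighted means and sxx, syy, sxy the weighted
// (co)variances, recovered above via E[X^2] - E[X]^2.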

// GetSSIM() does clipping.  GetSSIMFullKernel() does not.

// TODO(skal): use summed tables?
// Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1)
// with a diff of 255, squared. The maximum accumulated value is thus
// 0x4388241, which fits into a 32-bit integer.
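// (Worked check: the total window weight is 33 * 33 = 1089, and
// 1089 * 255 * 255 = 70812225 = 0x4388241, comfortably below 2^32.)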
double GetSSIM(const uint8 *org, const uint8 *rec,
               int xo, int yo, int W, int H, int stride) {
  uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
  org += (yo - KERNEL) * stride;
  org += (xo - KERNEL);
  rec += (yo - KERNEL) * stride;
  rec += (xo - KERNEL);
  for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) {
    if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) continue;
    const int Wy = K[y_];
    for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) {
      const int Wxy = Wy * K[x_];
      if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) {
        const int org_x = org[x_];
        const int rec_x = rec[x_];
        ws += Wxy;
        xm  += Wxy * org_x;
        ym  += Wxy * rec_x;
        xxm += Wxy * org_x * org_x;
        xym += Wxy * org_x * rec_x;
        yym += Wxy * rec_x * rec_x;
      }
    }
  }
  return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);
}
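// Clipping note: GetSSIM() serves border pixels, where part of the 7x7 window
// may fall outside the image; out-of-range taps are skipped and the
// normalization 1 / ws is rebuilt from the weights actually accumulated.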

double GetSSIMFullKernel(const uint8 *org, const uint8 *rec,
                         int xo, int yo, int stride,
                         double area_weight) {
  uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;

#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__)

  org += yo * stride + xo;
  rec += yo * stride + xo;
  for (int y = 1; y <= KERNEL; y++) {
    const int dy1 = y * stride;
    const int dy2 = y * stride;
    const int Wy = K[KERNEL + y];

    for (int x = 1; x <= KERNEL; x++) {
      // Compute the contributions of the upper-left (ul), upper-right (ur),
      // lower-left (ll) and lower-right (lr) points (see the diagram below).
      // A symmetric kernel has the same weight at those four points.
      //       -  -  -  -  -  -  -
      //       -  ul -  -  -  ur -
      //       -  -  -  -  -  -  -
      //       -  -  -  0  -  -  -
      //       -  -  -  -  -  -  -
      //       -  ll -  -  -  lr -
      //       -  -  -  -  -  -  -
      const int Wxy = Wy * K[KERNEL + x];
      const int ul1 = org[-dy1 - x];
      const int ur1 = org[-dy1 + x];
      const int ll1 = org[dy1 - x];
      const int lr1 = org[dy1 + x];

      const int ul2 = rec[-dy2 - x];
      const int ur2 = rec[-dy2 + x];
      const int ll2 = rec[dy2 - x];
      const int lr2 = rec[dy2 + x];

      xm  += Wxy * (ul1 + ur1 + ll1 + lr1);
      ym  += Wxy * (ul2 + ur2 + ll2 + lr2);
      xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1);
      xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2);
      yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2);
    }

    // Compute the contributions of the up (u), down (d), left (l) and
    // right (r) points across the main axes (see the diagram below).
    // A symmetric kernel has the same weight at those four points.
    //       -  -  -  -  -  -  -
    //       -  -  -  u  -  -  -
    //       -  -  -  -  -  -  -
    //       -  l  -  0  -  r  -
    //       -  -  -  -  -  -  -
    //       -  -  -  d  -  -  -
    //       -  -  -  -  -  -  -
    const int Wxy = Wy * K[KERNEL];
    const int u1 = org[-dy1];
    const int d1 = org[dy1];
    const int l1 = org[-y];
    const int r1 = org[y];

    const int u2 = rec[-dy2];
    const int d2 = rec[dy2];
    const int l2 = rec[-y];
    const int r2 = rec[y];

    xm  += Wxy * (u1 + d1 + l1 + r1);
    ym  += Wxy * (u2 + d2 + l2 + r2);
    xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1);
    xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2);
    yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2);
  }

  // Lastly, the contribution of the center (xo, yo) point itself.
  const int Wxy = K[KERNEL] * K[KERNEL];
  const int s1 = org[0];
  const int s2 = rec[0];

  xm  += Wxy * s1;
  ym  += Wxy * s2;
  xxm += Wxy * s1 * s1;
  xym += Wxy * s1 * s2;
  yym += Wxy * s2 * s2;
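  // All 49 taps of the 7x7 window have now been accumulated; the caller's
  // area_weight (kiW[0] = 1/1089 for interior pixels) supplies the
  // normalization that GetSSIM() instead derives from ws.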

#else   // __SSE2__

  org += (yo - KERNEL) * stride + (xo - KERNEL);
  rec += (yo - KERNEL) * stride + (xo - KERNEL);

  const __m128i zero = _mm_setzero_si128();
  __m128i x = zero;
  __m128i y = zero;
  __m128i xx = zero;
  __m128i xy = zero;
  __m128i yy = zero;

// Read 8 pixels at line #L, convert them to 16 bit, then weight and
// accumulate.
#define LOAD_LINE_PAIR(L, WEIGHT) do {                                       \
  const __m128i v0 =                                                         \
      _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L) * stride)); \
  const __m128i v1 =                                                         \
      _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L) * stride)); \
  const __m128i w0 = _mm_unpacklo_epi8(v0, zero);                            \
  const __m128i w1 = _mm_unpacklo_epi8(v1, zero);                            \
  const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_);              \
  const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_);              \
  x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero));                       \
  y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero));                       \
  x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero));                       \
  y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero));                       \
  xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0));                           \
  xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1));                           \
  yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1));                           \
} while (0)
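// Overflow note: the weighted samples in ww0/ww1 are at most 121 * 255 =
// 30855, below 2^15, so _mm_mullo_epi16() never spills into the sign bit,
// and each _mm_madd_epi16() pair-sum is at most 2 * 30855 * 255 < 2^31.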

#define ADD_AND_STORE_FOUR_EPI32(M, OUT) do {                                \
  uint32 tmp[4];                                                             \
  _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M));                    \
  (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0];                                 \
} while (0)

  LOAD_LINE_PAIR(0, W0);
  LOAD_LINE_PAIR(1, W1);
  LOAD_LINE_PAIR(2, W2);
  LOAD_LINE_PAIR(3, W3);
  LOAD_LINE_PAIR(4, W2);
  LOAD_LINE_PAIR(5, W1);
  LOAD_LINE_PAIR(6, W0);

  ADD_AND_STORE_FOUR_EPI32(x, xm);
  ADD_AND_STORE_FOUR_EPI32(y, ym);
  ADD_AND_STORE_FOUR_EPI32(xx, xxm);
  ADD_AND_STORE_FOUR_EPI32(xy, xym);
  ADD_AND_STORE_FOUR_EPI32(yy, yym);

#undef LOAD_LINE_PAIR
#undef ADD_AND_STORE_FOUR_EPI32
#endif

  return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym);
}

static int start_max(int x, int y) { return (x > y) ? x : y; }

double CalcSSIM(const uint8 *org, const uint8 *rec,
                const int image_width, const int image_height) {
  double SSIM = 0.;
  const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL;
  const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL;
  const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X);
  const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y);
  const int stride = image_width;

  for (int j = 0; j < KERNEL_Y; ++j) {
    for (int i = 0; i < image_width; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
  }

#ifdef _OPENMP
  #pragma omp parallel for reduction(+: SSIM)
#endif
  for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) {
    for (int i = 0; i < KERNEL_X; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
    for (int i = KERNEL_X; i < start_x; ++i) {
      SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]);
    }
    if (start_x < image_width) {
      // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we
      // copy the 8 rightmost pixels into a cache area, and pad this area with
      // zeros which won't contribute to the overall SSIM value (but we need
      // to pass the correct normalizing constant!). By using this cache, we
      // can still call GetSSIMFullKernel() instead of the slower GetSSIM().
      // NOTE: we could use a similar method for the left-most pixels too.
      const int kScratchWidth = 8;
      const int kScratchStride = kScratchWidth + KERNEL + 1;
      uint8 scratch_org[KERNEL_SIZE * kScratchStride] = { 0 };
      uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = { 0 };

      for (int k = 0; k < KERNEL_SIZE; ++k) {
        const int offset =
            (j - KERNEL + k) * stride + image_width - kScratchWidth;
        memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth);
        memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth);
      }
      for (int k = 0; k <= KERNEL_X + 1; ++k) {
        SSIM += GetSSIMFullKernel(scratch_org, scratch_rec,
                                  KERNEL + k, KERNEL, kScratchStride, kiW[k]);
      }
    }
  }

  for (int j = start_y; j < image_height; ++j) {
    for (int i = 0; i < image_width; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
  }
  return SSIM;
}

double CalcLSSIM(double ssim) {
  return -10.0 * log10(1.0 - ssim);
}
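
// Example use (a sketch; the frame buffers, width and height are
// caller-supplied, and CalcSSIM() returns the per-pixel sum, so divide by the
// pixel count before converting to the log scale):
//   double sum = CalcSSIM(orig_frame, recon_frame, width, height);
//   double mean_ssim = sum / (width * height);
//   double lssim_db = CalcLSSIM(mean_ssim);  // e.g. mean 0.99 -> 20.0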

#ifdef __cplusplus
}  // extern "C"
#endif