/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./psnr.h"  // NOLINT
#include <math.h>    // For log10() and pow(); possibly already via psnr.h.

#ifdef _OPENMP
#include <omp.h>
#endif
#ifdef _MSC_VER
#include <intrin.h>  // For __cpuid()
#endif

#ifdef __cplusplus
extern "C" {
#endif

typedef unsigned int uint32;  // NOLINT
#ifdef _MSC_VER
typedef unsigned __int64 uint64;
#else  // _MSC_VER
#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long uint64;  // NOLINT
#else  // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long long uint64;  // NOLINT
#endif  // __LP64__
#endif  // _MSC_VER

// libyuv provides this function when the library is built with jpeg support.
#if !defined(HAVE_JPEG)

#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)
#define HAS_SUMSQUAREERROR_NEON
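// Sums squared byte differences, 16 pixels per iteration. vsubl.u8 widens
// each difference to 16 bits (reinterpreted as s16 it is the true signed
// difference), and vmlal.s16 squares and accumulates it into 32-bit lanes.
// count is expected to be a multiple of 16.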
static uint32 SumSquareError_NEON(const uint8* src_a,
                                  const uint8* src_b, int count) {
  volatile uint32 sse;
  asm volatile (
    "vmov.u8    q7, #0                         \n"
    "vmov.u8    q9, #0                         \n"
    "vmov.u8    q8, #0                         \n"
    "vmov.u8    q10, #0                        \n"

  "1:                                          \n"
    "vld1.u8    {q0}, [%0]!                    \n"
    "vld1.u8    {q1}, [%1]!                    \n"
    "vsubl.u8   q2, d0, d2                     \n"
    "vsubl.u8   q3, d1, d3                     \n"
    "vmlal.s16  q7, d4, d4                     \n"
    "vmlal.s16  q8, d6, d6                     \n"
    "vmlal.s16  q8, d5, d5                     \n"
    "vmlal.s16  q10, d7, d7                    \n"
    "subs       %2, %2, #16                    \n"
    "bhi        1b                             \n"

    "vadd.u32   q7, q7, q8                     \n"
    "vadd.u32   q9, q9, q10                    \n"
    "vadd.u32   q10, q7, q9                    \n"
    "vpaddl.u32 q1, q10                        \n"
    "vadd.u64   d0, d2, d3                     \n"
    "vmov.32    %3, d0[0]                      \n"
    : "+r"(src_a),
      "+r"(src_b),
      "+r"(count),
      "=r"(sse)
    :
    : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
  return sse;
}
#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SUMSQUAREERROR_NEON
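// AArch64 version of the same reduction: usubl/usubl2 widen the byte
// differences, smlal/smlal2 square-accumulate into four 32-bit vectors, and
// addv folds the result to a scalar.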
static uint32 SumSquareError_NEON(const uint8* src_a,
                                  const uint8* src_b, int count) {
  volatile uint32 sse;
  asm volatile (
    "eor        v16.16b, v16.16b, v16.16b      \n"
    "eor        v18.16b, v18.16b, v18.16b      \n"
    "eor        v17.16b, v17.16b, v17.16b      \n"
    "eor        v19.16b, v19.16b, v19.16b      \n"

  "1:                                          \n"
    "ld1        {v0.16b}, [%0], #16            \n"
    "ld1        {v1.16b}, [%1], #16            \n"
    "subs       %w2, %w2, #16                  \n"
    "usubl      v2.8h, v0.8b, v1.8b            \n"
    "usubl2     v3.8h, v0.16b, v1.16b          \n"
    "smlal      v16.4s, v2.4h, v2.4h           \n"
    "smlal      v17.4s, v3.4h, v3.4h           \n"
    "smlal2     v18.4s, v2.8h, v2.8h           \n"
    "smlal2     v19.4s, v3.8h, v3.8h           \n"
    "b.gt       1b                             \n"

    "add        v16.4s, v16.4s, v17.4s         \n"
    "add        v18.4s, v18.4s, v19.4s         \n"
    "add        v19.4s, v16.4s, v18.4s         \n"
    "addv       s0, v19.4s                     \n"
    "fmov       %w3, s0                        \n"
    : "+r"(src_a),
      "+r"(src_b),
      "+r"(count),
      "=r"(sse)
    :
    : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
  return sse;
}
#elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#define HAS_SUMSQUAREERROR_SSE2
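// Computes |a - b| per byte via two saturating subtracts (psubusb) OR'd
// together, widens to 16 bits against a zero register, then pmaddwd squares
// and pairwise-adds into four 32-bit accumulators. Naked function: arguments
// are read straight off the stack.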
__declspec(naked)
static uint32 SumSquareError_SSE2(const uint8* /*src_a*/,
                                  const uint8* /*src_b*/, int /*count*/) {
  __asm {
    mov        eax, [esp + 4]    // src_a
    mov        edx, [esp + 8]    // src_b
    mov        ecx, [esp + 12]   // count
    pxor       xmm0, xmm0
    pxor       xmm5, xmm5
    sub        edx, eax

  wloop:
    movdqu     xmm1, [eax]
    movdqu     xmm2, [eax + edx]
    lea        eax,  [eax + 16]
    movdqu     xmm3, xmm1
    psubusb    xmm1, xmm2
    psubusb    xmm2, xmm3
    por        xmm1, xmm2
    movdqu     xmm2, xmm1
    punpcklbw  xmm1, xmm5
    punpckhbw  xmm2, xmm5
    pmaddwd    xmm1, xmm1
    pmaddwd    xmm2, xmm2
    paddd      xmm0, xmm1
    paddd      xmm0, xmm2
    sub        ecx, 16
    ja         wloop

    pshufd     xmm1, xmm0, 0EEh
    paddd      xmm0, xmm1
    pshufd     xmm1, xmm0, 01h
    paddd      xmm0, xmm1
    movd       eax, xmm0
    ret
  }
}
#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
#define HAS_SUMSQUAREERROR_SSE2
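// Same reduction as the MSVC version above, as GCC/Clang inline asm. The xmm
// clobber list is guarded by __SSE2__ since naming xmm registers is only
// accepted when the compiler itself targets SSE2.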
static uint32 SumSquareError_SSE2(const uint8* src_a,
                                  const uint8* src_b, int count) {
  uint32 sse;
  asm volatile (  // NOLINT
    "pxor      %%xmm0,%%xmm0                   \n"
    "pxor      %%xmm5,%%xmm5                   \n"
    "sub       %0,%1                           \n"

  "1:                                          \n"
    "movdqu    (%0),%%xmm1                     \n"
    "movdqu    (%0,%1,1),%%xmm2                \n"
    "lea       0x10(%0),%0                     \n"
    "movdqu    %%xmm1,%%xmm3                   \n"
    "psubusb   %%xmm2,%%xmm1                   \n"
    "psubusb   %%xmm3,%%xmm2                   \n"
    "por       %%xmm2,%%xmm1                   \n"
    "movdqu    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpckhbw %%xmm5,%%xmm2                   \n"
    "pmaddwd   %%xmm1,%%xmm1                   \n"
    "pmaddwd   %%xmm2,%%xmm2                   \n"
    "paddd     %%xmm1,%%xmm0                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "ja        1b                              \n"

    "pshufd    $0xee,%%xmm0,%%xmm1             \n"
    "paddd     %%xmm1,%%xmm0                   \n"
    "pshufd    $0x1,%%xmm0,%%xmm1              \n"
    "paddd     %%xmm1,%%xmm0                   \n"
    "movd      %%xmm0,%3                       \n"

  : "+r"(src_a),      // %0
    "+r"(src_b),      // %1
    "+r"(count),      // %2
    "=g"(sse)         // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );  // NOLINT
  return sse;
}
#endif  // LIBYUV_DISABLE_X86 etc

#if defined(HAS_SUMSQUAREERROR_SSE2)
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
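// Under 32-bit PIC, ebx holds the GOT pointer and must not be clobbered, so
// cpuid runs with ebx parked in edi.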
static __inline void __cpuid(int cpu_info[4], int info_type) {
  asm volatile (  // NOLINT
    "mov %%ebx, %%edi                          \n"
    "cpuid                                     \n"
    "xchg %%edi, %%ebx                         \n"
    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
    : "a"(info_type));
}
// For gcc/clang but not clangcl.
#elif (defined(__i386__) || defined(__x86_64__)) && !defined(_MSC_VER)
static __inline void __cpuid(int cpu_info[4], int info_type) {
  asm volatile (  // NOLINT
    "cpuid                                     \n"
    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
    : "a"(info_type));
}
#endif

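// Returns 1 if cpuid leaf 1 reports SSE2 (edx bit 26).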
static int CpuHasSSE2() {
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86)
  int cpu_info[4];
  __cpuid(cpu_info, 1);
  if (cpu_info[3] & 0x04000000) {
    return 1;
  }
#endif
  return 0;
}
#endif  // HAS_SUMSQUAREERROR_SSE2

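// Portable fallback. Unlike the SIMD kernels it accepts any count, so it
// also handles the final partial group of fewer than 16 pixels.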
static uint32 SumSquareError_C(const uint8* src_a,
                               const uint8* src_b, int count) {
  uint32 sse = 0u;
  for (int x = 0; x < count; ++x) {
    int diff = src_a[x] - src_b[x];
    sse += static_cast<uint32>(diff * diff);
  }
  return sse;
}

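// Computes the total squared byte difference over count pixels, picking the
// fastest available kernel at run time: NEON when compiled in, SSE2 when
// cpuid reports support, otherwise the C fallback.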
double ComputeSumSquareError(const uint8* src_a,
                             const uint8* src_b, int count) {
  uint32 (*SumSquareError)(const uint8* src_a,
                           const uint8* src_b, int count) = SumSquareError_C;
#if defined(HAS_SUMSQUAREERROR_NEON)
  SumSquareError = SumSquareError_NEON;
#endif
#if defined(HAS_SUMSQUAREERROR_SSE2)
  if (CpuHasSSE2()) {
    SumSquareError = SumSquareError_SSE2;
  }
#endif
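  // Sum in blocks of 32768 pixels: each block's squared-error total is at
  // most 32768 * 255 * 255 (~2.13e9), which still fits the kernels' 32-bit
  // accumulators, and OpenMP can split the blocks across threads.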
  const int kBlockSize = 1 << 15;
  uint64 sse = 0;
#ifdef _OPENMP
#pragma omp parallel for reduction(+: sse)
#endif
  for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
  }
  // Advance past the full blocks, then handle what is left: first the
  // largest remaining multiple of 16 with the selected kernel, then the
  // final tail of fewer than 16 pixels with the C fallback.
  src_a += count & ~(kBlockSize - 1);
  src_b += count & ~(kBlockSize - 1);
  int remainder = count & (kBlockSize - 1) & ~15;
  if (remainder) {
    sse += SumSquareError(src_a, src_b, remainder);
    src_a += remainder;
    src_b += remainder;
  }
  remainder = count & 15;
  if (remainder) {
    sse += SumSquareError_C(src_a, src_b, remainder);
  }
  return static_cast<double>(sse);
}
#endif  // !defined(HAVE_JPEG)

// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
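// Example: a 1920x1080 plane (size = 2073600.0) with sse equal to size
// (mean squared error of 1.0) gives 10 * log10(255 * 255) ~= 48.13 dB.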
double ComputePSNR(double sse, double size) {
  const double kMINSSE = 255.0 * 255.0 * size / pow(10.0, kMaxPSNR / 10.0);
  if (sse <= kMINSSE)
    sse = kMINSSE;  // Produces max PSNR of 128
  return 10.0 * log10(255.0 * 255.0 * size / sse);
}
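
// Minimal usage sketch (frame_a/frame_b are hypothetical byte buffers of
// width * height pixels):
//   double sse = ComputeSumSquareError(frame_a, frame_b, width * height);
//   double psnr = ComputePSNR(sse, static_cast<double>(width) * height);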

#ifdef __cplusplus
}  // extern "C"
#endif