/*
 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "./psnr.h"  // NOLINT

#ifdef _OPENMP
#include <omp.h>
#endif
#ifdef _MSC_VER
#include <intrin.h>  // For __cpuid()
#endif

#ifdef __cplusplus
extern "C" {
#endif

typedef unsigned int uint32;  // NOLINT
#ifdef _MSC_VER
typedef unsigned __int64 uint64;
#else  // COMPILER_MSVC
#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long uint64;  // NOLINT
#else  // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long long uint64;  // NOLINT
#endif  // __LP64__
#endif  // _MSC_VER

// libyuv provides this function when the library is linked with JPEG support.
#if !defined(HAVE_JPEG)

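// Each SIMD kernel below computes the same sum as the scalar
// SumSquareError_C for counts that are a multiple of 16;
// ComputeSumSquareError feeds any remaining tail bytes to the C version.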
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)
#define HAS_SUMSQUAREERROR_NEON
static uint32 SumSquareError_NEON(const uint8* src_a,
                                  const uint8* src_b, int count) {
  volatile uint32 sse;
  asm volatile (
    "vmov.u8    q7, #0                         \n"  // clear 4 accumulators
    "vmov.u8    q9, #0                         \n"
    "vmov.u8    q8, #0                         \n"
    "vmov.u8    q10, #0                        \n"

    "1:                                        \n"
    "vld1.u8    {q0}, [%0]!                    \n"  // load 16 bytes of src_a
    "vld1.u8    {q1}, [%1]!                    \n"  // load 16 bytes of src_b
    "vsubl.u8   q2, d0, d2                     \n"  // 16-bit differences
    "vsubl.u8   q3, d1, d3                     \n"
55 "vmlal.s16 q7, d4, d4 \n"
56 "vmlal.s16 q8, d6, d6 \n"
57 "vmlal.s16 q8, d5, d5 \n"
58 "vmlal.s16 q10, d7, d7 \n"
59 "subs %2, %2, #16 \n"
60 "bhi 1b \n"
61
62 "vadd.u32 q7, q7, q8 \n"
63 "vadd.u32 q9, q9, q10 \n"
64 "vadd.u32 q10, q7, q9 \n"
65 "vpaddl.u32 q1, q10 \n"
66 "vadd.u64 d0, d2, d3 \n"
67 "vmov.32 %3, d0[0] \n"
68 : "+r"(src_a),
69 "+r"(src_b),
70 "+r"(count),
71 "=r"(sse)
72 :
73 : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
74 return sse;
75 }
#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SUMSQUAREERROR_NEON
static uint32 SumSquareError_NEON(const uint8* src_a,
                                  const uint8* src_b, int count) {
  volatile uint32 sse;
  asm volatile (
    "eor        v16.16b, v16.16b, v16.16b      \n"  // clear 4 accumulators
    "eor        v18.16b, v18.16b, v18.16b      \n"
    "eor        v17.16b, v17.16b, v17.16b      \n"
    "eor        v19.16b, v19.16b, v19.16b      \n"

    "1:                                        \n"
    "ld1        {v0.16b}, [%0], #16            \n"  // load 16 bytes of src_a
    "ld1        {v1.16b}, [%1], #16            \n"  // load 16 bytes of src_b
    "subs       %w2, %w2, #16                  \n"
    "usubl      v2.8h, v0.8b, v1.8b            \n"  // 16-bit differences
    "usubl2     v3.8h, v0.16b, v1.16b          \n"
    "smlal      v16.4s, v2.4h, v2.4h           \n"  // accumulate squares
    "smlal      v17.4s, v3.4h, v3.4h           \n"
    "smlal2     v18.4s, v2.8h, v2.8h           \n"
    "smlal2     v19.4s, v3.8h, v3.8h           \n"
    "b.gt       1b                             \n"

    "add        v16.4s, v16.4s, v17.4s         \n"  // reduce lanes to one sum
    "add        v18.4s, v18.4s, v19.4s         \n"
    "add        v19.4s, v16.4s, v18.4s         \n"
    "addv       s0, v19.4s                     \n"
    "fmov       %w3, s0                        \n"
    : "+r"(src_a),
      "+r"(src_b),
      "+r"(count),
      "=r"(sse)
    :
109 : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
  return sse;
}
#elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#define HAS_SUMSQUAREERROR_SSE2
__declspec(naked)
static uint32 SumSquareError_SSE2(const uint8* /*src_a*/,
                                  const uint8* /*src_b*/, int /*count*/) {
  __asm {
    mov        eax, [esp + 4]    // src_a
    mov        edx, [esp + 8]    // src_b
    mov        ecx, [esp + 12]   // count
    pxor       xmm0, xmm0        // sum of squares accumulator
    pxor       xmm5, xmm5        // zero, for widening bytes to words
    sub        edx, eax          // so src_b is addressed as [eax + edx]

  wloop:
    movdqu     xmm1, [eax]
    movdqu     xmm2, [eax + edx]
    lea        eax, [eax + 16]
    movdqu     xmm3, xmm1
    psubusb    xmm1, xmm2        // |a - b|: saturating subtract both ways,
    psubusb    xmm2, xmm3        // e.g. 3 - 5 -> 0 and 5 - 3 -> 2,
    por        xmm1, xmm2        // then or the two results together
    movdqu     xmm2, xmm1
    punpcklbw  xmm1, xmm5        // widen bytes to 16-bit words
    punpckhbw  xmm2, xmm5
    pmaddwd    xmm1, xmm1        // square and pairwise add
    pmaddwd    xmm2, xmm2
    paddd      xmm0, xmm1
    paddd      xmm0, xmm2
    sub        ecx, 16
    ja         wloop

    pshufd     xmm1, xmm0, 0EEh  // fold the four 32-bit lanes into one
    paddd      xmm0, xmm1
    pshufd     xmm1, xmm0, 01h
    paddd      xmm0, xmm1
    movd       eax, xmm0
    ret
  }
}
#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
#define HAS_SUMSQUAREERROR_SSE2
static uint32 SumSquareError_SSE2(const uint8* src_a,
                                  const uint8* src_b, int count) {
  uint32 sse;
  asm volatile (  // NOLINT
    "pxor      %%xmm0,%%xmm0                   \n"  // sum of squares accumulator
    "pxor      %%xmm5,%%xmm5                   \n"  // zero, for widening
    "sub       %0,%1                           \n"  // src_b addressed as (%0,%1)

    "1:                                        \n"
    "movdqu    (%0),%%xmm1                     \n"
    "movdqu    (%0,%1,1),%%xmm2                \n"
    "lea       0x10(%0),%0                     \n"
    "movdqu    %%xmm1,%%xmm3                   \n"
    "psubusb   %%xmm2,%%xmm1                   \n"  // |a - b|: saturating
    "psubusb   %%xmm3,%%xmm2                   \n"  // subtract both ways,
    "por       %%xmm2,%%xmm1                   \n"  // then or the results
    "movdqu    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"  // widen bytes to words
    "punpckhbw %%xmm5,%%xmm2                   \n"
    "pmaddwd   %%xmm1,%%xmm1                   \n"  // square and pairwise add
    "pmaddwd   %%xmm2,%%xmm2                   \n"
    "paddd     %%xmm1,%%xmm0                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "ja        1b                              \n"

    "pshufd    $0xee,%%xmm0,%%xmm1             \n"  // fold four lanes into one
    "paddd     %%xmm1,%%xmm0                   \n"
    "pshufd    $0x1,%%xmm0,%%xmm1              \n"
    "paddd     %%xmm1,%%xmm0                   \n"
    "movd      %%xmm0,%3                       \n"

    : "+r"(src_a),  // %0
      "+r"(src_b),  // %1
      "+r"(count),  // %2
      "=g"(sse)     // %3
    :
    : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );  // NOLINT
  return sse;
}
#endif  // LIBYUV_DISABLE_X86 etc

#if defined(HAS_SUMSQUAREERROR_SSE2)
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
static __inline void __cpuid(int cpu_info[4], int info_type) {
  asm volatile (  // NOLINT
    "mov %%ebx, %%edi                          \n"  // preserve ebx (PIC register)
    "cpuid                                     \n"
    "xchg %%edi, %%ebx                         \n"
    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
    : "a"(info_type));
}
// For gcc/clang but not clangcl.
#elif (defined(__i386__) || defined(__x86_64__)) && !defined(_MSC_VER)
static __inline void __cpuid(int cpu_info[4], int info_type) {
  asm volatile (  // NOLINT
    "cpuid                                     \n"
    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
    : "a"(info_type));
}
#endif

static int CpuHasSSE2() {
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86)
  int cpu_info[4];
  __cpuid(cpu_info, 1);
  if (cpu_info[3] & 0x04000000) {  // EDX bit 26 from leaf 1 indicates SSE2.
    return 1;
  }
#endif
  return 0;
}
#endif  // HAS_SUMSQUAREERROR_SSE2

// Scalar reference implementation; also used for tails shorter than 16 bytes.
static uint32 SumSquareError_C(const uint8* src_a,
                               const uint8* src_b, int count) {
  uint32 sse = 0u;
  for (int x = 0; x < count; ++x) {
    int diff = src_a[x] - src_b[x];
    sse += static_cast<uint32>(diff * diff);
  }
  return sse;
}
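
// ComputeSumSquareError splits the buffer into 32 KiB blocks so the SIMD
// kernel (and, when enabled, OpenMP threads) can process whole blocks, then
// handles what is left. As a worked example with a hypothetical count of
// 100005: three blocks of 32768 cover 98304 bytes, 1696 of the remaining
// 1701 bytes (a multiple of 16) still go through the SIMD kernel, and the
// final 5 bytes fall back to SumSquareError_C.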
double ComputeSumSquareError(const uint8* src_a,
                             const uint8* src_b, int count) {
  uint32 (*SumSquareError)(const uint8* src_a,
                           const uint8* src_b, int count) = SumSquareError_C;
#if defined(HAS_SUMSQUAREERROR_NEON)
  SumSquareError = SumSquareError_NEON;
#endif
#if defined(HAS_SUMSQUAREERROR_SSE2)
  if (CpuHasSSE2()) {
    SumSquareError = SumSquareError_SSE2;
  }
#endif
  const int kBlockSize = 1 << 15;  // 32768 byte blocks.
  uint64 sse = 0;
#ifdef _OPENMP
#pragma omp parallel for reduction(+: sse)
#endif
  for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
  }
  src_a += count & ~(kBlockSize - 1);
  src_b += count & ~(kBlockSize - 1);
  int remainder = count & (kBlockSize - 1) & ~15;  // Whole 16-byte groups.
  if (remainder) {
    sse += SumSquareError(src_a, src_b, remainder);
    src_a += remainder;
    src_b += remainder;
  }
  remainder = count & 15;  // Final tail, handled by the C version.
  if (remainder) {
    sse += SumSquareError_C(src_a, src_b, remainder);
  }
  return static_cast<double>(sse);
}
#endif  // !defined(HAVE_JPEG)

// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
double ComputePSNR(double sse, double size) {
  const double kMINSSE = 255.0 * 255.0 * size / pow(10.0, kMaxPSNR / 10.0);
  if (sse <= kMINSSE)
    sse = kMINSSE;  // Clamp to produce the max PSNR of 128.
  return 10.0 * log10(255.0 * 255.0 * size / sse);
}
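
// A minimal usage sketch; frame_a, frame_b, width and height are hypothetical
// caller-side values, not part of this file:
//   int size = width * height;
//   double sse = ComputeSumSquareError(frame_a, frame_b, size);
//   double psnr = ComputePSNR(sse, size);
// For example, sse == size (a mean squared error of 1.0) yields
// 10 * log10(255 * 255) ~= 48.13 dB.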

#ifdef __cplusplus
}  // extern "C"
#endif