/*
 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "./psnr.h"  // NOLINT

#ifdef _OPENMP
#include <omp.h>
#endif
#ifdef _MSC_VER
#include <intrin.h>  // For __cpuid()
#endif

#ifdef __cplusplus
extern "C" {
#endif

typedef unsigned int uint32;  // NOLINT
#ifdef _MSC_VER
typedef unsigned __int64 uint64;
#else  // _MSC_VER
#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long uint64;  // NOLINT
#else  // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long long uint64;  // NOLINT
#endif  // __LP64__
#endif  // _MSC_VER
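
// A minimal compile-time sanity check (an illustrative addition, not part of
// the original file): the negative-array-size trick makes the build fail if
// either portable typedef above does not have the intended width.
typedef char kAssertUint32Is4Bytes[sizeof(uint32) == 4 ? 1 : -1];  // NOLINT
typedef char kAssertUint64Is8Bytes[sizeof(uint64) == 8 ? 1 : -1];  // NOLINT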

// libyuv provides this function when linking the library with jpeg support.
#if !defined(HAVE_JPEG)

#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)
#define HAS_SUMSQUAREERROR_NEON
static uint32 SumSquareError_NEON(const uint8* src_a,
                                  const uint8* src_b,
                                  int count) {
  volatile uint32 sse;
  asm volatile(
      // Clear four accumulator registers.
      "vmov.u8     q7, #0                        \n"
      "vmov.u8     q9, #0                        \n"
      "vmov.u8     q8, #0                        \n"
      "vmov.u8     q10, #0                       \n"

      // Each iteration loads 16 bytes from each source, widens the
      // differences to 16 bits, and accumulates their squares in 32-bit
      // lanes.
      "1:                                        \n"
      "vld1.u8     {q0}, [%0]!                   \n"
      "vld1.u8     {q1}, [%1]!                   \n"
      "vsubl.u8    q2, d0, d2                    \n"
      "vsubl.u8    q3, d1, d3                    \n"
      "vmlal.s16   q7, d4, d4                    \n"
      "vmlal.s16   q8, d6, d6                    \n"
      "vmlal.s16   q8, d5, d5                    \n"
      "vmlal.s16   q10, d7, d7                   \n"
      "subs        %2, %2, #16                   \n"
      "bhi         1b                            \n"

      // Reduce the accumulators to a single scalar sum.
      "vadd.u32    q7, q7, q8                    \n"
      "vadd.u32    q9, q9, q10                   \n"
      "vadd.u32    q10, q7, q9                   \n"
      "vpaddl.u32  q1, q10                       \n"
      "vadd.u64    d0, d2, d3                    \n"
      "vmov.32     %3, d0[0]                     \n"
      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
      :
      : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
  return sse;
}
#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SUMSQUAREERROR_NEON
static uint32 SumSquareError_NEON(const uint8* src_a,
                                  const uint8* src_b,
                                  int count) {
  volatile uint32 sse;
  asm volatile(
      // Clear four accumulator registers.
      "eor         v16.16b, v16.16b, v16.16b     \n"
      "eor         v18.16b, v18.16b, v18.16b     \n"
      "eor         v17.16b, v17.16b, v17.16b     \n"
      "eor         v19.16b, v19.16b, v19.16b     \n"

      // Process 16 bytes per iteration, accumulating squared differences.
      "1:                                        \n"
      "ld1         {v0.16b}, [%0], #16           \n"
      "ld1         {v1.16b}, [%1], #16           \n"
      "subs        %w2, %w2, #16                 \n"
      "usubl       v2.8h, v0.8b, v1.8b           \n"
      "usubl2      v3.8h, v0.16b, v1.16b         \n"
      "smlal       v16.4s, v2.4h, v2.4h          \n"
      "smlal       v17.4s, v3.4h, v3.4h          \n"
      "smlal2      v18.4s, v2.8h, v2.8h          \n"
      "smlal2      v19.4s, v3.8h, v3.8h          \n"
      "b.gt        1b                            \n"

      // Reduce the four accumulators to a single scalar sum.
      "add         v16.4s, v16.4s, v17.4s        \n"
      "add         v18.4s, v18.4s, v19.4s        \n"
      "add         v19.4s, v16.4s, v18.4s        \n"
      "addv        s0, v19.4s                    \n"
      "fmov        %w3, s0                       \n"
      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
      :
      : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
  return sse;
}
#elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#define HAS_SUMSQUAREERROR_SSE2
__declspec(naked) static uint32 SumSquareError_SSE2(const uint8* /*src_a*/,
                                                    const uint8* /*src_b*/,
                                                    int /*count*/) {
  __asm {
    mov        eax, [esp + 4]   // src_a
    mov        edx, [esp + 8]   // src_b
    mov        ecx, [esp + 12]  // count
    pxor       xmm0, xmm0
    pxor       xmm5, xmm5
    sub        edx, eax

  wloop:
    movdqu     xmm1, [eax]
    movdqu     xmm2, [eax + edx]
    lea        eax, [eax + 16]
    movdqu     xmm3, xmm1
    psubusb    xmm1, xmm2
    psubusb    xmm2, xmm3
    por        xmm1, xmm2
    movdqu     xmm2, xmm1
    punpcklbw  xmm1, xmm5
    punpckhbw  xmm2, xmm5
    pmaddwd    xmm1, xmm1
    pmaddwd    xmm2, xmm2
    paddd      xmm0, xmm1
    paddd      xmm0, xmm2
    sub        ecx, 16
    ja         wloop

    pshufd     xmm1, xmm0, 0EEh
    paddd      xmm0, xmm1
    pshufd     xmm1, xmm0, 01h
    paddd      xmm0, xmm1
    movd       eax, xmm0
    ret
  }
}
#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
#define HAS_SUMSQUAREERROR_SSE2
static uint32 SumSquareError_SSE2(const uint8* src_a,
                                  const uint8* src_b,
                                  int count) {
  uint32 sse;
  asm volatile(  // NOLINT
      "pxor      %%xmm0,%%xmm0               \n"
      "pxor      %%xmm5,%%xmm5               \n"
      "sub       %0,%1                       \n"

      // Each iteration computes |a - b| for 16 bytes via two saturating
      // subtracts, then accumulates the squared differences with pmaddwd.
      "1:                                    \n"
      "movdqu    (%0),%%xmm1                 \n"
      "movdqu    (%0,%1,1),%%xmm2            \n"
      "lea       0x10(%0),%0                 \n"
      "movdqu    %%xmm1,%%xmm3               \n"
      "psubusb   %%xmm2,%%xmm1               \n"
      "psubusb   %%xmm3,%%xmm2               \n"
      "por       %%xmm2,%%xmm1               \n"
      "movdqu    %%xmm1,%%xmm2               \n"
      "punpcklbw %%xmm5,%%xmm1               \n"
      "punpckhbw %%xmm5,%%xmm2               \n"
      "pmaddwd   %%xmm1,%%xmm1               \n"
      "pmaddwd   %%xmm2,%%xmm2               \n"
      "paddd     %%xmm1,%%xmm0               \n"
      "paddd     %%xmm2,%%xmm0               \n"
      "sub       $0x10,%2                    \n"
      "ja        1b                          \n"

      // Horizontal add of the four 32-bit lanes.
      "pshufd    $0xee,%%xmm0,%%xmm1         \n"
      "paddd     %%xmm1,%%xmm0               \n"
      "pshufd    $0x1,%%xmm0,%%xmm1          \n"
      "paddd     %%xmm1,%%xmm0               \n"
      "movd      %%xmm0,%3                   \n"

      : "+r"(src_a),  // %0
        "+r"(src_b),  // %1
        "+r"(count),  // %2
        "=g"(sse)     // %3
      :
      : "memory", "cc"
#if defined(__SSE2__)
        ,
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
      );  // NOLINT
  return sse;
}
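
// For reference, a roughly equivalent version of the loop above written with
// SSE2 intrinsics (an illustrative sketch, not part of the original file and
// not used by the dispatcher below; like the asm version, it assumes count is
// a multiple of 16). Guarded on __SSE2__ because, unlike the inline asm,
// intrinsics require the compiler to target SSE2.
#if defined(__SSE2__)
#include <emmintrin.h>
static uint32 SumSquareError_SSE2_Reference(const uint8* src_a,
                                            const uint8* src_b,
                                            int count) {
  __m128i sum = _mm_setzero_si128();
  const __m128i zero = _mm_setzero_si128();
  for (int i = 0; i < count; i += 16) {
    __m128i a = _mm_loadu_si128((const __m128i*)(src_a + i));
    __m128i b = _mm_loadu_si128((const __m128i*)(src_b + i));
    // |a - b|: unsigned saturating subtract in both directions, then OR.
    __m128i diff = _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
    // Widen to 16 bits and sum squares of adjacent pairs into 32-bit lanes.
    __m128i lo = _mm_unpacklo_epi8(diff, zero);
    __m128i hi = _mm_unpackhi_epi8(diff, zero);
    sum = _mm_add_epi32(sum, _mm_madd_epi16(lo, lo));
    sum = _mm_add_epi32(sum, _mm_madd_epi16(hi, hi));
  }
  // Horizontal add of the four 32-bit lanes.
  sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xee));
  sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x01));
  return (uint32)_mm_cvtsi128_si32(sum);
}
#endif  // __SSE2__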
#endif  // LIBYUV_DISABLE_X86 etc

#if defined(HAS_SUMSQUAREERROR_SSE2)
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
// 32-bit PIC code reserves ebx, so save and restore it around cpuid.
static __inline void __cpuid(int cpu_info[4], int info_type) {
  asm volatile(  // NOLINT
      "mov  %%ebx, %%edi                         \n"
      "cpuid                                     \n"
      "xchg %%edi, %%ebx                         \n"
      : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]),
        "=d"(cpu_info[3])
      : "a"(info_type));
}
// For gcc/clang but not clangcl.
#elif !defined(_MSC_VER) && (defined(__i386__) || defined(__x86_64__))
static __inline void __cpuid(int cpu_info[4], int info_type) {
  asm volatile(  // NOLINT
      "cpuid                                     \n"
      : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]),
        "=d"(cpu_info[3])
      : "a"(info_type));
}
#endif

static int CpuHasSSE2() {
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86)
  int cpu_info[4];
  __cpuid(cpu_info, 1);
  if (cpu_info[3] & 0x04000000) {  // EDX bit 26 is the SSE2 feature flag.
    return 1;
  }
#endif
  return 0;
}
#endif  // HAS_SUMSQUAREERROR_SSE2

// Portable C fallback; also used for the final 1-15 byte remainder.
static uint32 SumSquareError_C(const uint8* src_a,
                               const uint8* src_b,
                               int count) {
  uint32 sse = 0u;
  for (int x = 0; x < count; ++x) {
    int diff = src_a[x] - src_b[x];
    sse += static_cast<uint32>(diff * diff);
  }
  return sse;
}

double ComputeSumSquareError(const uint8* src_a,
                             const uint8* src_b,
                             int count) {
  // Pick the fastest available implementation.
  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
      SumSquareError_C;
#if defined(HAS_SUMSQUAREERROR_NEON)
  SumSquareError = SumSquareError_NEON;
#endif
#if defined(HAS_SUMSQUAREERROR_SSE2)
  if (CpuHasSSE2()) {
    SumSquareError = SumSquareError_SSE2;
  }
#endif
  // Accumulate in 32 KB blocks so the 32-bit per-block sums cannot overflow,
  // and so OpenMP can parallelize across blocks.
  const int kBlockSize = 1 << 15;
  uint64 sse = 0;
#ifdef _OPENMP
#pragma omp parallel for reduction(+ : sse)
#endif
  for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
  }
  src_a += count & ~(kBlockSize - 1);
  src_b += count & ~(kBlockSize - 1);
  // A multiple-of-16 remainder can still use the SIMD path.
  int remainder = count & (kBlockSize - 1) & ~15;
  if (remainder) {
    sse += SumSquareError(src_a, src_b, remainder);
    src_a += remainder;
    src_b += remainder;
  }
  // The final 1-15 bytes must use the C path.
  remainder = count & 15;
  if (remainder) {
    sse += SumSquareError_C(src_a, src_b, remainder);
  }
  return static_cast<double>(sse);
}
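
// Example of how the helpers in this file combine (an illustrative sketch,
// not part of the original source):
//   double sse = ComputeSumSquareError(plane_a, plane_b, width * height);
//   double psnr = ComputePSNR(sse, static_cast<double>(width) * height);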
#endif  // !defined(HAVE_JPEG)

// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
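// Worked example (illustrative): for a 1920x1080 plane, size = 2073600.0; if
// sse equals size (a mean squared error of 1.0), this reduces to
// 10 * log10(255 * 255) ~= 48.13 dB.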
double ComputePSNR(double sse, double size) {
  const double kMINSSE = 255.0 * 255.0 * size / pow(10.0, kMaxPSNR / 10.0);
  if (sse <= kMINSSE)
    sse = kMINSSE;  // Produces max PSNR of 128
  return 10.0 * log10(255.0 * 255.0 * size / sse);
}

#ifdef __cplusplus
}  // extern "C"
#endif