1 /*
2 * Copyright (c) 2016 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "modules/desktop_capture/differ_vector_sse2.h"
12
13 #if defined(_MSC_VER)
14 #include <intrin.h>
15 #else
16 #include <emmintrin.h>
17 #include <mmintrin.h>
18 #endif
19
20 namespace webrtc {
21
// Compares the first 64 bytes at |image1| with the first 64 bytes at
// |image2| using four unaligned 128-bit loads per buffer.
//
// Returns true if the sum of absolute byte differences is non-zero, i.e. if
// at least one of the 64 byte pairs differs; returns false when the two
// 64-byte ranges are identical. Neither pointer needs to be 16-byte aligned.
extern bool VectorDifference_SSE2_W16(const uint8_t* image1,
                                      const uint8_t* image2) {
  // 4 vectors * 16 bytes = 64 bytes examined per buffer.
  constexpr int kVectorCount = 4;

  const __m128i* lhs = reinterpret_cast<const __m128i*>(image1);
  const __m128i* rhs = reinterpret_cast<const __m128i*>(image2);

  // _mm_sad_epu8 yields one partial sum per 64-bit half (at most 8 * 255), so
  // the running totals stay far below the 16-bit saturation limit.
  __m128i total = _mm_setzero_si128();
  for (int k = 0; k < kVectorCount; ++k) {
    const __m128i diff =
        _mm_sad_epu8(_mm_loadu_si128(lhs + k), _mm_loadu_si128(rhs + k));
    total = _mm_adds_epu16(total, diff);
  }

  // Shuffle mask 0xEE copies the upper 64 bits into the lower 64 bits
  // (effectively total >> 64), so adding it to |total| folds both partial
  // sums into the low lane. Only the lower 16 bits of the result matter.
  const __m128i upper = _mm_shuffle_epi32(total, 0xEE);
  return _mm_cvtsi128_si32(_mm_adds_epu16(upper, total)) != 0;
}
53
// Compares the first 128 bytes at |image1| with the first 128 bytes at
// |image2| using eight unaligned 128-bit loads per buffer.
//
// Returns true if the sum of absolute byte differences is non-zero, i.e. if
// at least one of the 128 byte pairs differs; returns false when the two
// 128-byte ranges are identical. Neither pointer needs to be 16-byte aligned.
extern bool VectorDifference_SSE2_W32(const uint8_t* image1,
                                      const uint8_t* image2) {
  // 8 vectors * 16 bytes = 128 bytes examined per buffer.
  constexpr int kVectorCount = 8;

  const __m128i* lhs = reinterpret_cast<const __m128i*>(image1);
  const __m128i* rhs = reinterpret_cast<const __m128i*>(image2);

  // _mm_sad_epu8 yields one partial sum per 64-bit half (at most 8 * 255), so
  // the running totals stay far below the 16-bit saturation limit.
  __m128i total = _mm_setzero_si128();
  for (int k = 0; k < kVectorCount; ++k) {
    const __m128i diff =
        _mm_sad_epu8(_mm_loadu_si128(lhs + k), _mm_loadu_si128(rhs + k));
    total = _mm_adds_epu16(total, diff);
  }

  // Shuffle mask 0xEE copies the upper 64 bits into the lower 64 bits
  // (effectively total >> 64), so adding it to |total| folds both partial
  // sums into the low lane. Only the lower 16 bits of the result matter.
  const __m128i upper = _mm_shuffle_epi32(total, 0xEE);
  return _mm_cvtsi128_si32(_mm_adds_epu16(upper, total)) != 0;
}
101
102 } // namespace webrtc
103