/*
 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_DSP_X86_MEM_SSE2_H_
#define VPX_DSP_X86_MEM_SSE2_H_

#include <emmintrin.h>  // SSE2
#include <stdint.h>     // uint8_t

#include "./vpx_config.h"
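// Load 4 bytes from each of 4 rows of s into the low 32 bits of d[0..3].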
static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride));
  d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride));
  d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride));
  d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride));
}
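// Load 4 bytes from each of 8 rows of s into the low 32 bits of d[0..7].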
static INLINE void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  load_8bit_4x4(s + 0 * stride, stride, &d[0]);
  load_8bit_4x4(s + 4 * stride, stride, &d[4]);
}
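// Load 8 bytes from each of 4 rows of s into the low 64 bits of d[0..3].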
static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride));
  d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride));
  d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride));
  d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride));
}
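// Load 8 bytes from each of 8 rows of s into the low 64 bits of d[0..7].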
static INLINE void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  load_8bit_8x4(s + 0 * stride, stride, &d[0]);
  load_8bit_8x4(s + 4 * stride, stride, &d[4]);
}
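// Load 16 bytes from each of 8 rows of s into d[0..7]. Each row address must
// be 16-byte aligned.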
static INLINE void load_8bit_16x8(const uint8_t *const s,
                                  const ptrdiff_t stride, __m128i *const d) {
  d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride));
  d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride));
  d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride));
  d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride));
  d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride));
  d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride));
  d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride));
  d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride));
}
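// Load 16 bytes from each of 4 rows of s into d[0..3] (unaligned loads).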
static INLINE void loadu_8bit_16x4(const uint8_t *const s,
                                   const ptrdiff_t stride, __m128i *const d) {
  d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride));
  d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride));
  d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride));
  d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride));
}
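// Load 16 bytes from each of 8 rows of s into d[0..7] (unaligned loads).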
static INLINE void loadu_8bit_16x8(const uint8_t *const s,
                                   const ptrdiff_t stride, __m128i *const d) {
  loadu_8bit_16x4(s + 0 * stride, stride, &d[0]);
  loadu_8bit_16x4(s + 4 * stride, stride, &d[4]);
}
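// Store the high 64 bits of s to d. SSE2 provides _mm_storel_epi64 but no
// high-half counterpart, so reinterpret the vector as floats and use
// _mm_storeh_pi.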
static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) {
  _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s));
}
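// Store the low 4 bytes of s[0..3] to 4 rows of d.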
static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d,
                                  const ptrdiff_t stride) {
  *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]);
  *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]);
  *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]);
  *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]);
}
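// Store the 16 bytes of s as four 4-byte rows of d (bytes 0-3, 4-7, 8-11,
// 12-15).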
static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d,
                                       const ptrdiff_t stride) {
  __m128i ss[4];

  ss[0] = s;
  ss[1] = _mm_srli_si128(s, 4);
  ss[2] = _mm_srli_si128(s, 8);
  ss[3] = _mm_srli_si128(s, 12);
  store_8bit_4x4(ss, d, stride);
}
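// Store two 16-byte vectors as four 8-byte rows of d: the low then high half
// of s[0], followed by the low then high half of s[1].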
static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s,
                                            uint8_t *const d,
                                            const ptrdiff_t stride) {
  _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
  _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]);
  _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]);
  _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]);
}
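// Store the low 8 bytes of s[0..7] to 8 rows of d.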
static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d,
                                  const ptrdiff_t stride) {
  _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
  _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]);
  _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]);
  _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]);
  _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]);
  _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]);
  _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]);
  _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]);
}
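// Store the 16 bytes of s[0..3] to 4 rows of d (unaligned stores).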
static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d,
                                    const ptrdiff_t stride) {
  _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]);
  _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]);
  _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]);
  _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]);
}
#endif  // VPX_DSP_X86_MEM_SSE2_H_