/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_DSP_X86_MEM_SSE2_H_
#define VPX_DSP_X86_MEM_SSE2_H_

#include <emmintrin.h>  // SSE2
#include <stddef.h>     // ptrdiff_t

#include "./vpx_config.h"
#include "vpx/vpx_integer.h"  // uint8_t
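
// Load 4 bytes from each of 4 rows of s; each d[i] holds one row in its low
// 32 bits.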
static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride));
  d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride));
  d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride));
  d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride));
}
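
// Load 4 bytes from each of 8 rows of s into d[0..7].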
static INLINE void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  load_8bit_4x4(s + 0 * stride, stride, &d[0]);
  load_8bit_4x4(s + 4 * stride, stride, &d[4]);
}
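
// Load 8 bytes from each of 4 rows of s; each d[i] holds one row in its low
// 64 bits.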
static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride));
  d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride));
  d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride));
  d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride));
}
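
// Load 8 bytes from each of 8 rows of s into d[0..7].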
static INLINE void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  load_8bit_8x4(s + 0 * stride, stride, &d[0]);
  load_8bit_8x4(s + 4 * stride, stride, &d[4]);
}
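
// Load 16 bytes from each of 8 rows of s; every row address must be 16-byte
// aligned.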
static INLINE void load_8bit_16x8(const uint8_t *const s,
                                  const ptrdiff_t stride, __m128i *const d) {
  d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride));
  d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride));
  d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride));
  d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride));
  d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride));
  d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride));
  d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride));
  d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride));
}
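
// Load 16 bytes from each of 4 rows of s; no alignment requirement.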
static INLINE void loadu_8bit_16x4(const uint8_t *const s,
                                   const ptrdiff_t stride, __m128i *const d) {
  d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride));
  d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride));
  d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride));
  d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride));
}
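
// Load 16 bytes from each of 8 rows of s; no alignment requirement.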
static INLINE void loadu_8bit_16x8(const uint8_t *const s,
                                   const ptrdiff_t stride, __m128i *const d) {
  loadu_8bit_16x4(s + 0 * stride, stride, &d[0]);
  loadu_8bit_16x4(s + 4 * stride, stride, &d[4]);
}
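
// Store the upper 64 bits of s to d; the counterpart of _mm_storel_epi64.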
static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) {
  _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s));
}
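
// Store the low 4 bytes of s[0..3] to 4 successive rows of d.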
static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d,
                                  const ptrdiff_t stride) {
  *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]);
  *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]);
  *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]);
  *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]);
}
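
// Scatter one 16-byte register to 4 rows, 4 bytes per row: bytes 0-3 go to
// row 0, bytes 4-7 to row 1, and so on.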
static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d,
                                       const ptrdiff_t stride) {
  __m128i ss[4];

  ss[0] = s;
  ss[1] = _mm_srli_si128(s, 4);
  ss[2] = _mm_srli_si128(s, 8);
  ss[3] = _mm_srli_si128(s, 12);
  store_8bit_4x4(ss, d, stride);
}
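
// Store two 16-byte registers as 4 rows of 8 bytes: the low and high halves
// of s[0] go to rows 0 and 1, those of s[1] to rows 2 and 3.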
static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s,
                                            uint8_t *const d,
                                            const ptrdiff_t stride) {
  _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
  _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]);
  _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]);
  _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]);
}
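
// Store the low 8 bytes of s[0..7] to 8 successive rows of d.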
static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d,
                                  const ptrdiff_t stride) {
  _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
  _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]);
  _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]);
  _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]);
  _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]);
  _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]);
  _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]);
  _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]);
}
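
// Store 16 bytes from each of s[0..3] to 4 rows of d; no alignment
// requirement.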
static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d,
                                    const ptrdiff_t stride) {
  _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]);
  _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]);
  _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]);
  _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]);
}
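
// Illustrative sketch only: the loads and stores above pair up naturally. A
// hypothetical 4x4 block copy (this helper is not part of the header) could
// be written as follows.
static INLINE void copy_8bit_4x4_example(const uint8_t *const src,
                                         const ptrdiff_t src_stride,
                                         uint8_t *const dst,
                                         const ptrdiff_t dst_stride) {
  __m128i d[4];
  load_8bit_4x4(src, src_stride, d);   // gather 4 rows of 4 bytes each
  store_8bit_4x4(d, dst, dst_stride);  // scatter them to the destination
}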

#endif  // VPX_DSP_X86_MEM_SSE2_H_