/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11 
12 #ifndef AOM_AV1_COMMON_X86_CFL_SIMD_H_
13 #define AOM_AV1_COMMON_X86_CFL_SIMD_H_
14 
15 #include "av1/common/blockd.h"
16 
17 // SSSE3 version is optimal for with == 4, we reuse them in AVX2
18 void subsample_lbd_420_4x4_ssse3(const uint8_t *input, int input_stride,
19                                  uint16_t *output_q3);
20 void subsample_lbd_420_4x8_ssse3(const uint8_t *input, int input_stride,
21                                  uint16_t *output_q3);
22 void subsample_lbd_420_4x16_ssse3(const uint8_t *input, int input_stride,
23                                   uint16_t *output_q3);
24 
25 // SSSE3 version is optimal for with == 8, we reuse it in AVX2
26 void subsample_lbd_420_8x4_ssse3(const uint8_t *input, int input_stride,
27                                  uint16_t *output_q3);
28 void subsample_lbd_420_8x8_ssse3(const uint8_t *input, int input_stride,
29                                  uint16_t *output_q3);
30 void subsample_lbd_420_8x16_ssse3(const uint8_t *input, int input_stride,
31                                   uint16_t *output_q3);
32 void subsample_lbd_420_8x32_ssse3(const uint8_t *input, int input_stride,
33                                   uint16_t *output_q3);
34 
35 // SSSE3 version is optimal for with == 16, we reuse it in AVX2
36 void subsample_lbd_420_16x4_ssse3(const uint8_t *input, int input_stride,
37                                   uint16_t *output_q3);
38 void subsample_lbd_420_16x8_ssse3(const uint8_t *input, int input_stride,
39                                   uint16_t *output_q3);
40 void subsample_lbd_420_16x16_ssse3(const uint8_t *input, int input_stride,
41                                    uint16_t *output_q3);
42 void subsample_lbd_420_16x32_ssse3(const uint8_t *input, int input_stride,
43                                    uint16_t *output_q3);
44 
45 // SSSE3 version is optimal for with == 4, we reuse them in AVX2
46 void subsample_lbd_422_4x4_ssse3(const uint8_t *input, int input_stride,
47                                  uint16_t *output_q3);
48 void subsample_lbd_422_4x8_ssse3(const uint8_t *input, int input_stride,
49                                  uint16_t *output_q3);
50 void subsample_lbd_422_4x16_ssse3(const uint8_t *input, int input_stride,
51                                   uint16_t *output_q3);
52 
53 // SSSE3 version is optimal for with == 8, we reuse it in AVX2
54 void subsample_lbd_422_8x4_ssse3(const uint8_t *input, int input_stride,
55                                  uint16_t *output_q3);
56 void subsample_lbd_422_8x8_ssse3(const uint8_t *input, int input_stride,
57                                  uint16_t *output_q3);
58 void subsample_lbd_422_8x16_ssse3(const uint8_t *input, int input_stride,
59                                   uint16_t *output_q3);
60 void subsample_lbd_422_8x32_ssse3(const uint8_t *input, int input_stride,
61                                   uint16_t *output_q3);
62 
63 // SSSE3 version is optimal for with == 16, we reuse it in AVX2
64 void subsample_lbd_422_16x4_ssse3(const uint8_t *input, int input_stride,
65                                   uint16_t *output_q3);
66 void subsample_lbd_422_16x8_ssse3(const uint8_t *input, int input_stride,
67                                   uint16_t *output_q3);
68 void subsample_lbd_422_16x16_ssse3(const uint8_t *input, int input_stride,
69                                    uint16_t *output_q3);
70 void subsample_lbd_422_16x32_ssse3(const uint8_t *input, int input_stride,
71                                    uint16_t *output_q3);
72 
73 // SSSE3 version is optimal for with == 4, we reuse them in AVX2
74 void subsample_lbd_444_4x4_ssse3(const uint8_t *input, int input_stride,
75                                  uint16_t *output_q3);
76 void subsample_lbd_444_4x8_ssse3(const uint8_t *input, int input_stride,
77                                  uint16_t *output_q3);
78 void subsample_lbd_444_4x16_ssse3(const uint8_t *input, int input_stride,
79                                   uint16_t *output_q3);
80 
81 // SSSE3 version is optimal for with == 8, we reuse it in AVX2
82 void subsample_lbd_444_8x4_ssse3(const uint8_t *input, int input_stride,
83                                  uint16_t *output_q3);
84 void subsample_lbd_444_8x8_ssse3(const uint8_t *input, int input_stride,
85                                  uint16_t *output_q3);
86 void subsample_lbd_444_8x16_ssse3(const uint8_t *input, int input_stride,
87                                   uint16_t *output_q3);
88 void subsample_lbd_444_8x32_ssse3(const uint8_t *input, int input_stride,
89                                   uint16_t *output_q3);
90 
91 // SSSE3 version is optimal for with == 16, we reuse it in AVX2
92 void subsample_lbd_444_16x4_ssse3(const uint8_t *input, int input_stride,
93                                   uint16_t *output_q3);
94 void subsample_lbd_444_16x8_ssse3(const uint8_t *input, int input_stride,
95                                   uint16_t *output_q3);
96 void subsample_lbd_444_16x16_ssse3(const uint8_t *input, int input_stride,
97                                    uint16_t *output_q3);
98 void subsample_lbd_444_16x32_ssse3(const uint8_t *input, int input_stride,
99                                    uint16_t *output_q3);
100 
101 void subsample_hbd_420_4x4_ssse3(const uint16_t *input, int input_stride,
102                                  uint16_t *output_q3);
103 void subsample_hbd_420_4x8_ssse3(const uint16_t *input, int input_stride,
104                                  uint16_t *output_q3);
105 void subsample_hbd_420_4x16_ssse3(const uint16_t *input, int input_stride,
106                                   uint16_t *output_q3);
107 
108 // SSSE3 version is optimal for with == 8, we reuse it in AVX2
109 void subsample_hbd_420_8x4_ssse3(const uint16_t *input, int input_stride,
110                                  uint16_t *output_q3);
111 void subsample_hbd_420_8x8_ssse3(const uint16_t *input, int input_stride,
112                                  uint16_t *output_q3);
113 void subsample_hbd_420_8x16_ssse3(const uint16_t *input, int input_stride,
114                                   uint16_t *output_q3);
115 void subsample_hbd_420_8x32_ssse3(const uint16_t *input, int input_stride,
116                                   uint16_t *output_q3);
117 
118 // SSSE3 version is faster for with == 16, we reuse it in AVX2
119 void subsample_hbd_420_16x4_ssse3(const uint16_t *input, int input_stride,
120                                   uint16_t *output_q3);
121 void subsample_hbd_420_16x8_ssse3(const uint16_t *input, int input_stride,
122                                   uint16_t *output_q3);
123 void subsample_hbd_420_16x16_ssse3(const uint16_t *input, int input_stride,
124                                    uint16_t *output_q3);
125 void subsample_hbd_420_16x32_ssse3(const uint16_t *input, int input_stride,
126                                    uint16_t *output_q3);
127 
128 void subsample_hbd_422_4x4_ssse3(const uint16_t *input, int input_stride,
129                                  uint16_t *output_q3);
130 void subsample_hbd_422_4x8_ssse3(const uint16_t *input, int input_stride,
131                                  uint16_t *output_q3);
132 void subsample_hbd_422_4x16_ssse3(const uint16_t *input, int input_stride,
133                                   uint16_t *output_q3);
134 
135 // SSSE3 version is optimal for with == 8, we reuse it in AVX2
136 void subsample_hbd_422_8x4_ssse3(const uint16_t *input, int input_stride,
137                                  uint16_t *output_q3);
138 void subsample_hbd_422_8x8_ssse3(const uint16_t *input, int input_stride,
139                                  uint16_t *output_q3);
140 void subsample_hbd_422_8x16_ssse3(const uint16_t *input, int input_stride,
141                                   uint16_t *output_q3);
142 void subsample_hbd_422_8x32_ssse3(const uint16_t *input, int input_stride,
143                                   uint16_t *output_q3);
144 
145 // SSSE3 version is faster for with == 16, we reuse it in AVX2
146 void subsample_hbd_422_16x4_ssse3(const uint16_t *input, int input_stride,
147                                   uint16_t *output_q3);
148 void subsample_hbd_422_16x8_ssse3(const uint16_t *input, int input_stride,
149                                   uint16_t *output_q3);
150 void subsample_hbd_422_16x16_ssse3(const uint16_t *input, int input_stride,
151                                    uint16_t *output_q3);
152 void subsample_hbd_422_16x32_ssse3(const uint16_t *input, int input_stride,
153                                    uint16_t *output_q3);
154 
155 void subsample_hbd_444_4x4_ssse3(const uint16_t *input, int input_stride,
156                                  uint16_t *output_q3);
157 void subsample_hbd_444_4x8_ssse3(const uint16_t *input, int input_stride,
158                                  uint16_t *output_q3);
159 void subsample_hbd_444_4x16_ssse3(const uint16_t *input, int input_stride,
160                                   uint16_t *output_q3);
161 
162 // SSSE3 version is optimal for with == 8, we reuse it in AVX2
163 void subsample_hbd_444_8x4_ssse3(const uint16_t *input, int input_stride,
164                                  uint16_t *output_q3);
165 void subsample_hbd_444_8x8_ssse3(const uint16_t *input, int input_stride,
166                                  uint16_t *output_q3);
167 void subsample_hbd_444_8x16_ssse3(const uint16_t *input, int input_stride,
168                                   uint16_t *output_q3);
169 void subsample_hbd_444_8x32_ssse3(const uint16_t *input, int input_stride,
170                                   uint16_t *output_q3);
171 
172 // SSSE3 version is faster for with == 16, we reuse it in AVX2
173 void subsample_hbd_444_16x4_ssse3(const uint16_t *input, int input_stride,
174                                   uint16_t *output_q3);
175 void subsample_hbd_444_16x8_ssse3(const uint16_t *input, int input_stride,
176                                   uint16_t *output_q3);
177 void subsample_hbd_444_16x16_ssse3(const uint16_t *input, int input_stride,
178                                    uint16_t *output_q3);
179 void subsample_hbd_444_16x32_ssse3(const uint16_t *input, int input_stride,
180                                    uint16_t *output_q3);
181 
182 // SSE2 version is optimal for with == 4, we reuse them in AVX2
183 void subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst);
184 void subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst);
185 void subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst);
186 
187 // SSE2 version is optimal for with == 8, we reuse them in AVX2
188 void subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst);
189 void subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst);
190 void subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst);
191 void subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst);
192 
193 void predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
194                            int dst_stride, int alpha_q3);
195 void predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
196                            int dst_stride, int alpha_q3);
197 void predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
198                             int dst_stride, int alpha_q3);
199 
200 void predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
201                            int dst_stride, int alpha_q3);
202 void predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
203                            int dst_stride, int alpha_q3);
204 void predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
205                             int dst_stride, int alpha_q3);
206 void predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
207                             int dst_stride, int alpha_q3);
208 
209 void predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
210                             int dst_stride, int alpha_q3);
211 void predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
212                             int dst_stride, int alpha_q3);
213 void predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
214                              int dst_stride, int alpha_q3);
215 void predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
216                              int dst_stride, int alpha_q3);
217 
218 void predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
219                            int dst_stride, int alpha_q3, int bd);
220 void predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
221                            int dst_stride, int alpha_q3, int bd);
222 void predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
223                             int dst_stride, int alpha_q3, int bd);
224 
225 void predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
226                            int dst_stride, int alpha_q3, int bd);
227 void predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
228                            int dst_stride, int alpha_q3, int bd);
229 void predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
230                             int dst_stride, int alpha_q3, int bd);
231 void predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
232                             int dst_stride, int alpha_q3, int bd);
233 
234 void predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
235                             int dst_stride, int alpha_q3, int bd);
236 void predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
237                             int dst_stride, int alpha_q3, int bd);
238 void predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
239                              int dst_stride, int alpha_q3, int bd);
240 void predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
241                              int dst_stride, int alpha_q3, int bd);
242 
243 #endif  // AOM_AV1_COMMON_X86_CFL_SIMD_H_
244