/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AV1_COMMON_X86_CFL_SIMD_H_
#define AOM_AV1_COMMON_X86_CFL_SIMD_H_

#include "av1/common/blockd.h"

// The SSSE3 version is optimal for width == 4; we reuse it in AVX2.
void subsample_lbd_420_4x4_ssse3(const uint8_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_lbd_420_4x8_ssse3(const uint8_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_lbd_420_4x16_ssse3(const uint8_t *input, int input_stride,
                                  uint16_t *output_q3);

// The SSSE3 version is optimal for width == 8; we reuse it in AVX2.
void subsample_lbd_420_8x4_ssse3(const uint8_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_lbd_420_8x8_ssse3(const uint8_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_lbd_420_8x16_ssse3(const uint8_t *input, int input_stride,
                                  uint16_t *output_q3);
void subsample_lbd_420_8x32_ssse3(const uint8_t *input, int input_stride,
                                  uint16_t *output_q3);

// The SSSE3 version is optimal for width == 16; we reuse it in AVX2.
void subsample_lbd_420_16x4_ssse3(const uint8_t *input, int input_stride,
                                  uint16_t *output_q3);
void subsample_lbd_420_16x8_ssse3(const uint8_t *input, int input_stride,
                                  uint16_t *output_q3);
void subsample_lbd_420_16x16_ssse3(const uint8_t *input, int input_stride,
                                   uint16_t *output_q3);
void subsample_lbd_420_16x32_ssse3(const uint8_t *input, int input_stride,
                                   uint16_t *output_q3);
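
// The reuse noted in the comments above can be made concrete: an AVX2 build
// keeps a per-size table of subsampling kernels and points the entries for
// width <= 16 back at the SSSE3 versions, adding AVX2 code only for wider
// blocks. A minimal sketch of that dispatch, using hypothetical names and
// only the 4:2:0 low-bit-depth kernels declared above:
typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
                                     uint16_t *output_q3);

static inline cfl_subsample_lbd_fn example_lbd_420_kernel_avx2(int width,
                                                               int height) {
  if (width == 4) {
    if (height == 4) return subsample_lbd_420_4x4_ssse3;
    if (height == 8) return subsample_lbd_420_4x8_ssse3;
    return subsample_lbd_420_4x16_ssse3;
  }
  if (width == 8) {
    if (height == 4) return subsample_lbd_420_8x4_ssse3;
    if (height == 8) return subsample_lbd_420_8x8_ssse3;
    if (height == 16) return subsample_lbd_420_8x16_ssse3;
    return subsample_lbd_420_8x32_ssse3;
  }
  // width == 16 is still served by the SSSE3 kernels; an AVX2 build would
  // add its own kernels only for 32-wide blocks (not declared here).
  if (height == 4) return subsample_lbd_420_16x4_ssse3;
  if (height == 8) return subsample_lbd_420_16x8_ssse3;
  if (height == 16) return subsample_lbd_420_16x16_ssse3;
  return subsample_lbd_420_16x32_ssse3;
}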

// The SSSE3 version is optimal for width == 4; we reuse it in AVX2.
void subsample_lbd_422_4x4_ssse3(const uint8_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_lbd_422_4x8_ssse3(const uint8_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_lbd_422_4x16_ssse3(const uint8_t *input, int input_stride,
                                  uint16_t *output_q3);

// The SSSE3 version is optimal for width == 8; we reuse it in AVX2.
void subsample_lbd_422_8x4_ssse3(const uint8_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_lbd_422_8x8_ssse3(const uint8_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_lbd_422_8x16_ssse3(const uint8_t *input, int input_stride,
                                  uint16_t *output_q3);
void subsample_lbd_422_8x32_ssse3(const uint8_t *input, int input_stride,
                                  uint16_t *output_q3);

// The SSSE3 version is optimal for width == 16; we reuse it in AVX2.
void subsample_lbd_422_16x4_ssse3(const uint8_t *input, int input_stride,
                                  uint16_t *output_q3);
void subsample_lbd_422_16x8_ssse3(const uint8_t *input, int input_stride,
                                  uint16_t *output_q3);
void subsample_lbd_422_16x16_ssse3(const uint8_t *input, int input_stride,
                                   uint16_t *output_q3);
void subsample_lbd_422_16x32_ssse3(const uint8_t *input, int input_stride,
                                   uint16_t *output_q3);

// The SSSE3 version is optimal for width == 4; we reuse it in AVX2.
void subsample_lbd_444_4x4_ssse3(const uint8_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_lbd_444_4x8_ssse3(const uint8_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_lbd_444_4x16_ssse3(const uint8_t *input, int input_stride,
                                  uint16_t *output_q3);

// The SSSE3 version is optimal for width == 8; we reuse it in AVX2.
void subsample_lbd_444_8x4_ssse3(const uint8_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_lbd_444_8x8_ssse3(const uint8_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_lbd_444_8x16_ssse3(const uint8_t *input, int input_stride,
                                  uint16_t *output_q3);
void subsample_lbd_444_8x32_ssse3(const uint8_t *input, int input_stride,
                                  uint16_t *output_q3);

// The SSSE3 version is optimal for width == 16; we reuse it in AVX2.
void subsample_lbd_444_16x4_ssse3(const uint8_t *input, int input_stride,
                                  uint16_t *output_q3);
void subsample_lbd_444_16x8_ssse3(const uint8_t *input, int input_stride,
                                  uint16_t *output_q3);
void subsample_lbd_444_16x16_ssse3(const uint8_t *input, int input_stride,
                                   uint16_t *output_q3);
void subsample_lbd_444_16x32_ssse3(const uint8_t *input, int input_stride,
                                   uint16_t *output_q3);

void subsample_hbd_420_4x4_ssse3(const uint16_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_hbd_420_4x8_ssse3(const uint16_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_hbd_420_4x16_ssse3(const uint16_t *input, int input_stride,
                                  uint16_t *output_q3);

// The SSSE3 version is optimal for width == 8; we reuse it in AVX2.
void subsample_hbd_420_8x4_ssse3(const uint16_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_hbd_420_8x8_ssse3(const uint16_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_hbd_420_8x16_ssse3(const uint16_t *input, int input_stride,
                                  uint16_t *output_q3);
void subsample_hbd_420_8x32_ssse3(const uint16_t *input, int input_stride,
                                  uint16_t *output_q3);

// The SSSE3 version is faster for width == 16; we reuse it in AVX2.
void subsample_hbd_420_16x4_ssse3(const uint16_t *input, int input_stride,
                                  uint16_t *output_q3);
void subsample_hbd_420_16x8_ssse3(const uint16_t *input, int input_stride,
                                  uint16_t *output_q3);
void subsample_hbd_420_16x16_ssse3(const uint16_t *input, int input_stride,
                                   uint16_t *output_q3);
void subsample_hbd_420_16x32_ssse3(const uint16_t *input, int input_stride,
                                   uint16_t *output_q3);

void subsample_hbd_422_4x4_ssse3(const uint16_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_hbd_422_4x8_ssse3(const uint16_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_hbd_422_4x16_ssse3(const uint16_t *input, int input_stride,
                                  uint16_t *output_q3);

// The SSSE3 version is optimal for width == 8; we reuse it in AVX2.
void subsample_hbd_422_8x4_ssse3(const uint16_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_hbd_422_8x8_ssse3(const uint16_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_hbd_422_8x16_ssse3(const uint16_t *input, int input_stride,
                                  uint16_t *output_q3);
void subsample_hbd_422_8x32_ssse3(const uint16_t *input, int input_stride,
                                  uint16_t *output_q3);

// The SSSE3 version is faster for width == 16; we reuse it in AVX2.
void subsample_hbd_422_16x4_ssse3(const uint16_t *input, int input_stride,
                                  uint16_t *output_q3);
void subsample_hbd_422_16x8_ssse3(const uint16_t *input, int input_stride,
                                  uint16_t *output_q3);
void subsample_hbd_422_16x16_ssse3(const uint16_t *input, int input_stride,
                                   uint16_t *output_q3);
void subsample_hbd_422_16x32_ssse3(const uint16_t *input, int input_stride,
                                   uint16_t *output_q3);

void subsample_hbd_444_4x4_ssse3(const uint16_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_hbd_444_4x8_ssse3(const uint16_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_hbd_444_4x16_ssse3(const uint16_t *input, int input_stride,
                                  uint16_t *output_q3);

// The SSSE3 version is optimal for width == 8; we reuse it in AVX2.
void subsample_hbd_444_8x4_ssse3(const uint16_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_hbd_444_8x8_ssse3(const uint16_t *input, int input_stride,
                                 uint16_t *output_q3);
void subsample_hbd_444_8x16_ssse3(const uint16_t *input, int input_stride,
                                  uint16_t *output_q3);
void subsample_hbd_444_8x32_ssse3(const uint16_t *input, int input_stride,
                                  uint16_t *output_q3);

// The SSSE3 version is faster for width == 16; we reuse it in AVX2.
void subsample_hbd_444_16x4_ssse3(const uint16_t *input, int input_stride,
                                  uint16_t *output_q3);
void subsample_hbd_444_16x8_ssse3(const uint16_t *input, int input_stride,
                                  uint16_t *output_q3);
void subsample_hbd_444_16x16_ssse3(const uint16_t *input, int input_stride,
                                   uint16_t *output_q3);
void subsample_hbd_444_16x32_ssse3(const uint16_t *input, int input_stride,
                                   uint16_t *output_q3);

// The SSE2 version is optimal for width == 4; we reuse it in AVX2.
void subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst);
void subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst);
void subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst);

// The SSE2 version is optimal for width == 8; we reuse it in AVX2.
void subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst);
void subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst);
void subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst);
void subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst);
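
// subtract_average removes the rounded block average from the reconstructed
// luma Q3 buffer, leaving the zero-mean "AC" values that CfL scales. A
// scalar reference sketch of what the SIMD kernels compute (illustrative
// only; it assumes the CFL_BUF_LINE buffer stride from blockd.h and a
// power-of-two block size):
static inline void example_subtract_average_c(const uint16_t *src,
                                              int16_t *dst, int width,
                                              int height) {
  const int num_pel = width * height;
  int sum_q3 = 0;
  for (int j = 0; j < height; j++) {
    for (int i = 0; i < width; i++) sum_q3 += src[j * CFL_BUF_LINE + i];
  }
  // Rounded average; for power-of-two pixel counts this equals a rounding
  // right shift by log2(num_pel).
  const int avg_q3 = (sum_q3 + (num_pel >> 1)) / num_pel;
  for (int j = 0; j < height; j++) {
    for (int i = 0; i < width; i++) {
      dst[j * CFL_BUF_LINE + i] =
          (int16_t)(src[j * CFL_BUF_LINE + i] - avg_q3);
    }
  }
}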

void predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                           int dst_stride, int alpha_q3);
void predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                           int dst_stride, int alpha_q3);
void predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                            int dst_stride, int alpha_q3);

void predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                           int dst_stride, int alpha_q3);
void predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                           int dst_stride, int alpha_q3);
void predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                            int dst_stride, int alpha_q3);
void predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                            int dst_stride, int alpha_q3);

void predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                            int dst_stride, int alpha_q3);
void predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                            int dst_stride, int alpha_q3);
void predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                             int dst_stride, int alpha_q3);
void predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                             int dst_stride, int alpha_q3);

void predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                           int dst_stride, int alpha_q3, int bd);
void predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                           int dst_stride, int alpha_q3, int bd);
void predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                            int dst_stride, int alpha_q3, int bd);

void predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                           int dst_stride, int alpha_q3, int bd);
void predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                           int dst_stride, int alpha_q3, int bd);
void predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                            int dst_stride, int alpha_q3, int bd);
void predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                            int dst_stride, int alpha_q3, int bd);

void predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                            int dst_stride, int alpha_q3, int bd);
void predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                            int dst_stride, int alpha_q3, int bd);
void predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                             int dst_stride, int alpha_q3, int bd);
void predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                             int dst_stride, int alpha_q3, int bd);
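
// Putting the pieces together, in kernel order, for one 4x4 low-bit-depth
// block. This is an illustrative sketch only: the function and buffer names
// are hypothetical, the Q3 buffers use the CFL_BUF_LINE stride assumed by
// the kernels, and `chroma` is assumed to already hold the DC intra
// prediction, to which predict_lbd adds the alpha_q3-scaled AC contribution.
static inline void example_cfl_predict_4x4_lbd(const uint8_t *luma,
                                               int luma_stride,
                                               uint8_t *chroma,
                                               int chroma_stride,
                                               int alpha_q3) {
  uint16_t recon_q3[CFL_BUF_SQUARE];  // subsampled luma, Q3
  int16_t ac_q3[CFL_BUF_SQUARE];      // zero-mean AC contribution, Q3
  subsample_lbd_420_4x4_ssse3(luma, luma_stride, recon_q3);
  subtract_average_4x4_sse2(recon_q3, ac_q3);
  predict_lbd_4x4_ssse3(ac_q3, chroma, chroma_stride, alpha_q3);
}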

#endif  // AOM_AV1_COMMON_X86_CFL_SIMD_H_