/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <immintrin.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"

/* FSAD64_H(h) expands to vpx_sad64x<h>_avx2(): the sum of absolute
 * differences (SAD) over a 64x<h> block. Each row is read as two unaligned
 * 32-byte loads; _mm256_sad_epu8() produces four 64-bit partial sums per
 * register, which are accumulated across rows and reduced to a scalar after
 * the loop. */
#define FSAD64_H(h)                                                           \
  unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
                                    const uint8_t *ref_ptr, int ref_stride) { \
    int i, res;                                                               \
    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
    __m256i sum_sad = _mm256_setzero_si256();                                 \
    __m256i sum_sad_h;                                                        \
    __m128i sum_sad128;                                                       \
    for (i = 0; i < h; i++) {                                                 \
      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));         \
      sad1_reg = _mm256_sad_epu8(                                             \
          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
      sad2_reg = _mm256_sad_epu8(                                             \
          ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));     \
      sum_sad =                                                               \
          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
      ref_ptr += ref_stride;                                                  \
      src_ptr += src_stride;                                                  \
    }                                                                         \
    /* Fold the upper 8 bytes of each 128-bit lane onto the lower 8, then    \
       add the high lane to the low lane and extract the 32-bit result. */   \
    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
    res = _mm_cvtsi128_si32(sum_sad128);                                      \
    return res;                                                               \
  }

/* FSAD32_H(h) expands to vpx_sad32x<h>_avx2(). Two 32-byte rows are
 * processed per iteration, so the loop runs h/2 times with doubled strides;
 * the final reduction is the same as in FSAD64_H. */
#define FSAD32_H(h)                                                           \
  unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
                                    const uint8_t *ref_ptr, int ref_stride) { \
    int i, res;                                                               \
    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
    __m256i sum_sad = _mm256_setzero_si256();                                 \
    __m256i sum_sad_h;                                                        \
    __m128i sum_sad128;                                                       \
    int ref2_stride = ref_stride << 1;                                        \
    int src2_stride = src_stride << 1;                                        \
    int max = h >> 1;                                                         \
    for (i = 0; i < max; i++) {                                               \
      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
      sad1_reg = _mm256_sad_epu8(                                             \
          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
      sad2_reg = _mm256_sad_epu8(                                             \
          ref2_reg,                                                           \
          _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));       \
      sum_sad =                                                               \
          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
      ref_ptr += ref2_stride;                                                 \
      src_ptr += src2_stride;                                                 \
    }                                                                         \
    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
    res = _mm_cvtsi128_si32(sum_sad128);                                      \
    return res;                                                               \
  }

#define FSAD64  \
  FSAD64_H(64); \
  FSAD64_H(32);

#define FSAD32  \
  FSAD32_H(64); \
  FSAD32_H(32); \
  FSAD32_H(16);

/* Instantiate vpx_sad64x{64,32}_avx2() and vpx_sad32x{64,32,16}_avx2(). */
FSAD64;
FSAD32;

#undef FSAD64
#undef FSAD32
#undef FSAD64_H
#undef FSAD32_H

/* FSADAVG64_H(h) expands to vpx_sad64x<h>_avg_avx2(): the SAD between src
 * and the rounding average of ref and second_pred (_mm256_avg_epu8), as used
 * for compound prediction. second_pred is a contiguous 64-wide block, so it
 * advances 64 bytes per row. */
#define FSADAVG64_H(h)                                                        \
  unsigned int vpx_sad64x##h##_avg_avx2(                                      \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    int i, res;                                                               \
    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
    __m256i sum_sad = _mm256_setzero_si256();                                 \
    __m256i sum_sad_h;                                                        \
    __m128i sum_sad128;                                                       \
    for (i = 0; i < h; i++) {                                                 \
      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));         \
      ref1_reg = _mm256_avg_epu8(                                             \
          ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));        \
      ref2_reg = _mm256_avg_epu8(                                             \
          ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
      sad1_reg = _mm256_sad_epu8(                                             \
          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
      sad2_reg = _mm256_sad_epu8(                                             \
          ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));     \
      sum_sad =                                                               \
          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
      ref_ptr += ref_stride;                                                  \
      src_ptr += src_stride;                                                  \
      second_pred += 64;                                                      \
    }                                                                         \
    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
    res = _mm_cvtsi128_si32(sum_sad128);                                      \
    return res;                                                               \
  }

/* FSADAVG32_H(h) expands to vpx_sad32x<h>_avg_avx2(): the averaging variant
 * of FSAD32_H. Two rows are processed per iteration, and the contiguous
 * 32-wide second_pred advances 64 bytes per pair of rows. */
#define FSADAVG32_H(h)                                                        \
  unsigned int vpx_sad32x##h##_avg_avx2(                                      \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    int i, res;                                                               \
    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
    __m256i sum_sad = _mm256_setzero_si256();                                 \
    __m256i sum_sad_h;                                                        \
    __m128i sum_sad128;                                                       \
    int ref2_stride = ref_stride << 1;                                        \
    int src2_stride = src_stride << 1;                                        \
    int max = h >> 1;                                                         \
    for (i = 0; i < max; i++) {                                               \
      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
      ref1_reg = _mm256_avg_epu8(                                             \
          ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));        \
      ref2_reg = _mm256_avg_epu8(                                             \
          ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
      sad1_reg = _mm256_sad_epu8(                                             \
          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
      sad2_reg = _mm256_sad_epu8(                                             \
          ref2_reg,                                                           \
          _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));       \
      sum_sad =                                                               \
          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
      ref_ptr += ref2_stride;                                                 \
      src_ptr += src2_stride;                                                 \
      second_pred += 64;                                                      \
    }                                                                         \
    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
    res = _mm_cvtsi128_si32(sum_sad128);                                      \
    return res;                                                               \
  }

#define FSADAVG64  \
  FSADAVG64_H(64); \
  FSADAVG64_H(32);

#define FSADAVG32  \
  FSADAVG32_H(64); \
  FSADAVG32_H(32); \
  FSADAVG32_H(16);

/* Instantiate the _avg_ variants for the same block sizes. */
FSADAVG64;
FSADAVG32;

#undef FSADAVG64
#undef FSADAVG32
#undef FSADAVG64_H
#undef FSADAVG32_H
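
/* Illustrative usage sketch (not part of libvpx): in-tree, these kernels are
 * normally reached through the run-time CPU dispatch generated by
 * vpx_dsp_rtcd.h (e.g. the vpx_sad64x64() function pointer) rather than
 * called directly. The standalone check below compares one AVX2 kernel
 * against a hypothetical scalar_sad() reference; the buffers and strides are
 * assumptions for the example. It is kept under #if 0 so it does not add a
 * main() to the library build; flipping the guard would compile it in this
 * translation unit, where vpx_sad64x64_avx2() is already defined above. */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Plain scalar SAD used only as a cross-check. */
static unsigned int scalar_sad(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride, int width,
                               int height) {
  unsigned int sad = 0;
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) sad += (unsigned int)abs(src[x] - ref[x]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}

int main(void) {
  enum { kStride = 64, kSize = 64 * 64 };
  static uint8_t src[kSize], ref[kSize];
  int i;
  for (i = 0; i < kSize; ++i) {
    src[i] = (uint8_t)(rand() & 0xff);
    ref[i] = (uint8_t)(rand() & 0xff);
  }
  printf("avx2=%u scalar=%u\n", vpx_sad64x64_avx2(src, kStride, ref, kStride),
         scalar_sad(src, kStride, ref, kStride, 64, 64));
  return 0;
}
#endif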