/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <immintrin.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"

/* FSAD64_H(h) expands to vpx_sad64x<h>_avx2(), the sum of absolute
 * differences over a 64x<h> block. Each row is read as two 32-byte
 * unaligned loads; _mm256_sad_epu8 yields four 64-bit partial sums per
 * 256-bit register, which are accumulated and folded to 32 bits below.
 */
#define FSAD64_H(h)                                                          \
  unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride,  \
                                    const uint8_t *ref_ptr, int ref_stride) { \
    int i;                                                                   \
    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                          \
    __m256i sum_sad = _mm256_setzero_si256();                                \
    __m256i sum_sad_h;                                                       \
    __m128i sum_sad128;                                                      \
    for (i = 0; i < h; i++) {                                                \
      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);               \
      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));        \
      sad1_reg = _mm256_sad_epu8(                                            \
          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));           \
      sad2_reg = _mm256_sad_epu8(                                            \
          ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));    \
      sum_sad =                                                              \
          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));   \
      ref_ptr += ref_stride;                                                 \
      src_ptr += src_stride;                                                 \
    }                                                                        \
    /* Add upper and lower 64-bit halves of each lane, then the two lanes. */ \
    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                               \
    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                          \
    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                       \
    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
    return (unsigned int)_mm_cvtsi128_si32(sum_sad128);                      \
  }

/* FSAD32_H(h) expands to vpx_sad32x<h>_avx2(). Two 32-byte rows are
 * processed per iteration, so the loop runs h/2 times. */
#define FSAD32_H(h)                                                          \
  unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride,  \
                                    const uint8_t *ref_ptr, int ref_stride) { \
    int i;                                                                   \
    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                          \
    __m256i sum_sad = _mm256_setzero_si256();                                \
    __m256i sum_sad_h;                                                       \
    __m128i sum_sad128;                                                      \
    int ref2_stride = ref_stride << 1;                                       \
    int src2_stride = src_stride << 1;                                       \
    int max = h >> 1;                                                        \
    for (i = 0; i < max; i++) {                                              \
      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);               \
      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
      sad1_reg = _mm256_sad_epu8(                                            \
          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));           \
      sad2_reg = _mm256_sad_epu8(                                            \
          ref2_reg,                                                          \
          _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));      \
      sum_sad =                                                              \
          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));   \
      ref_ptr += ref2_stride;                                                \
      src_ptr += src2_stride;                                                \
    }                                                                        \
    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                               \
    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                          \
    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                       \
    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
    return (unsigned int)_mm_cvtsi128_si32(sum_sad128);                      \
  }

#define FSAD64 \
  FSAD64_H(64) \
  FSAD64_H(32)

#define FSAD32 \
  FSAD32_H(64) \
  FSAD32_H(32) \
  FSAD32_H(16)

FSAD64
FSAD32

#undef FSAD64
#undef FSAD32
#undef FSAD64_H
#undef FSAD32_H

/* FSADAVG64_H(h) expands to vpx_sad64x<h>_avg_avx2(), as FSAD64_H but with
 * the reference first averaged (rounding up) against a contiguous
 * 64-byte-wide second prediction block. */
#define FSADAVG64_H(h)                                                       \
  unsigned int vpx_sad64x##h##_avg_avx2(                                     \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
      int ref_stride, const uint8_t *second_pred) {                          \
    int i;                                                                   \
    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                          \
    __m256i sum_sad = _mm256_setzero_si256();                                \
    __m256i sum_sad_h;                                                       \
    __m128i sum_sad128;                                                      \
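    /* Average the ref rows with second_pred (rounding up) before the SAD. */ \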
    for (i = 0; i < h; i++) {                                                \
      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);               \
      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));        \
      ref1_reg = _mm256_avg_epu8(                                            \
          ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));       \
      ref2_reg = _mm256_avg_epu8(                                            \
          ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
      sad1_reg = _mm256_sad_epu8(                                            \
          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));           \
      sad2_reg = _mm256_sad_epu8(                                            \
          ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));    \
      sum_sad =                                                              \
          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));   \
      ref_ptr += ref_stride;                                                 \
      src_ptr += src_stride;                                                 \
      second_pred += 64;                                                     \
    }                                                                        \
    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                               \
    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                          \
    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                       \
    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
    return (unsigned int)_mm_cvtsi128_si32(sum_sad128);                      \
  }

/* FSADAVG32_H(h) expands to vpx_sad32x<h>_avg_avx2(); two 32-byte rows per
 * iteration consume 64 bytes of the contiguous second prediction block. */
#define FSADAVG32_H(h)                                                       \
  unsigned int vpx_sad32x##h##_avg_avx2(                                     \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
      int ref_stride, const uint8_t *second_pred) {                          \
    int i;                                                                   \
    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                          \
    __m256i sum_sad = _mm256_setzero_si256();                                \
    __m256i sum_sad_h;                                                       \
    __m128i sum_sad128;                                                      \
    int ref2_stride = ref_stride << 1;                                       \
    int src2_stride = src_stride << 1;                                       \
    int max = h >> 1;                                                        \
    for (i = 0; i < max; i++) {                                              \
      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);               \
      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
      ref1_reg = _mm256_avg_epu8(                                            \
          ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));       \
      ref2_reg = _mm256_avg_epu8(                                            \
          ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
      sad1_reg = _mm256_sad_epu8(                                            \
          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));           \
      sad2_reg = _mm256_sad_epu8(                                            \
          ref2_reg,                                                          \
          _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));      \
      sum_sad =                                                              \
          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));   \
      ref_ptr += ref2_stride;                                                \
      src_ptr += src2_stride;                                                \
      second_pred += 64;                                                     \
    }                                                                        \
    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                               \
    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                          \
    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                       \
    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
    return (unsigned int)_mm_cvtsi128_si32(sum_sad128);                      \
  }

#define FSADAVG64 \
  FSADAVG64_H(64) \
  FSADAVG64_H(32)

#define FSADAVG32 \
  FSADAVG32_H(64) \
  FSADAVG32_H(32) \
  FSADAVG32_H(16)

FSADAVG64
FSADAVG32

#undef FSADAVG64
#undef FSADAVG32
#undef FSADAVG64_H
#undef FSADAVG32_H
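
/* A minimal scalar reference, kept under #if 0 so it never enters the
 * build: an illustration (not part of the upstream file) of what the AVX2
 * kernels above compute. vpx_sad64x64_avx2(src, ss, ref, rs) should match
 * sad_ref(src, ss, ref, rs, 64, 64), and the _avg_ variants should match
 * after each reference row is replaced by its rounding average with the
 * contiguous second_pred block of the same width.
 */
#if 0
static unsigned int sad_ref(const uint8_t *src, int src_stride,
                            const uint8_t *ref, int ref_stride, int width,
                            int height) {
  int x, y;
  unsigned int sad = 0;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x)
      sad += (src[x] > ref[x]) ? (unsigned int)(src[x] - ref[x])
                               : (unsigned int)(ref[x] - src[x]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}
#endif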