1 /*
2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
#include <assert.h>
#include <string.h>
#include <emmintrin.h>  // SSE2
#include <smmintrin.h>  /* SSE4.1 */
#include <immintrin.h>  /* AVX2 */

#include "aom/aom_integer.h"
#include "aom_dsp/x86/mem_sse2.h"
#include "av1/common/av1_common_int.h"
#include "av1/common/txb_common.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/synonyms_avx2.h"
23
av1_txb_init_levels_avx2(const tran_low_t * const coeff,const int width,const int height,uint8_t * const levels)24 void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width,
25 const int height, uint8_t *const levels) {
26 const int stride = width + TX_PAD_HOR;
27 const __m256i y_zeros = _mm256_setzero_si256();
28
29 const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
30 uint8_t *bottom_buf_end = levels + (height + TX_PAD_BOTTOM) * stride;
31 uint8_t *bottom_buf = bottom_buf_end - ((bottom_len + 31) & (~31));
32
33 do {
34 yy_storeu_256(bottom_buf, y_zeros);
35 bottom_buf += 32;
36 } while (bottom_buf < bottom_buf_end);
37
38 int i = 0;
39 uint8_t *ls = levels;
40 const tran_low_t *cf = coeff;
41 if (width == 4) {
42 do {
43 const __m256i c0 = yy_loadu_256(cf);
44 const __m256i c1 = yy_loadu_256(cf + 8);
45 const __m256i abs01 = _mm256_abs_epi16(_mm256_packs_epi32(c0, c1));
46 const __m256i abs01_8 = _mm256_packs_epi16(abs01, y_zeros);
47 const __m256i res_ = _mm256_shuffle_epi32(abs01_8, 0xd8);
48 const __m256i res = _mm256_permute4x64_epi64(res_, 0xd8);
49 yy_storeu_256(ls, res);
50 ls += 32;
51 cf += 16;
52 i += 4;
53 } while (i < height);
54 } else if (width == 8) {
55 do {
56 const __m256i coeffA = yy_loadu_256(cf);
57 const __m256i coeffB = yy_loadu_256(cf + 8);
58 const __m256i coeffC = yy_loadu_256(cf + 16);
59 const __m256i coeffD = yy_loadu_256(cf + 24);
60 const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
61 const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
62 const __m256i absAB = _mm256_abs_epi16(coeffAB);
63 const __m256i absCD = _mm256_abs_epi16(coeffCD);
64 const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
65 const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
66 const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
67 const __m128i res0 = _mm256_castsi256_si128(res);
68 const __m128i res1 = _mm256_extracti128_si256(res, 1);
69 xx_storel_64(ls, res0);
70 *(int32_t *)(ls + width) = 0;
71 xx_storel_64(ls + stride, _mm_srli_si128(res0, 8));
72 *(int32_t *)(ls + width + stride) = 0;
73 xx_storel_64(ls + stride * 2, res1);
74 *(int32_t *)(ls + width + stride * 2) = 0;
75 xx_storel_64(ls + stride * 3, _mm_srli_si128(res1, 8));
76 *(int32_t *)(ls + width + stride * 3) = 0;
77 cf += 32;
78 ls += stride << 2;
79 i += 4;
80 } while (i < height);
81 } else if (width == 16) {
82 do {
83 const __m256i coeffA = yy_loadu_256(cf);
84 const __m256i coeffB = yy_loadu_256(cf + 8);
85 const __m256i coeffC = yy_loadu_256(cf + 16);
86 const __m256i coeffD = yy_loadu_256(cf + 24);
87 const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
88 const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
89 const __m256i absAB = _mm256_abs_epi16(coeffAB);
90 const __m256i absCD = _mm256_abs_epi16(coeffCD);
91 const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
92 const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
93 const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
94 xx_storeu_128(ls, _mm256_castsi256_si128(res));
95 xx_storeu_128(ls + stride, _mm256_extracti128_si256(res, 1));
96 cf += 32;
97 *(int32_t *)(ls + width) = 0;
98 *(int32_t *)(ls + stride + width) = 0;
99 ls += stride << 1;
100 i += 2;
101 } while (i < height);
102 } else {
103 do {
104 const __m256i coeffA = yy_loadu_256(cf);
105 const __m256i coeffB = yy_loadu_256(cf + 8);
106 const __m256i coeffC = yy_loadu_256(cf + 16);
107 const __m256i coeffD = yy_loadu_256(cf + 24);
108 const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
109 const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
110 const __m256i absAB = _mm256_abs_epi16(coeffAB);
111 const __m256i absCD = _mm256_abs_epi16(coeffCD);
112 const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
113 const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
114 const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
115 yy_storeu_256(ls, res);
116 cf += 32;
117 *(int32_t *)(ls + width) = 0;
118 ls += stride;
119 i += 1;
120 } while (i < height);
121 }
122 }
123