1 /*
2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
#include <assert.h>
#include <emmintrin.h> // SSE2
#include <smmintrin.h> /* SSE4.1 */
#include <string.h>

#include "aom/aom_integer.h"
#include "av1/common/onyxc_int.h"
#include "av1/common/txb_common.h"
#include "aom_dsp/x86/synonyms.h"
20
av1_txb_init_levels_sse4_1(const tran_low_t * const coeff,const int width,const int height,uint8_t * const levels)21 void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width,
22 const int height, uint8_t *const levels) {
23 const int stride = width + TX_PAD_HOR;
24 const __m128i zeros = _mm_setzero_si128();
25
26 const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
27 uint8_t *bottom_buf = levels + stride * height;
28 uint8_t *bottom_buf_end = bottom_buf + bottom_len;
29 do {
30 _mm_storeu_si128((__m128i *)(bottom_buf), zeros);
31 bottom_buf += 16;
32 } while (bottom_buf < bottom_buf_end);
33
34 int i = 0;
35 uint8_t *ls = levels;
36 const tran_low_t *cf = coeff;
37 if (width == 4) {
38 do {
39 const __m128i coeffA = xx_loadu_128(cf);
40 const __m128i coeffB = xx_loadu_128(cf + 4);
41 const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
42 const __m128i absAB = _mm_abs_epi16(coeffAB);
43 const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
44 const __m128i lsAB = _mm_unpacklo_epi32(absAB8, zeros);
45 xx_storeu_128(ls, lsAB);
46 ls += (stride << 1);
47 cf += (width << 1);
48 i += 2;
49 } while (i < height);
50 } else if (width == 8) {
51 do {
52 const __m128i coeffA = xx_loadu_128(cf);
53 const __m128i coeffB = xx_loadu_128(cf + 4);
54 const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
55 const __m128i absAB = _mm_abs_epi16(coeffAB);
56 const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
57 xx_storeu_128(ls, absAB8);
58 ls += stride;
59 cf += width;
60 i += 1;
61 } while (i < height);
62 } else {
63 do {
64 int j = 0;
65 do {
66 const __m128i coeffA = xx_loadu_128(cf);
67 const __m128i coeffB = xx_loadu_128(cf + 4);
68 const __m128i coeffC = xx_loadu_128(cf + 8);
69 const __m128i coeffD = xx_loadu_128(cf + 12);
70 const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
71 const __m128i coeffCD = _mm_packs_epi32(coeffC, coeffD);
72 const __m128i absAB = _mm_abs_epi16(coeffAB);
73 const __m128i absCD = _mm_abs_epi16(coeffCD);
74 const __m128i absABCD = _mm_packs_epi16(absAB, absCD);
75 xx_storeu_128(ls + j, absABCD);
76 j += 16;
77 cf += 16;
78 } while (j < width);
79 *(int32_t *)(ls + width) = 0;
80 ls += stride;
81 i += 1;
82 } while (i < height);
83 }
84 }
85