/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/sum_neon.h"

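// tran_low_t is 32 bits wide when CONFIG_VP9_HIGHBITDEPTH is enabled and 16
// bits wide otherwise, so the accumulated sum is extracted from the vector at
// the matching lane width.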
static INLINE tran_low_t get_lane(const int32x2_t a) {
#if CONFIG_VP9_HIGHBITDEPTH
  return vget_lane_s32(a, 0);
#else
  return vget_lane_s16(vreinterpret_s16_s32(a), 0);
#endif  // CONFIG_VP9_HIGHBITDEPTH
}

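// Compute only the DC (0, 0) coefficient of the 4x4 forward transform: the
// sum of all 16 input samples, left-shifted by 1, as in the scalar
// vpx_fdct4x4_1_c reference.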
void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) {
  int16x4_t a0, a1, a2, a3;
  int16x8_t b0, b1;
  int16x8_t c;
  int32x2_t d;

  a0 = vld1_s16(input);
  input += stride;
  a1 = vld1_s16(input);
  input += stride;
  a2 = vld1_s16(input);
  input += stride;
  a3 = vld1_s16(input);

  b0 = vcombine_s16(a0, a1);
  b1 = vcombine_s16(a2, a3);

  c = vaddq_s16(b0, b1);

  d = horizontal_add_int16x8(c);

  output[0] = get_lane(vshl_n_s32(d, 1));
  output[1] = 0;
}

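// DC-only 8x8 forward transform: output[0] is the plain sum of all 64 input
// samples, with no additional scaling.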
void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) {
  int r;
  int16x8_t sum = vld1q_s16(&input[0]);

  for (r = 1; r < 8; ++r) {
    const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
    sum = vaddq_s16(sum, input_00);
  }

  output[0] = get_lane(horizontal_add_int16x8(sum));
  output[1] = 0;
}

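// DC-only 16x16 forward transform: the sum of all 256 input samples,
// right-shifted by 1.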
void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output,
                          int stride) {
  int r;
  int16x8_t left = vld1q_s16(input);
  int16x8_t right = vld1q_s16(input + 8);
  int32x2_t sum;
  input += stride;

  for (r = 1; r < 16; ++r) {
    const int16x8_t a = vld1q_s16(input);
    const int16x8_t b = vld1q_s16(input + 8);
    input += stride;
    left = vaddq_s16(left, a);
    right = vaddq_s16(right, b);
  }

  sum = vadd_s32(horizontal_add_int16x8(left), horizontal_add_int16x8(right));

  output[0] = get_lane(vshr_n_s32(sum, 1));
  output[1] = 0;
}

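// DC-only 32x32 forward transform: the sum of all 1024 input samples,
// right-shifted by 3.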
void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
                          int stride) {
  int r;
  int16x8_t a0 = vld1q_s16(input);
  int16x8_t a1 = vld1q_s16(input + 8);
  int16x8_t a2 = vld1q_s16(input + 16);
  int16x8_t a3 = vld1q_s16(input + 24);
  int32x2_t sum;
  input += stride;

  for (r = 1; r < 32; ++r) {
    const int16x8_t b0 = vld1q_s16(input);
    const int16x8_t b1 = vld1q_s16(input + 8);
    const int16x8_t b2 = vld1q_s16(input + 16);
    const int16x8_t b3 = vld1q_s16(input + 24);
    input += stride;
    a0 = vaddq_s16(a0, b0);
    a1 = vaddq_s16(a1, b1);
    a2 = vaddq_s16(a2, b2);
    a3 = vaddq_s16(a3, b3);
  }

  sum = vadd_s32(horizontal_add_int16x8(a0), horizontal_add_int16x8(a1));
  sum = vadd_s32(sum, horizontal_add_int16x8(a2));
  sum = vadd_s32(sum, horizontal_add_int16x8(a3));
  output[0] = get_lane(vshr_n_s32(sum, 3));
  output[1] = 0;
}