/*
 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
10
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
13
/*
 * Helper passes used by the 16x16 inverse DCT entry points in this file.
 * They are only declared here; their definitions live elsewhere
 * (NOTE(review): presumably in the NEON assembly sources -- confirm).
 *
 * As described at the call sites in this file:
 *   *_pass1  processes the even-indexed elements (0, 2, ..., 14) and stores
 *            its stage 6 result in `output`.
 *   *_pass2  processes the odd-indexed elements (1, 3, ..., 15), combines
 *            them with `pass1Output` and produces the stage 7 result.
 *            Callers pass skip_adding == 0 when they want the result stored
 *            in `output`, and skip_adding == 1 when they want it added to
 *            `dest`.
 *            NOTE(review): that usage reads as the opposite of what the
 *            parameter name suggests -- confirm against the implementation
 *            before relying on the name.
 */
void vp9_idct16x16_256_add_neon_pass1(const int16_t *input,
                                      int16_t *output,
                                      int output_stride);
void vp9_idct16x16_256_add_neon_pass2(const int16_t *src,
                                      int16_t *output,
                                      int16_t *pass1Output,
                                      int16_t skip_adding,
                                      uint8_t *dest,
                                      int dest_stride);
void vp9_idct16x16_10_add_neon_pass1(const int16_t *input,
                                     int16_t *output,
                                     int output_stride);
void vp9_idct16x16_10_add_neon_pass2(const int16_t *src,
                                     int16_t *output,
                                     int16_t *pass1Output,
                                     int16_t skip_adding,
                                     uint8_t *dest,
                                     int dest_stride);

/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
/* Callers in this file hand both routines the same int64_t[8] scratch array. */
extern void vp9_push_neon(int64_t *store);
extern void vp9_pop_neon(int64_t *store);
36
/*
 * 16x16 inverse DCT whose result is added to the destination data.
 *
 * The work is done as a row stage followed by a column stage, each split
 * into two halves of eight. Every half takes two helper calls: pass1 handles
 * the even-indexed elements and leaves its stage 6 result in pass1_output;
 * pass2 handles the odd-indexed elements and merges pass1_output into the
 * stage 7 result.
 *
 *   input       - 16x16 coefficient block, 16 values per row (the lower
 *                 eight rows start at input + 8 * 16).
 *   dest        - destination data the column stage adds into; the right
 *                 eight columns start at dest + 8.
 *   dest_stride - forwarded unchanged to pass2 together with dest.
 *
 * d8-d15 are saved on entry and restored on exit through
 * vp9_push_neon()/vp9_pop_neon().
 */
void vp9_idct16x16_256_add_neon(const int16_t *input,
                                uint8_t *dest, int dest_stride) {
  int64_t store_reg[8];
  int16_t pass1_output[16*16] = {0};
  int16_t row_idct_output[16*16] = {0};

  // save d8-d15 register values.
  vp9_push_neon(store_reg);

  /* Parallel idct on the upper 8 rows */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7
  // which will be saved into row_idct_output.
  // skip_adding is 0 here: the row stage keeps its result in the buffer.
  vp9_idct16x16_256_add_neon_pass2(input+1,
                                   row_idct_output,
                                   pass1_output,
                                   0,
                                   dest,
                                   dest_stride);

  /* Parallel idct on the lower 8 rows */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  vp9_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7
  // which will be saved into row_idct_output.
  vp9_idct16x16_256_add_neon_pass2(input+8*16+1,
                                   row_idct_output+8,
                                   pass1_output,
                                   0,
                                   dest,
                                   dest_stride);

  /* Parallel idct on the left 8 columns */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7.
  // Then add the result to the destination data.
  // skip_adding is 1 here: this is the call form that adds into dest.
  vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
                                   row_idct_output,
                                   pass1_output,
                                   1,
                                   dest,
                                   dest_stride);

  /* Parallel idct on the right 8 columns */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7.
  // Then add the result to the destination data.
  vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
                                   row_idct_output+8,
                                   pass1_output,
                                   1,
                                   dest+8,
                                   dest_stride);

  // restore d8-d15 register values.
  vp9_pop_neon(store_reg);
}
111
/*
 * 16x16 inverse DCT, result added to the destination data, for blocks whose
 * lower eight input rows are all zero.
 *
 * Only the upper eight rows go through the row stage (using the _10_
 * passes); the lower-row pass is skipped entirely. The column stage then
 * runs on both halves using the full _256_ passes.
 * NOTE(review): the "_10" suffix presumably refers to the number of non-zero
 * coefficients this variant is selected for -- confirm against the callers.
 *
 *   input       - 16x16 coefficient block, 16 values per row; rows 8..15 are
 *                 never read here and are treated as zero.
 *   dest        - destination data the column stage adds into; the right
 *                 eight columns start at dest + 8.
 *   dest_stride - forwarded unchanged to pass2 together with dest.
 *
 * d8-d15 are saved on entry and restored on exit through
 * vp9_push_neon()/vp9_pop_neon().
 */
void vp9_idct16x16_10_add_neon(const int16_t *input,
                               uint8_t *dest, int dest_stride) {
  int64_t store_reg[8];
  int16_t pass1_output[16*16] = {0};
  // Zero-initialised, and the lower-row pass that would write to
  // row_idct_output+8 is skipped below.
  // NOTE(review): the column stage presumably relies on those zeros -- do not
  // drop the initialiser without checking.
  int16_t row_idct_output[16*16] = {0};

  // save d8-d15 register values.
  vp9_push_neon(store_reg);

  /* Parallel idct on the upper 8 rows */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  vp9_idct16x16_10_add_neon_pass1(input, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7
  // which will be saved into row_idct_output.
  // skip_adding is 0 here: the row stage keeps its result in the buffer.
  vp9_idct16x16_10_add_neon_pass2(input+1,
                                  row_idct_output,
                                  pass1_output,
                                  0,
                                  dest,
                                  dest_stride);

  /* Skip Parallel idct on the lower 8 rows as they are all 0s */

  /* Parallel idct on the left 8 columns */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7.
  // Then add the result to the destination data.
  // skip_adding is 1 here: this is the call form that adds into dest.
  vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
                                   row_idct_output,
                                   pass1_output,
                                   1,
                                   dest,
                                   dest_stride);

  /* Parallel idct on the right 8 columns */
  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
  // stage 6 result in pass1_output.
  vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);

  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
  // with result in pass1(pass1_output) to calculate final result in stage 7.
  // Then add the result to the destination data.
  vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
                                   row_idct_output+8,
                                   pass1_output,
                                   1,
                                   dest+8,
                                   dest_stride);

  // restore d8-d15 register values.
  vp9_pop_neon(store_reg);
}
173