1 /******************************************************************************
2 *
3 * Copyright (C) 2018 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 * ihevce_sad_compute_neon.c
24 *
25 * @brief
* Contains NEON definitions of functions that compute the sum of absolute
* differences (SAD) between a source block and a prediction block
27 *
28 * @author
29 * Ittiam
30 *
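* @par List of Functions:
*  - ihevce_4x4_sad_computer_neon()
*  - ihevce_8xn_sad_computer_neon()
*  - ihevce_16xn_sad_computer_neon()
*  - ihevce_32xn_sad_computer_neon()
*  - ihevce_64xn_sad_computer_neon()
*  - ihevce_4mx4n_sad_computer_neon()
*  - ihevce_8x8_sad_computer_neon()
*  - ihevce_nxn_sad_computer_neon()
*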
33 * @remarks
34 * None
35 *
36 ********************************************************************************
37 */
38 /*****************************************************************************/
39 /* File Includes */
40 /*****************************************************************************/
41 /* System include files */
42 #include <string.h>
43 #include <assert.h>
44 #include <arm_neon.h>
45
46 /* User include files */
47 #include "ihevc_typedefs.h"
48 #include "ihevc_macros.h"
49 #include "itt_video_api.h"
50 #include "ihevc_cmn_utils_neon.h"
51 #include "ihevce_ipe_instr_set_router.h"
52
53 /*****************************************************************************/
54 /* Function Definitions */
55 /*****************************************************************************/
/**
 * @brief Sum of absolute differences between a source and a prediction block,
 *        computed on one 16-byte vector per input.
 *
 * NOTE(review): load_unaligned_u8q() is defined outside this file; it is
 * presumably the helper that gathers four 4-byte rows (stride apart) into a
 * single 16-byte vector - confirm against ihevc_cmn_utils_neon.h.
 *
 * @param pu1_src   pointer to the source samples
 * @param pu1_pred  pointer to the prediction samples
 * @param src_strd  stride handed to the source load
 * @param pred_strd stride handed to the prediction load
 *
 * @return SAD of the 16 loaded byte pairs (at most 16 * 255 = 4080)
 */
UWORD16 ihevce_4x4_sad_computer_neon(
    UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd)
{
    const uint8x16_t src_px = load_unaligned_u8q(pu1_src, src_strd);
    const uint8x16_t pred_px = load_unaligned_u8q(pu1_pred, pred_strd);

    /* |src - pred| of the low 8 bytes, widened to 16 bit ... */
    const uint16x8_t diff_lo = vabdl_u8(vget_low_u8(src_px), vget_low_u8(pred_px));
    /* ... plus |src - pred| of the high 8 bytes */
    const uint16x8_t diff_all = vabal_u8(diff_lo, vget_high_u8(src_px), vget_high_u8(pred_px));

    /* Pairwise widening adds: 8 x u16 -> 4 x u32 -> 2 x u64 */
    const uint32x4_t sum_x4 = vpaddlq_u16(diff_all);
    const uint64x2_t sum_x2 = vpaddlq_u32(sum_x4);

    /* Add both 64-bit halves (viewed as 32-bit lanes) and return lane 0 */
    const uint32x2_t half_lo = vreinterpret_u32_u64(vget_low_u64(sum_x2));
    const uint32x2_t half_hi = vreinterpret_u32_u64(vget_high_u64(sum_x2));
    const uint32x2_t total = vadd_u32(half_lo, half_hi);

    return vget_lane_u32(total, 0);
}
71
/**
 * @brief SAD of a block that is 8 samples wide and ht rows high.
 *
 * @param pu1_src   pointer to the first source row
 * @param pu1_pred  pointer to the first prediction row
 * @param src_strd  distance in bytes between consecutive source rows
 * @param pred_strd distance in bytes between consecutive prediction rows
 * @param ht        number of rows to process; asserted to be <= 8
 *
 * @return accumulated SAD; with ht <= 8 it is at most 8 * 8 * 255 = 16320,
 *         which fits the 16-bit return type
 */
static UWORD16 ihevce_8xn_sad_computer_neon(
    UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd, WORD32 ht)
{
    uint16x8_t acc = vdupq_n_u16(0);
    uint32x4_t sum_x4;
    uint64x2_t sum_x2;
    WORD32 rows_left;

    assert(ht <= 8);

    /* One 8-byte row per iteration, accumulated into eight 16-bit lanes */
    for(rows_left = ht; rows_left > 0; rows_left--)
    {
        acc = vabal_u8(acc, vld1_u8(pu1_src), vld1_u8(pu1_pred));
        pu1_src += src_strd;
        pu1_pred += pred_strd;
    }

    /* Horizontal reduction: 8 x u16 -> 4 x u32 -> 2 x u64 -> lane 0 */
    sum_x4 = vpaddlq_u16(acc);
    sum_x2 = vpaddlq_u32(sum_x4);
    return vget_lane_u32(
        vadd_u32(
            vreinterpret_u32_u64(vget_low_u64(sum_x2)),
            vreinterpret_u32_u64(vget_high_u64(sum_x2))),
        0);
}
98
/**
 * @brief SAD of a block that is 16 samples wide and ht rows high.
 *
 * The low and high 8 bytes of every row are accumulated into two separate
 * vectors of 16-bit lanes, which are combined only in the final reduction.
 *
 * @param pu1_src   pointer to the first source row
 * @param pu1_pred  pointer to the first prediction row
 * @param src_strd  distance in bytes between consecutive source rows
 * @param pred_strd distance in bytes between consecutive prediction rows
 * @param ht        number of rows to process; asserted to be <= 16
 *
 * @return accumulated SAD over the 16 x ht block
 */
static UWORD32 ihevce_16xn_sad_computer_neon(
    UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd, WORD32 ht)
{
    uint16x8_t acc_lo = vdupq_n_u16(0);
    uint16x8_t acc_hi = vdupq_n_u16(0);
    uint32x4_t sum_x4;
    uint64x2_t sum_x2;
    WORD32 rows_left;

    assert(ht <= 16);

    for(rows_left = ht; rows_left > 0; rows_left--)
    {
        const uint8x16_t src_row = vld1q_u8(pu1_src);
        const uint8x16_t pred_row = vld1q_u8(pu1_pred);

        acc_lo = vabal_u8(acc_lo, vget_low_u8(src_row), vget_low_u8(pred_row));
        acc_hi = vabal_u8(acc_hi, vget_high_u8(src_row), vget_high_u8(pred_row));
        pu1_src += src_strd;
        pu1_pred += pred_strd;
    }

    /* Widen acc_lo pairwise to 32 bit and fold acc_hi into the same lanes */
    sum_x4 = vpadalq_u16(vpaddlq_u16(acc_lo), acc_hi);
    sum_x2 = vpaddlq_u32(sum_x4);
    return vget_lane_u32(
        vadd_u32(
            vreinterpret_u32_u64(vget_low_u64(sum_x2)),
            vreinterpret_u32_u64(vget_high_u64(sum_x2))),
        0);
}
128
/**
 * @brief SAD of a block that is 32 samples wide and ht rows high.
 *
 * Bytes 0..15 of every row go into one 16-bit accumulator vector and bytes
 * 16..31 into a second one; both are merged in the final reduction.
 *
 * @param pu1_src   pointer to the first source row
 * @param pu1_pred  pointer to the first prediction row
 * @param src_strd  distance in bytes between consecutive source rows
 * @param pred_strd distance in bytes between consecutive prediction rows
 * @param ht        number of rows to process; asserted to be <= 32
 *
 * @return accumulated SAD over the 32 x ht block
 */
static UWORD32 ihevce_32xn_sad_computer_neon(
    UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd, WORD32 ht)
{
    uint16x8_t acc_left = vdupq_n_u16(0);
    uint16x8_t acc_right = vdupq_n_u16(0);
    uint32x4_t sum_x4;
    uint64x2_t sum_x2;
    WORD32 rows_left;

    assert(ht <= 32);

    for(rows_left = ht; rows_left > 0; rows_left--)
    {
        const uint8x16_t src_left = vld1q_u8(pu1_src);
        const uint8x16_t src_right = vld1q_u8(pu1_src + 16);
        const uint8x16_t pred_left = vld1q_u8(pu1_pred);
        const uint8x16_t pred_right = vld1q_u8(pu1_pred + 16);

        /* Left 16 bytes of the row */
        acc_left = vabal_u8(acc_left, vget_low_u8(src_left), vget_low_u8(pred_left));
        acc_left = vabal_u8(acc_left, vget_high_u8(src_left), vget_high_u8(pred_left));

        /* Right 16 bytes of the row */
        acc_right = vabal_u8(acc_right, vget_low_u8(src_right), vget_low_u8(pred_right));
        acc_right = vabal_u8(acc_right, vget_high_u8(src_right), vget_high_u8(pred_right));

        pu1_src += src_strd;
        pu1_pred += pred_strd;
    }

    /* Widen acc_left pairwise to 32 bit and fold acc_right into the same lanes */
    sum_x4 = vpadalq_u16(vpaddlq_u16(acc_left), acc_right);
    sum_x2 = vpaddlq_u32(sum_x4);
    return vget_lane_u32(
        vadd_u32(
            vreinterpret_u32_u64(vget_low_u64(sum_x2)),
            vreinterpret_u32_u64(vget_high_u64(sum_x2))),
        0);
}
162
/**
 * @brief SAD of a block that is 64 samples wide and ht rows high.
 *
 * Bytes 0..31 of every row are accumulated into one vector of 16-bit lanes
 * and bytes 32..63 into a second one. With ht <= 64 each lane receives at
 * most 4 * 64 differences of at most 255 each (65280), so the 16-bit
 * accumulators cannot wrap.
 *
 * @param pu1_src   pointer to the first source row
 * @param pu1_pred  pointer to the first prediction row
 * @param src_strd  distance in bytes between consecutive source rows
 * @param pred_strd distance in bytes between consecutive prediction rows
 * @param ht        number of rows to process; asserted to be <= 64
 *
 * @return accumulated SAD over the 64 x ht block
 */
static UWORD32 ihevce_64xn_sad_computer_neon(
    UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd, WORD32 ht)
{
    uint16x8_t acc_first = vdupq_n_u16(0);
    uint16x8_t acc_second = vdupq_n_u16(0);
    uint32x4_t sum_x4;
    uint64x2_t sum_x2;
    WORD32 row;

    assert(ht <= 64);

    for(row = 0; row < ht; row++)
    {
        WORD32 col;

        /* Two steps of 16 bytes cover each 32-byte half of the row */
        for(col = 0; col < 32; col += 16)
        {
            const uint8x16_t src_a = vld1q_u8(pu1_src + col);
            const uint8x16_t pred_a = vld1q_u8(pu1_pred + col);
            const uint8x16_t src_b = vld1q_u8(pu1_src + 32 + col);
            const uint8x16_t pred_b = vld1q_u8(pu1_pred + 32 + col);

            /* First half of the row: bytes 0..31 */
            acc_first = vabal_u8(acc_first, vget_low_u8(src_a), vget_low_u8(pred_a));
            acc_first = vabal_u8(acc_first, vget_high_u8(src_a), vget_high_u8(pred_a));

            /* Second half of the row: bytes 32..63 */
            acc_second = vabal_u8(acc_second, vget_low_u8(src_b), vget_low_u8(pred_b));
            acc_second = vabal_u8(acc_second, vget_high_u8(src_b), vget_high_u8(pred_b));
        }
        pu1_src += src_strd;
        pu1_pred += pred_strd;
    }

    /* Widen acc_first pairwise to 32 bit and fold acc_second into the same lanes */
    sum_x4 = vpadalq_u16(vpaddlq_u16(acc_first), acc_second);
    sum_x2 = vpaddlq_u32(sum_x4);
    return vget_lane_u32(
        vadd_u32(
            vreinterpret_u32_u64(vget_low_u64(sum_x2)),
            vreinterpret_u32_u64(vget_high_u64(sum_x2))),
        0);
}
204
/**
 * @brief SAD of a blk_wd x blk_ht block whose dimensions are multiples of 4.
 *
 * Two strategies are used:
 *  - blk_wd is a power of two not larger than 64: the block is walked in
 *    horizontal stripes of up to blk_wd rows, each handled by the kernel of
 *    matching width.
 *  - any other width: the block is walked in stripes of 4 rows, and each
 *    stripe is covered left to right with the widest of the 32/16/8/4 wide
 *    kernels that still fits.
 *
 * @param pu1_src   pointer to the top-left source sample
 * @param pu1_pred  pointer to the top-left prediction sample
 * @param src_strd  distance in bytes between consecutive source rows
 * @param pred_strd distance in bytes between consecutive prediction rows
 * @param blk_wd    block width; asserted to be a multiple of 4
 * @param blk_ht    block height; asserted to be a multiple of 4
 *
 * @return accumulated SAD, or -1 converted to UWORD32 when the power-of-two
 *         path meets a width that has no matching kernel
 */
UWORD32 ihevce_4mx4n_sad_computer_neon(
    UWORD8 *pu1_src,
    UWORD8 *pu1_pred,
    WORD32 src_strd,
    WORD32 pred_strd,
    WORD32 blk_wd,
    WORD32 blk_ht)
{
    WORD32 total = 0;
    WORD32 row, col;

    assert(blk_wd % 4 == 0);
    assert(blk_ht % 4 == 0);

    if((blk_wd <= 64) && ((blk_wd & (blk_wd - 1)) == 0))
    {
        /* blk_wd is one of { 4, 8, 16, 32, 64 } */
        row = 0;
        while(row < blk_ht)
        {
            /* Stripe height: the smaller of blk_wd and the rows still left */
            WORD32 ht = blk_ht - row;

            if(blk_wd < ht)
            {
                ht = blk_wd;
            }

            if(4 == blk_wd)
            {
                total += ihevce_4x4_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd);
            }
            else if(8 == blk_wd)
            {
                total += ihevce_8xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, ht);
            }
            else if(16 == blk_wd)
            {
                total += ihevce_16xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, ht);
            }
            else if(32 == blk_wd)
            {
                total += ihevce_32xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, ht);
            }
            else if(64 == blk_wd)
            {
                total += ihevce_64xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, ht);
            }
            else
            {
                /* should not be here */
                return -1;
            }

            row += ht;
            pu1_src += (ht * src_strd);
            pu1_pred += (ht * pred_strd);
        }
        return total;
    }

    /* Generic case: stripes of 4 rows, tiled with the widest fitting kernel */
    for(row = 0; row < blk_ht; row += 4)
    {
        col = 0;
        while(col < blk_wd)
        {
            const WORD32 wd_left = blk_wd - col;

            if(wd_left < 8)
            {
                total += ihevce_4x4_sad_computer_neon(
                    pu1_src + col, pu1_pred + col, src_strd, pred_strd);
                col += 4;
            }
            else if(wd_left < 16)
            {
                total += ihevce_8xn_sad_computer_neon(
                    pu1_src + col, pu1_pred + col, src_strd, pred_strd, 4);
                col += 8;
            }
            else if(wd_left < 32)
            {
                total += ihevce_16xn_sad_computer_neon(
                    pu1_src + col, pu1_pred + col, src_strd, pred_strd, 4);
                col += 16;
            }
            else
            {
                total += ihevce_32xn_sad_computer_neon(
                    pu1_src + col, pu1_pred + col, src_strd, pred_strd, 4);
                col += 32;
            }
        }
        pu1_src += (4 * src_strd);
        pu1_pred += (4 * pred_strd);
    }
    return total;
}
292
/**
 * @brief SAD of an 8x8 block; forwards to the 8-wide kernel with a height of 8.
 *
 * @param pu1_src   pointer to the first source row
 * @param pu1_pred  pointer to the first prediction row
 * @param src_strd  distance in bytes between consecutive source rows
 * @param pred_strd distance in bytes between consecutive prediction rows
 *
 * @return SAD of the 8x8 block
 */
UWORD16 ihevce_8x8_sad_computer_neon(
    UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd)
{
    const UWORD16 sad_8x8 =
        ihevce_8xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, 8);

    return sad_8x8;
}
298
/**
 * @brief SAD of a square trans_size x trans_size block.
 *
 * Note that the parameter order differs from the other kernels in this file:
 * each pointer is followed directly by its own stride.
 *
 * @param pu1_src    pointer to the first source row
 * @param src_strd   distance in bytes between consecutive source rows
 * @param pu1_pred   pointer to the first prediction row
 * @param pred_strd  distance in bytes between consecutive prediction rows
 * @param trans_size block dimension; 4, 8, 16, 32 and 64 are handled
 *
 * @return SAD of the block, or -1 for any other trans_size
 */
WORD32 ihevce_nxn_sad_computer_neon(
    UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_pred, WORD32 pred_strd, WORD32 trans_size)
{
    WORD32 result;

    switch(trans_size)
    {
    case 4:
        result = ihevce_4x4_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd);
        break;
    case 8:
        result = ihevce_8xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, 8);
        break;
    case 16:
        result = ihevce_16xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, 16);
        break;
    case 32:
        result = ihevce_32xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, 32);
        break;
    case 64:
        result = ihevce_64xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, 64);
        break;
    default:
        /* should not be here */
        result = -1;
        break;
    }
    return result;
}
319