• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 *  ihevce_ssd_calculator_neon.c
24 *
25 * @brief
26 *  Contains intrinsic definitions of functions for SSD (sum of squared differences) computation
27 *
28 * @author
29 *  Ittiam
30 *
31 * @par List of Functions:
32 *
33 * @remarks
34 *  None
35 *
36 ********************************************************************************
37 */
38 
39 /*****************************************************************************/
40 /* File Includes                                                             */
41 /*****************************************************************************/
42 /* System include files */
43 #include <string.h>
44 #include <assert.h>
45 #include <arm_neon.h>
46 
47 /* User include files */
48 #include "ihevc_typedefs.h"
49 #include "itt_video_api.h"
50 #include "ihevc_cmn_utils_neon.h"
51 #include "ihevce_cmn_utils_instr_set_router.h"
52 
53 /*****************************************************************************/
54 /* Function Definitions                                                      */
55 /*****************************************************************************/
/**
 *******************************************************************************
 * @brief
 *  Per-lane partial sums of squared differences for one 16-sample load of
 *  source and prediction (used by the caller for the 4x4 case).
 *
 * @param[in] pu1_src    source samples
 * @param[in] pu1_pred   prediction samples
 * @param[in] src_strd   stride handed to the source loader
 * @param[in] pred_strd  stride handed to the prediction loader
 * @param[in] is_chroma  zero selects load_unaligned_u8q(), non-zero selects
 *                       load_unaligned_u8qi()
 *
 * @return
 *  Four 32-bit partial sums; adding the four lanes yields the SSD.
 *
 * @remarks
 *  The loaders live in ihevc_cmn_utils_neon.h. Presumably they gather four
 *  samples from each of four rows, the "i" variant skipping the interleaved
 *  component - TODO confirm against that header.
 *******************************************************************************
 */
static INLINE uint32x4_t ihevce_4x4_ssd_computer_neon(
    UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd, WORD32 is_chroma)
{
    uint8x16_t src_pix, pred_pix, diff;
    uint8x8_t diff_lo, diff_hi;
    uint16x8_t sq_lo, sq_hi;

    if(is_chroma)
    {
        src_pix = load_unaligned_u8qi(pu1_src, src_strd);
        pred_pix = load_unaligned_u8qi(pu1_pred, pred_strd);
    }
    else
    {
        src_pix = load_unaligned_u8q(pu1_src, src_strd);
        pred_pix = load_unaligned_u8q(pu1_pred, pred_strd);
    }

    diff = vabdq_u8(src_pix, pred_pix);
    diff_lo = vget_low_u8(diff);
    diff_hi = vget_high_u8(diff);

    /* an absolute difference is at most 255, so its square fits in 16 bits */
    sq_lo = vmull_u8(diff_lo, diff_lo);
    sq_hi = vmull_u8(diff_hi, diff_hi);

    /* widen to 32 bits while folding the sixteen squares into four lanes */
    return vaddq_u32(
        vaddl_u16(vget_low_u16(sq_lo), vget_high_u16(sq_lo)),
        vaddl_u16(vget_low_u16(sq_hi), vget_high_u16(sq_hi)));
}
81 
/**
 *******************************************************************************
 * @brief
 *  Per-lane partial sums of squared differences for 8 samples of one row.
 *
 * @param[in] pu1_src    source samples
 * @param[in] pu1_pred   prediction samples
 * @param[in] is_chroma  zero: 8 consecutive bytes are read from each pointer;
 *                       non-zero: 16 bytes are read with a de-interleaving
 *                       load and only the first component (even byte offsets)
 *                       is used
 *
 * @return
 *  Four 32-bit partial sums; adding the four lanes yields the SSD.
 *******************************************************************************
 */
static INLINE uint32x4_t
    ihevce_1x8_ssd_computer_neon(UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 is_chroma)
{
    uint8x8_t src_pix, pred_pix, diff;
    uint16x8_t sq;

    if(is_chroma)
    {
        src_pix = vld2_u8(pu1_src).val[0];
        pred_pix = vld2_u8(pu1_pred).val[0];
    }
    else
    {
        src_pix = vld1_u8(pu1_src);
        pred_pix = vld1_u8(pu1_pred);
    }

    diff = vabd_u8(src_pix, pred_pix);

    /* an absolute difference is at most 255, so its square fits in 16 bits */
    sq = vmull_u8(diff, diff);

    /* widen to 32 bits while folding the eight squares into four lanes */
    return vaddl_u16(vget_low_u16(sq), vget_high_u16(sq));
}
105 
/**
 *******************************************************************************
 * @brief
 *  Per-lane partial sums of squared differences for 16 samples of one row.
 *
 * @param[in] pu1_src    source samples
 * @param[in] pu1_pred   prediction samples
 * @param[in] is_chroma  zero: 16 consecutive bytes are read from each pointer;
 *                       non-zero: 32 bytes are read with a de-interleaving
 *                       load and only the first component (even byte offsets)
 *                       is used
 *
 * @return
 *  Four 32-bit partial sums; adding the four lanes yields the SSD.
 *******************************************************************************
 */
static INLINE uint32x4_t
    ihevce_1x16_ssd_computer_neon(UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 is_chroma)
{
    uint8x16_t src_pix, pred_pix, diff;
    uint8x8_t diff_lo, diff_hi;
    uint16x8_t sq_lo, sq_hi;

    if(is_chroma)
    {
        src_pix = vld2q_u8(pu1_src).val[0];
        pred_pix = vld2q_u8(pu1_pred).val[0];
    }
    else
    {
        src_pix = vld1q_u8(pu1_src);
        pred_pix = vld1q_u8(pu1_pred);
    }

    diff = vabdq_u8(src_pix, pred_pix);
    diff_lo = vget_low_u8(diff);
    diff_hi = vget_high_u8(diff);

    /* an absolute difference is at most 255, so its square fits in 16 bits */
    sq_lo = vmull_u8(diff_lo, diff_lo);
    sq_hi = vmull_u8(diff_hi, diff_hi);

    /* widen to 32 bits while folding the sixteen squares into four lanes */
    return vaddq_u32(
        vaddl_u16(vget_low_u16(sq_lo), vget_high_u16(sq_lo)),
        vaddl_u16(vget_low_u16(sq_hi), vget_high_u16(sq_hi)));
}
131 
/**
 *******************************************************************************
 * @brief
 *  Per-lane partial sums of squared differences for 32 samples of one row,
 *  processed as two 16-sample segments.
 *
 * @param[in] pu1_src    source samples
 * @param[in] pu1_pred   prediction samples
 * @param[in] is_chroma  zero: segments are 16 consecutive bytes, 16 bytes
 *                       apart; non-zero: each segment is a 32-byte
 *                       de-interleaving load of which only the first component
 *                       (even byte offsets) is used, segments 32 bytes apart
 *
 * @return
 *  Four 32-bit partial sums; adding the four lanes yields the SSD.
 *******************************************************************************
 */
static INLINE uint32x4_t
    ihevce_1x32_ssd_computer_neon(UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 is_chroma)
{
    /* byte distance between consecutive 16-sample segments */
    const WORD32 seg_bytes = is_chroma ? 32 : 16;
    uint32x4_t acc = vdupq_n_u32(0);
    WORD32 seg;

    for(seg = 0; seg < 2; seg++)
    {
        const WORD32 offset = seg * seg_bytes;
        uint8x16_t src_pix, pred_pix, diff;
        uint16x8_t sq_lo, sq_hi;

        if(is_chroma)
        {
            src_pix = vld2q_u8(pu1_src + offset).val[0];
            pred_pix = vld2q_u8(pu1_pred + offset).val[0];
        }
        else
        {
            src_pix = vld1q_u8(pu1_src + offset);
            pred_pix = vld1q_u8(pu1_pred + offset);
        }

        diff = vabdq_u8(src_pix, pred_pix);

        /* an absolute difference is at most 255, so its square fits in 16 bits */
        sq_lo = vmull_u8(vget_low_u8(diff), vget_low_u8(diff));
        sq_hi = vmull_u8(vget_high_u8(diff), vget_high_u8(diff));

        /* unsigned lane adds are modular, so the summation order is irrelevant */
        acc = vaddq_u32(acc, vaddl_u16(vget_low_u16(sq_lo), vget_high_u16(sq_lo)));
        acc = vaddq_u32(acc, vaddl_u16(vget_low_u16(sq_hi), vget_high_u16(sq_hi)));
    }

    return acc;
}
168 
/**
 *******************************************************************************
 * @brief
 *  Per-lane partial sums of squared differences for 64 samples of one row,
 *  processed as four 16-sample segments.
 *
 * @param[in] pu1_src    source samples
 * @param[in] pu1_pred   prediction samples
 * @param[in] is_chroma  zero: segments are 16 consecutive bytes, 16 bytes
 *                       apart; non-zero: each segment is a 32-byte
 *                       de-interleaving load of which only the first component
 *                       (even byte offsets) is used, segments 32 bytes apart
 *
 * @return
 *  Four 32-bit partial sums; adding the four lanes yields the SSD.
 *******************************************************************************
 */
static INLINE uint32x4_t
    ihevce_1x64_ssd_computer_neon(UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 is_chroma)
{
    /* byte distance between consecutive 16-sample segments */
    const WORD32 seg_bytes = is_chroma ? 32 : 16;
    uint32x4_t acc = vdupq_n_u32(0);
    WORD32 seg;

    for(seg = 0; seg < 4; seg++)
    {
        const WORD32 offset = seg * seg_bytes;
        uint8x16_t src_pix, pred_pix, diff;
        uint16x8_t sq_lo, sq_hi;

        if(is_chroma)
        {
            src_pix = vld2q_u8(pu1_src + offset).val[0];
            pred_pix = vld2q_u8(pu1_pred + offset).val[0];
        }
        else
        {
            src_pix = vld1q_u8(pu1_src + offset);
            pred_pix = vld1q_u8(pu1_pred + offset);
        }

        diff = vabdq_u8(src_pix, pred_pix);

        /* an absolute difference is at most 255, so its square fits in 16 bits */
        sq_lo = vmull_u8(vget_low_u8(diff), vget_low_u8(diff));
        sq_hi = vmull_u8(vget_high_u8(diff), vget_high_u8(diff));

        /* unsigned lane adds are modular, so the summation order is irrelevant */
        acc = vaddq_u32(acc, vaddl_u16(vget_low_u16(sq_lo), vget_high_u16(sq_lo)));
        acc = vaddq_u32(acc, vaddl_u16(vget_low_u16(sq_hi), vget_high_u16(sq_hi)));
    }

    return acc;
}
231 
/**
 *******************************************************************************
 * @brief
 *  Sum of squared differences between an input block and a reference block.
 *
 * @param[in] pu1_inp     first sample of the input block
 * @param[in] pu1_ref     first sample of the reference block
 * @param[in] inp_stride  bytes between consecutive input rows
 * @param[in] ref_stride  bytes between consecutive reference rows
 * @param[in] wd          samples per row that take part in the sum
 * @param[in] ht          number of rows (must be 4 when wd is 4)
 * @param[in] is_chroma   non-zero when the data is interleaved and only the
 *                        first component (even byte offsets) is to be compared;
 *                        a row of wd samples then spans 2 * wd bytes
 *
 * @return
 *  The SSD, widened to 64 bits after the per-lane accumulation.
 *
 * @remarks
 *  Handled widths are 4 (4x4 only) and any multiple of 8. For every other
 *  width nothing is accumulated and 0 is returned.
 *  Partial sums are held in 32-bit lanes and only widened at the very end.
 *******************************************************************************
 */
static LWORD64 ihevce_ssd_calculator_plane_neon(
    UWORD8 *pu1_inp,
    UWORD8 *pu1_ref,
    UWORD32 inp_stride,
    UWORD32 ref_stride,
    UWORD32 wd,
    UWORD32 ht,
    WORD32 is_chroma)
{
    uint32x4_t ssd = vdupq_n_u32(0);
    uint32x2_t sum;

    if(wd >= 8)
    {
        /* Bytes occupied by 8 samples. The interleaved 1x8 kernel consumes    */
        /* 16 bytes per call, so the generic column loop must step by 16 there */
        /* (stepping by 8 re-reads overlapping samples and misses the tail).   */
        const UWORD32 col_step = is_chroma ? 16 : 8;
        UWORD32 row;

        for(row = ht; row > 0; row--)
        {
            if(wd == 8)
            {
                ssd = vaddq_u32(ssd, ihevce_1x8_ssd_computer_neon(pu1_inp, pu1_ref, is_chroma));
            }
            else if(wd == 16)
            {
                ssd = vaddq_u32(ssd, ihevce_1x16_ssd_computer_neon(pu1_inp, pu1_ref, is_chroma));
            }
            else if(wd == 32)
            {
                ssd = vaddq_u32(ssd, ihevce_1x32_ssd_computer_neon(pu1_inp, pu1_ref, is_chroma));
            }
            else if(wd == 64)
            {
                ssd = vaddq_u32(ssd, ihevce_1x64_ssd_computer_neon(pu1_inp, pu1_ref, is_chroma));
            }
            else if(wd % 8 == 0)
            {
                /* any other multiple of 8: walk the row 8 samples at a time */
                UWORD32 col;
                UWORD8 *inp = pu1_inp, *ref = pu1_ref;

                for(col = 0; col < wd; col += 8)
                {
                    ssd = vaddq_u32(ssd, ihevce_1x8_ssd_computer_neon(inp, ref, is_chroma));
                    ref = ref + col_step;
                    inp = inp + col_step;
                }
            }
            /* widths that are not a multiple of 8 contribute nothing */

            pu1_inp += inp_stride;
            pu1_ref += ref_stride;
        }
    }
    else if(wd == 4)
    {
        /* the 4-wide kernel loads a complete 4x4 block in one go */
        assert(ht == 4);
        ssd = ihevce_4x4_ssd_computer_neon(pu1_inp, pu1_ref, inp_stride, ref_stride, is_chroma);
    }

    /* horizontal reduction: 4 lanes -> 2 lanes -> one 64-bit total */
    sum = vadd_u32(vget_low_u32(ssd), vget_high_u32(ssd));
    return vget_lane_u64(vpaddl_u32(sum), 0);
}
284 
/**
 *******************************************************************************
 * @brief
 *  SSD between an input block and a reference block of contiguous samples.
 *  Forwards to ihevce_ssd_calculator_plane_neon() with is_chroma set to 0.
 *
 * @param[in] pu1_inp     first sample of the input block
 * @param[in] pu1_ref     first sample of the reference block
 * @param[in] inp_stride  bytes between consecutive input rows
 * @param[in] ref_stride  bytes between consecutive reference rows
 * @param[in] wd          block width in samples
 * @param[in] ht          block height in rows
 *
 * @return
 *  The value returned by ihevce_ssd_calculator_plane_neon().
 *******************************************************************************
 */
LWORD64 ihevce_ssd_calculator_neon(
    UWORD8 *pu1_inp, UWORD8 *pu1_ref, UWORD32 inp_stride, UWORD32 ref_stride, UWORD32 wd, UWORD32 ht)
{
    const WORD32 is_chroma = 0;
    LWORD64 ssd;

    ssd = ihevce_ssd_calculator_plane_neon(
        pu1_inp, pu1_ref, inp_stride, ref_stride, wd, ht, is_chroma);
    return ssd;
}
290 
/**
 *******************************************************************************
 * @brief
 *  SSD between an input block and a reference block of interleaved samples.
 *  Forwards to ihevce_ssd_calculator_plane_neon() with is_chroma set to 1.
 *
 * @param[in] pu1_inp     first sample of the input block
 * @param[in] pu1_ref     first sample of the reference block
 * @param[in] inp_stride  bytes between consecutive input rows
 * @param[in] ref_stride  bytes between consecutive reference rows
 * @param[in] wd          block width in samples
 * @param[in] ht          block height in rows
 *
 * @return
 *  The value returned by ihevce_ssd_calculator_plane_neon().
 *******************************************************************************
 */
LWORD64 ihevce_chroma_interleave_ssd_calculator_neon(
    UWORD8 *pu1_inp, UWORD8 *pu1_ref, UWORD32 inp_stride, UWORD32 ref_stride, UWORD32 wd, UWORD32 ht)
{
    const WORD32 is_chroma = 1;
    LWORD64 ssd;

    ssd = ihevce_ssd_calculator_plane_neon(
        pu1_inp, pu1_ref, inp_stride, ref_stride, wd, ht, is_chroma);
    return ssd;
}
296