/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */

/**
 ******************************************************************************
 * @file ime_distortion_metrics_sse42.c
 *
 * @brief
 *  This file contains definitions of routines that compute distortion
 *  between two macro/sub blocks of identical dimensions
 *
 * @author
 *  Ittiam
 *
 * @par List of Functions:
 *  - ime_compute_sad_16x16_sse42()
 *  - ime_compute_sad_16x16_fast_sse42()
 *  - ime_compute_sad_16x16_ea8_sse42()
 *  - ime_compute_sad_16x8_sse42()
 *  - ime_calculate_sad4_prog_sse42()
 *  - ime_sub_pel_compute_sad_16x16_sse42()
 *  - ime_compute_satqd_16x16_lumainter_sse42()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* System include files */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* User include files */
#include "ime_typedefs.h"
#include "ime_defs.h"
#include "ime_macros.h"
#include "ime_statistics.h"
#include "ime_platform_macros.h"
#include "ime_distortion_metrics.h"
#include <immintrin.h>

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/

/**
 ******************************************************************************
 *
 * @brief computes distortion (SAD) between 2 16x16 blocks
 *
 * @par Description
 *  This function computes the SAD between 2 16x16 blocks. The early-exit
 *  threshold i4_max_sad is ignored in this SSE4.2 variant; the SAD of the
 *  entire block is always computed.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[in] pu1_est
 *  UWORD8 pointer to the estimated block
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] est_strd
 *  integer estimate stride
 *
 * @param[in] i4_max_sad
 *  integer maximum allowed distortion (unused in this variant)
 *
 * @param[out] pi4_mb_distortion
 *  integer evaluated sad
 *
 * @remarks
 *
 ******************************************************************************
 */
void ime_compute_sad_16x16_sse42(UWORD8 *pu1_src,
                                 UWORD8 *pu1_est,
                                 WORD32 src_strd,
                                 WORD32 est_strd,
                                 WORD32 i4_max_sad,
                                 WORD32 *pi4_mb_distortion)
{
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i est_r0, est_r1, est_r2, est_r3;
    __m128i res_r0, res_r1, res_r2, res_r3;
    __m128i sad_val;
    WORD32 val1, val2;
    UNUSED (i4_max_sad);

    // Row 0-3 sad calculation
    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));

    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));

    res_r0 = _mm_sad_epu8(src_r0, est_r0);
    res_r1 = _mm_sad_epu8(src_r1, est_r1);
    res_r2 = _mm_sad_epu8(src_r2, est_r2);
    res_r3 = _mm_sad_epu8(src_r3, est_r3);

    sad_val = _mm_add_epi64(res_r0, res_r1);
    sad_val = _mm_add_epi64(sad_val, res_r2);
    sad_val = _mm_add_epi64(sad_val, res_r3);

    // Row 4-7 sad calculation
    pu1_src += 4*src_strd;
    pu1_est += 4*est_strd;

    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));

    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));

    res_r0 = _mm_sad_epu8(src_r0, est_r0);
    res_r1 = _mm_sad_epu8(src_r1, est_r1);
    res_r2 = _mm_sad_epu8(src_r2, est_r2);
    res_r3 = _mm_sad_epu8(src_r3, est_r3);

    sad_val = _mm_add_epi64(sad_val, res_r0);
    sad_val = _mm_add_epi64(sad_val, res_r1);
    sad_val = _mm_add_epi64(sad_val, res_r2);
    sad_val = _mm_add_epi64(sad_val, res_r3);

    // Row 8-11 sad calculation
    pu1_src += 4*src_strd;
    pu1_est += 4*est_strd;
    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));

    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));

    res_r0 = _mm_sad_epu8(src_r0, est_r0);
    res_r1 = _mm_sad_epu8(src_r1, est_r1);
    res_r2 = _mm_sad_epu8(src_r2, est_r2);
    res_r3 = _mm_sad_epu8(src_r3, est_r3);

    sad_val = _mm_add_epi64(sad_val, res_r0);
    sad_val = _mm_add_epi64(sad_val, res_r1);
    sad_val = _mm_add_epi64(sad_val, res_r2);
    sad_val = _mm_add_epi64(sad_val, res_r3);

    // Row 12-15 sad calculation
    pu1_src += 4*src_strd;
    pu1_est += 4*est_strd;
    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));

    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));

    res_r0 = _mm_sad_epu8(src_r0, est_r0);
    res_r1 = _mm_sad_epu8(src_r1, est_r1);
    res_r2 = _mm_sad_epu8(src_r2, est_r2);
    res_r3 = _mm_sad_epu8(src_r3, est_r3);

    sad_val = _mm_add_epi64(sad_val, res_r0);
    sad_val = _mm_add_epi64(sad_val, res_r1);
    sad_val = _mm_add_epi64(sad_val, res_r2);
    sad_val = _mm_add_epi64(sad_val, res_r3);

    val1 = _mm_extract_epi32(sad_val, 0);
    val2 = _mm_extract_epi32(sad_val, 2);
    *pi4_mb_distortion = (val1 + val2);

    return;
}

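/*
 * Illustrative scalar equivalent (a hypothetical sketch, not part of the
 * library, kept inside #if 0 so it never builds): it spells out the quantity
 * the SSE4.2 routine above accumulates with _mm_sad_epu8.
 */
#if 0
static void ime_compute_sad_16x16_reference(UWORD8 *pu1_src,
                                            UWORD8 *pu1_est,
                                            WORD32 src_strd,
                                            WORD32 est_strd,
                                            WORD32 *pi4_mb_distortion)
{
    WORD32 row, col, sad = 0;

    for(row = 0; row < 16; row++)
    {
        for(col = 0; col < 16; col++)
        {
            /* absolute difference of co-located pixels */
            WORD32 diff = (WORD32)pu1_src[col] - (WORD32)pu1_est[col];
            sad += (diff < 0) ? -diff : diff;
        }
        pu1_src += src_strd;
        pu1_est += est_strd;
    }
    *pi4_mb_distortion = sad;
}
#endif
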
/**
 ******************************************************************************
 *
 * @brief computes distortion (SAD) between 2 16x8 blocks
 *
 * @par Description
 *  This function computes the SAD between 2 16x8 blocks. The early-exit
 *  threshold i4_max_sad is ignored in this SSE4.2 variant; the SAD of the
 *  entire block is always computed.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[in] pu1_est
 *  UWORD8 pointer to the estimated block
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] est_strd
 *  integer estimate stride
 *
 * @param[in] i4_max_sad
 *  integer maximum allowed distortion (unused in this variant)
 *
 * @param[out] pi4_mb_distortion
 *  integer evaluated sad
 *
 * @remarks
 *
 ******************************************************************************
 */
void ime_compute_sad_16x8_sse42(UWORD8 *pu1_src,
                                UWORD8 *pu1_est,
                                WORD32 src_strd,
                                WORD32 est_strd,
                                WORD32 i4_max_sad,
                                WORD32 *pi4_mb_distortion)
{
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i est_r0, est_r1, est_r2, est_r3;
    __m128i res_r0, res_r1, res_r2, res_r3;
    __m128i sad_val;
    WORD32 val1, val2;
    UNUSED (i4_max_sad);

    // Row 0-3 sad calculation
    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));

    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));

    res_r0 = _mm_sad_epu8(src_r0, est_r0);
    res_r1 = _mm_sad_epu8(src_r1, est_r1);
    res_r2 = _mm_sad_epu8(src_r2, est_r2);
    res_r3 = _mm_sad_epu8(src_r3, est_r3);

    sad_val = _mm_add_epi64(res_r0, res_r1);
    sad_val = _mm_add_epi64(sad_val, res_r2);
    sad_val = _mm_add_epi64(sad_val, res_r3);

    // Row 4-7 sad calculation
    pu1_src += 4*src_strd;
    pu1_est += 4*est_strd;

    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));

    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));

    res_r0 = _mm_sad_epu8(src_r0, est_r0);
    res_r1 = _mm_sad_epu8(src_r1, est_r1);
    res_r2 = _mm_sad_epu8(src_r2, est_r2);
    res_r3 = _mm_sad_epu8(src_r3, est_r3);

    sad_val = _mm_add_epi64(sad_val, res_r0);
    sad_val = _mm_add_epi64(sad_val, res_r1);
    sad_val = _mm_add_epi64(sad_val, res_r2);
    sad_val = _mm_add_epi64(sad_val, res_r3);

    val1 = _mm_extract_epi32(sad_val, 0);
    val2 = _mm_extract_epi32(sad_val, 2);
    *pi4_mb_distortion = (val1 + val2);
    return;
}

/**
 ******************************************************************************
 *
 * @brief computes distortion (SAD) between 2 16x16 blocks, with early exit
 *
 * @par Description
 *  This function computes the SAD between 2 16x16 blocks. Even rows are
 *  processed first; if their partial SAD already exceeds i4_max_sad, the
 *  function exits early, returning that partial value. To compute the
 *  distortion of the entire block, set i4_max_sad to a value no smaller
 *  than the maximum possible SAD.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[in] pu1_est
 *  UWORD8 pointer to the estimated block
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] est_strd
 *  integer estimate stride
 *
 * @param[in] i4_max_sad
 *  integer maximum allowed distortion
 *
 * @param[out] pi4_mb_distortion
 *  integer evaluated sad
 *
 * @remarks
 *
 ******************************************************************************
 */
void ime_compute_sad_16x16_ea8_sse42(UWORD8 *pu1_src,
                                     UWORD8 *pu1_est,
                                     WORD32 src_strd,
                                     WORD32 est_strd,
                                     WORD32 i4_max_sad,
                                     WORD32 *pi4_mb_distortion)
{
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i est_r0, est_r1, est_r2, est_r3;
    __m128i res_r0, res_r1, res_r2, res_r3;
    __m128i sad_val;
    WORD32 val1, val2;
    WORD32 i4_sad;
    UWORD8 *pu1_src_temp = pu1_src + src_strd;
    UWORD8 *pu1_est_temp = pu1_est + est_strd;

    // Row 0,2,4,6 sad calculation
    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd));
    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd));

    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd));
    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd));

    res_r0 = _mm_sad_epu8(src_r0, est_r0);
    res_r1 = _mm_sad_epu8(src_r1, est_r1);
    res_r2 = _mm_sad_epu8(src_r2, est_r2);
    res_r3 = _mm_sad_epu8(src_r3, est_r3);

    sad_val = _mm_add_epi64(res_r0, res_r1);
    sad_val = _mm_add_epi64(sad_val, res_r2);
    sad_val = _mm_add_epi64(sad_val, res_r3);

    // Row 8,10,12,14 sad calculation
    pu1_src += 8*src_strd;
    pu1_est += 8*est_strd;

    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd));
    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd));

    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd));
    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd));

    res_r0 = _mm_sad_epu8(src_r0, est_r0);
    res_r1 = _mm_sad_epu8(src_r1, est_r1);
    res_r2 = _mm_sad_epu8(src_r2, est_r2);
    res_r3 = _mm_sad_epu8(src_r3, est_r3);

    sad_val = _mm_add_epi64(sad_val, res_r0);
    sad_val = _mm_add_epi64(sad_val, res_r1);
    sad_val = _mm_add_epi64(sad_val, res_r2);
    sad_val = _mm_add_epi64(sad_val, res_r3);

    pu1_src = pu1_src_temp;
    pu1_est = pu1_est_temp;

    val1 = _mm_extract_epi32(sad_val, 0);
    val2 = _mm_extract_epi32(sad_val, 2);

    i4_sad = val1 + val2;
    if (i4_max_sad < i4_sad)
    {
        *pi4_mb_distortion = i4_sad;
        return;
    }

    // Row 1,3,5,7 sad calculation
    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd));
    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd));

    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd));
    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd));

    res_r0 = _mm_sad_epu8(src_r0, est_r0);
    res_r1 = _mm_sad_epu8(src_r1, est_r1);
    res_r2 = _mm_sad_epu8(src_r2, est_r2);
    res_r3 = _mm_sad_epu8(src_r3, est_r3);

    sad_val = _mm_add_epi64(sad_val, res_r0);
    sad_val = _mm_add_epi64(sad_val, res_r1);
    sad_val = _mm_add_epi64(sad_val, res_r2);
    sad_val = _mm_add_epi64(sad_val, res_r3);

    // Row 9,11,13,15 sad calculation
    pu1_src += 8*src_strd;
    pu1_est += 8*est_strd;
    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd));
    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd));

    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd));
    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd));

    res_r0 = _mm_sad_epu8(src_r0, est_r0);
    res_r1 = _mm_sad_epu8(src_r1, est_r1);
    res_r2 = _mm_sad_epu8(src_r2, est_r2);
    res_r3 = _mm_sad_epu8(src_r3, est_r3);

    sad_val = _mm_add_epi64(sad_val, res_r0);
    sad_val = _mm_add_epi64(sad_val, res_r1);
    sad_val = _mm_add_epi64(sad_val, res_r2);
    sad_val = _mm_add_epi64(sad_val, res_r3);

    val1 = _mm_extract_epi32(sad_val, 0);
    val2 = _mm_extract_epi32(sad_val, 2);
    *pi4_mb_distortion = (val1 + val2);

    return;
}

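/*
 * Usage sketch (hypothetical caller, not part of the library): in a motion
 * search loop the best SAD seen so far can be passed as i4_max_sad, letting
 * the routine above bail out after the even rows once the partial SAD
 * exceeds it. Assumes <limits.h> for INT_MAX; variable names mirror the
 * parameters of the routine above.
 */
#if 0
{
    WORD32 i4_best_sad = INT_MAX;
    WORD32 i4_cur_sad;

    ime_compute_sad_16x16_ea8_sse42(pu1_src, pu1_est, src_strd, est_strd,
                                    i4_best_sad, &i4_cur_sad);
    /* On early exit i4_cur_sad already exceeds i4_best_sad, so the
       candidate is rejected by this comparison. */
    if(i4_cur_sad < i4_best_sad)
        i4_best_sad = i4_cur_sad;
}
#endif
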
/**
 ******************************************************************************
 *
 * @brief computes distortion (SAD) between 2 16x16 blocks (fast mode)
 *
 * @par Description
 *  This function computes the SAD between 2 16x16 blocks by processing only
 *  alternate rows (fast mode). It assumes the SAD over alternate rows is
 *  approximately half that of the whole block, so the accumulated value is
 *  doubled before being returned.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[in] pu1_est
 *  UWORD8 pointer to the estimated block
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] est_strd
 *  integer estimate stride
 *
 * @param[in] i4_max_sad
 *  integer maximum allowed distortion (unused in this variant)
 *
 * @param[out] pi4_mb_distortion
 *  integer evaluated sad
 *
 * @remarks
 *
 ******************************************************************************
 */
void ime_compute_sad_16x16_fast_sse42(UWORD8 *pu1_src,
                                      UWORD8 *pu1_est,
                                      WORD32 src_strd,
                                      WORD32 est_strd,
                                      WORD32 i4_max_sad,
                                      WORD32 *pi4_mb_distortion)
{
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i est_r0, est_r1, est_r2, est_r3;
    __m128i res_r0, res_r1, res_r2, res_r3;
    __m128i sad_val;
    WORD32 val1, val2;
    WORD32 i4_sad;
    UWORD8 *pu1_src_temp = pu1_src + src_strd;
    UWORD8 *pu1_est_temp = pu1_est + est_strd;
    UNUSED (i4_max_sad);

    // Row 0,2,4,6 sad calculation
    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2 * src_strd));
    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4 * src_strd));
    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6 * src_strd));

    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2 * est_strd));
    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4 * est_strd));
    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6 * est_strd));

    res_r0 = _mm_sad_epu8(src_r0, est_r0);
    res_r1 = _mm_sad_epu8(src_r1, est_r1);
    res_r2 = _mm_sad_epu8(src_r2, est_r2);
    res_r3 = _mm_sad_epu8(src_r3, est_r3);

    sad_val = _mm_add_epi64(res_r0, res_r1);
    sad_val = _mm_add_epi64(sad_val, res_r2);
    sad_val = _mm_add_epi64(sad_val, res_r3);

    // Row 8,10,12,14 sad calculation
    pu1_src += 8 * src_strd;
    pu1_est += 8 * est_strd;

    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2 * src_strd));
    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4 * src_strd));
    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6 * src_strd));

    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2 * est_strd));
    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4 * est_strd));
    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6 * est_strd));

    res_r0 = _mm_sad_epu8(src_r0, est_r0);
    res_r1 = _mm_sad_epu8(src_r1, est_r1);
    res_r2 = _mm_sad_epu8(src_r2, est_r2);
    res_r3 = _mm_sad_epu8(src_r3, est_r3);

    sad_val = _mm_add_epi64(sad_val, res_r0);
    sad_val = _mm_add_epi64(sad_val, res_r1);
    sad_val = _mm_add_epi64(sad_val, res_r2);
    sad_val = _mm_add_epi64(sad_val, res_r3);

    pu1_src = pu1_src_temp;
    pu1_est = pu1_est_temp;

    val1 = _mm_extract_epi32(sad_val, 0);
    val2 = _mm_extract_epi32(sad_val, 2);

    i4_sad = val1 + val2;
    *pi4_mb_distortion = (i4_sad << 1);
    return;
}

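/*
 * Note on the fast-mode scaling: only rows 0,2,...,14 are compared (128 of
 * the 256 pixel pairs) and the accumulated value is doubled (i4_sad << 1)
 * under the assumption
 *
 *     SAD_full  ~=  2 * sum over even r, c in [0,15] of |src[r][c] - est[r][c]|
 *
 * which holds when the residual varies slowly from row to row.
 */
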
/**
 *******************************************************************************
 *
 * @brief compute sad
 *
 * @par Description: This function computes the SAD at the four vertices of a
 *  diamond grid centered at the reference pointer, each at unit distance
 *  from it.
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the reference
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[in] ref_strd
 *  integer reference stride
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[out] pi4_sad
 *  pointer to integer array receiving the SAD at all evaluated vertices
 *  (pi4_sad[0] - left, pi4_sad[1] - right, pi4_sad[2] - top, pi4_sad[3] - bottom)
 *
 * @remarks none
 *
 *******************************************************************************
 */
void ime_calculate_sad4_prog_sse42(UWORD8 *pu1_ref,
                                   UWORD8 *pu1_src,
                                   WORD32 ref_strd,
                                   WORD32 src_strd,
                                   WORD32 *pi4_sad)
{
    /* reference ptrs at unit 1 distance in diamond pattern centered at pu1_ref */
    UWORD8 *left_ptr = pu1_ref - 1;
    UWORD8 *right_ptr = pu1_ref + 1;
    UWORD8 *top_ptr = pu1_ref - ref_strd;
    UWORD8 *bot_ptr = pu1_ref + ref_strd;

    WORD32 val1, val2;
    __m128i src, ref_left, ref_right, ref_top, ref_bot;
    __m128i res_r0, res_r1, res_r2, res_r3;
    __m128i sad_r0, sad_r1, sad_r2, sad_r3;

    // Row 0 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));

    sad_r0 = _mm_sad_epu8(src, ref_left);
    sad_r1 = _mm_sad_epu8(src, ref_right);
    sad_r2 = _mm_sad_epu8(src, ref_top);
    sad_r3 = _mm_sad_epu8(src, ref_bot);

    pu1_src += src_strd;
    left_ptr += ref_strd;
    right_ptr += ref_strd;
    top_ptr += ref_strd;
    bot_ptr += ref_strd;

    // Row 1 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));

    res_r0 = _mm_sad_epu8(src, ref_left);
    res_r1 = _mm_sad_epu8(src, ref_right);
    res_r2 = _mm_sad_epu8(src, ref_top);
    res_r3 = _mm_sad_epu8(src, ref_bot);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);

    pu1_src += src_strd;
    left_ptr += ref_strd;
    right_ptr += ref_strd;
    top_ptr += ref_strd;
    bot_ptr += ref_strd;

    // Row 2 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));

    res_r0 = _mm_sad_epu8(src, ref_left);
    res_r1 = _mm_sad_epu8(src, ref_right);
    res_r2 = _mm_sad_epu8(src, ref_top);
    res_r3 = _mm_sad_epu8(src, ref_bot);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);

    pu1_src += src_strd;
    left_ptr += ref_strd;
    right_ptr += ref_strd;
    top_ptr += ref_strd;
    bot_ptr += ref_strd;

    // Row 3 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));

    res_r0 = _mm_sad_epu8(src, ref_left);
    res_r1 = _mm_sad_epu8(src, ref_right);
    res_r2 = _mm_sad_epu8(src, ref_top);
    res_r3 = _mm_sad_epu8(src, ref_bot);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);

    pu1_src += src_strd;
    left_ptr += ref_strd;
    right_ptr += ref_strd;
    top_ptr += ref_strd;
    bot_ptr += ref_strd;

    // Row 4 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));

    res_r0 = _mm_sad_epu8(src, ref_left);
    res_r1 = _mm_sad_epu8(src, ref_right);
    res_r2 = _mm_sad_epu8(src, ref_top);
    res_r3 = _mm_sad_epu8(src, ref_bot);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);

    pu1_src += src_strd;
    left_ptr += ref_strd;
    right_ptr += ref_strd;
    top_ptr += ref_strd;
    bot_ptr += ref_strd;

    // Row 5 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));

    res_r0 = _mm_sad_epu8(src, ref_left);
    res_r1 = _mm_sad_epu8(src, ref_right);
    res_r2 = _mm_sad_epu8(src, ref_top);
    res_r3 = _mm_sad_epu8(src, ref_bot);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);

    pu1_src += src_strd;
    left_ptr += ref_strd;
    right_ptr += ref_strd;
    top_ptr += ref_strd;
    bot_ptr += ref_strd;

    // Row 6 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));

    res_r0 = _mm_sad_epu8(src, ref_left);
    res_r1 = _mm_sad_epu8(src, ref_right);
    res_r2 = _mm_sad_epu8(src, ref_top);
    res_r3 = _mm_sad_epu8(src, ref_bot);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);

    pu1_src += src_strd;
    left_ptr += ref_strd;
    right_ptr += ref_strd;
    top_ptr += ref_strd;
    bot_ptr += ref_strd;

    // Row 7 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));

    res_r0 = _mm_sad_epu8(src, ref_left);
    res_r1 = _mm_sad_epu8(src, ref_right);
    res_r2 = _mm_sad_epu8(src, ref_top);
    res_r3 = _mm_sad_epu8(src, ref_bot);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);

    pu1_src += src_strd;
    left_ptr += ref_strd;
    right_ptr += ref_strd;
    top_ptr += ref_strd;
    bot_ptr += ref_strd;

    // Row 8 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));

    res_r0 = _mm_sad_epu8(src, ref_left);
    res_r1 = _mm_sad_epu8(src, ref_right);
    res_r2 = _mm_sad_epu8(src, ref_top);
    res_r3 = _mm_sad_epu8(src, ref_bot);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);

    pu1_src += src_strd;
    left_ptr += ref_strd;
    right_ptr += ref_strd;
    top_ptr += ref_strd;
    bot_ptr += ref_strd;

    // Row 9 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));

    res_r0 = _mm_sad_epu8(src, ref_left);
    res_r1 = _mm_sad_epu8(src, ref_right);
    res_r2 = _mm_sad_epu8(src, ref_top);
    res_r3 = _mm_sad_epu8(src, ref_bot);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);

    pu1_src += src_strd;
    left_ptr += ref_strd;
    right_ptr += ref_strd;
    top_ptr += ref_strd;
    bot_ptr += ref_strd;

    // Row 10 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));

    res_r0 = _mm_sad_epu8(src, ref_left);
    res_r1 = _mm_sad_epu8(src, ref_right);
    res_r2 = _mm_sad_epu8(src, ref_top);
    res_r3 = _mm_sad_epu8(src, ref_bot);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);

    pu1_src += src_strd;
    left_ptr += ref_strd;
    right_ptr += ref_strd;
    top_ptr += ref_strd;
    bot_ptr += ref_strd;

    // Row 11 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));

    res_r0 = _mm_sad_epu8(src, ref_left);
    res_r1 = _mm_sad_epu8(src, ref_right);
    res_r2 = _mm_sad_epu8(src, ref_top);
    res_r3 = _mm_sad_epu8(src, ref_bot);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);

    pu1_src += src_strd;
    left_ptr += ref_strd;
    right_ptr += ref_strd;
    top_ptr += ref_strd;
    bot_ptr += ref_strd;

    // Row 12 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));

    res_r0 = _mm_sad_epu8(src, ref_left);
    res_r1 = _mm_sad_epu8(src, ref_right);
    res_r2 = _mm_sad_epu8(src, ref_top);
    res_r3 = _mm_sad_epu8(src, ref_bot);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);

    pu1_src += src_strd;
    left_ptr += ref_strd;
    right_ptr += ref_strd;
    top_ptr += ref_strd;
    bot_ptr += ref_strd;

    // Row 13 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));

    res_r0 = _mm_sad_epu8(src, ref_left);
    res_r1 = _mm_sad_epu8(src, ref_right);
    res_r2 = _mm_sad_epu8(src, ref_top);
    res_r3 = _mm_sad_epu8(src, ref_bot);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);

    pu1_src += src_strd;
    left_ptr += ref_strd;
    right_ptr += ref_strd;
    top_ptr += ref_strd;
    bot_ptr += ref_strd;

    // Row 14 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));

    res_r0 = _mm_sad_epu8(src, ref_left);
    res_r1 = _mm_sad_epu8(src, ref_right);
    res_r2 = _mm_sad_epu8(src, ref_top);
    res_r3 = _mm_sad_epu8(src, ref_bot);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);

    pu1_src += src_strd;
    left_ptr += ref_strd;
    right_ptr += ref_strd;
    top_ptr += ref_strd;
    bot_ptr += ref_strd;

    // Row 15 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));

    res_r0 = _mm_sad_epu8(src, ref_left);
    res_r1 = _mm_sad_epu8(src, ref_right);
    res_r2 = _mm_sad_epu8(src, ref_top);
    res_r3 = _mm_sad_epu8(src, ref_bot);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);

    val1 = _mm_extract_epi32(sad_r0, 0);
    val2 = _mm_extract_epi32(sad_r0, 2);
    pi4_sad[0] = (val1 + val2);

    val1 = _mm_extract_epi32(sad_r1, 0);
    val2 = _mm_extract_epi32(sad_r1, 2);
    pi4_sad[1] = (val1 + val2);

    val1 = _mm_extract_epi32(sad_r2, 0);
    val2 = _mm_extract_epi32(sad_r2, 2);
    pi4_sad[2] = (val1 + val2);

    val1 = _mm_extract_epi32(sad_r3, 0);
    val2 = _mm_extract_epi32(sad_r3, 2);
    pi4_sad[3] = (val1 + val2);
}

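/*
 * The four candidates form a diamond at unit distance around pu1_ref:
 *
 *                    top (pu1_ref - ref_strd)
 *   left (pu1_ref - 1)          right (pu1_ref + 1)
 *                    bot (pu1_ref + ref_strd)
 *
 * A minimal scalar sketch of the same computation (hypothetical, kept in
 * #if 0; names mirror the parameters of the SIMD routine above, which is
 * the actual implementation):
 */
#if 0
{
    WORD32 i, row, col;
    UWORD8 *apu1_ref[4] = { pu1_ref - 1, pu1_ref + 1,
                            pu1_ref - ref_strd, pu1_ref + ref_strd };

    for(i = 0; i < 4; i++)
    {
        WORD32 sad = 0;
        UWORD8 *pu1_r = apu1_ref[i];
        UWORD8 *pu1_s = pu1_src;

        for(row = 0; row < 16; row++)
        {
            for(col = 0; col < 16; col++)
            {
                WORD32 diff = (WORD32)pu1_s[col] - (WORD32)pu1_r[col];
                sad += (diff < 0) ? -diff : diff;
            }
            pu1_s += src_strd;
            pu1_r += ref_strd;
        }
        pi4_sad[i] = sad;   /* [0] left, [1] right, [2] top, [3] bottom */
    }
}
#endif
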
/**
 ******************************************************************************
 *
 * @brief computes distortion (SAD) at all subpel points about the src location
 *
 * @par Description
 *  This function computes the SAD at all points at a subpel distance from
 *  the current source location.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[in] pu1_ref_half_x
 *  UWORD8 pointer to half pel buffer
 *
 * @param[in] pu1_ref_half_y
 *  UWORD8 pointer to half pel buffer
 *
 * @param[in] pu1_ref_half_xy
 *  UWORD8 pointer to half pel buffer
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] ref_strd
 *  integer ref stride
 *
 * @param[out] pi4_sad
 *  integer evaluated sad
 *  pi4_sad[0] - half x
 *  pi4_sad[1] - half x - 1
 *  pi4_sad[2] - half y
 *  pi4_sad[3] - half y - strd
 *  pi4_sad[4] - half xy
 *  pi4_sad[5] - half xy - 1
 *  pi4_sad[6] - half xy - strd
 *  pi4_sad[7] - half xy - 1 - strd
 *
 * @remarks
 *
 ******************************************************************************
 */
void ime_sub_pel_compute_sad_16x16_sse42(UWORD8 *pu1_src,
                                         UWORD8 *pu1_ref_half_x,
                                         UWORD8 *pu1_ref_half_y,
                                         UWORD8 *pu1_ref_half_xy,
                                         WORD32 src_strd,
                                         WORD32 ref_strd,
                                         WORD32 *pi4_sad)
{
    UWORD8 *pu1_ref_half_x_left = pu1_ref_half_x - 1;
    UWORD8 *pu1_ref_half_y_top = pu1_ref_half_y - ref_strd;
    UWORD8 *pu1_ref_half_xy_left = pu1_ref_half_xy - 1;
    UWORD8 *pu1_ref_half_xy_top = pu1_ref_half_xy - ref_strd;
    UWORD8 *pu1_ref_half_xy_top_left = pu1_ref_half_xy - ref_strd - 1;
    WORD32 val1, val2;

    __m128i src, ref_half_x, ref_half_y, ref_half_xy;
    __m128i ref_half_x_left, ref_half_y_top, ref_half_xy_left, ref_half_xy_top, ref_half_xy_top_left;
    __m128i res_r0, res_r1, res_r2, res_r3, res_r4, res_r5, res_r6, res_r7;
    __m128i sad_r0, sad_r1, sad_r2, sad_r3, sad_r4, sad_r5, sad_r6, sad_r7;

    // Row 0 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));

    sad_r0 = _mm_sad_epu8(src, ref_half_x);
    sad_r1 = _mm_sad_epu8(src, ref_half_x_left);
    sad_r2 = _mm_sad_epu8(src, ref_half_y);
    sad_r3 = _mm_sad_epu8(src, ref_half_y_top);
    sad_r4 = _mm_sad_epu8(src, ref_half_xy);
    sad_r5 = _mm_sad_epu8(src, ref_half_xy_left);
    sad_r6 = _mm_sad_epu8(src, ref_half_xy_top);
    sad_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);

    pu1_src += src_strd;
    pu1_ref_half_x += ref_strd;
    pu1_ref_half_x_left += ref_strd;
    pu1_ref_half_y += ref_strd;
    pu1_ref_half_y_top += ref_strd;
    pu1_ref_half_xy += ref_strd;
    pu1_ref_half_xy_left += ref_strd;
    pu1_ref_half_xy_top += ref_strd;
    pu1_ref_half_xy_top_left += ref_strd;

    // Row 1 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));

    res_r0 = _mm_sad_epu8(src, ref_half_x);
    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
    res_r2 = _mm_sad_epu8(src, ref_half_y);
    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
    res_r4 = _mm_sad_epu8(src, ref_half_xy);
    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
    sad_r7 = _mm_add_epi64(sad_r7, res_r7);

    pu1_src += src_strd;
    pu1_ref_half_x += ref_strd;
    pu1_ref_half_x_left += ref_strd;
    pu1_ref_half_y += ref_strd;
    pu1_ref_half_y_top += ref_strd;
    pu1_ref_half_xy += ref_strd;
    pu1_ref_half_xy_left += ref_strd;
    pu1_ref_half_xy_top += ref_strd;
    pu1_ref_half_xy_top_left += ref_strd;

    // Row 2 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));

    res_r0 = _mm_sad_epu8(src, ref_half_x);
    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
    res_r2 = _mm_sad_epu8(src, ref_half_y);
    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
    res_r4 = _mm_sad_epu8(src, ref_half_xy);
    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
    sad_r7 = _mm_add_epi64(sad_r7, res_r7);

    pu1_src += src_strd;
    pu1_ref_half_x += ref_strd;
    pu1_ref_half_x_left += ref_strd;
    pu1_ref_half_y += ref_strd;
    pu1_ref_half_y_top += ref_strd;
    pu1_ref_half_xy += ref_strd;
    pu1_ref_half_xy_left += ref_strd;
    pu1_ref_half_xy_top += ref_strd;
    pu1_ref_half_xy_top_left += ref_strd;

    // Row 3 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));

    res_r0 = _mm_sad_epu8(src, ref_half_x);
    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
    res_r2 = _mm_sad_epu8(src, ref_half_y);
    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
    res_r4 = _mm_sad_epu8(src, ref_half_xy);
    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
    sad_r7 = _mm_add_epi64(sad_r7, res_r7);

    pu1_src += src_strd;
    pu1_ref_half_x += ref_strd;
    pu1_ref_half_x_left += ref_strd;
    pu1_ref_half_y += ref_strd;
    pu1_ref_half_y_top += ref_strd;
    pu1_ref_half_xy += ref_strd;
    pu1_ref_half_xy_left += ref_strd;
    pu1_ref_half_xy_top += ref_strd;
    pu1_ref_half_xy_top_left += ref_strd;

    // Row 4 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));

    res_r0 = _mm_sad_epu8(src, ref_half_x);
    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
    res_r2 = _mm_sad_epu8(src, ref_half_y);
    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
    res_r4 = _mm_sad_epu8(src, ref_half_xy);
    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
    sad_r7 = _mm_add_epi64(sad_r7, res_r7);

    pu1_src += src_strd;
    pu1_ref_half_x += ref_strd;
    pu1_ref_half_x_left += ref_strd;
    pu1_ref_half_y += ref_strd;
    pu1_ref_half_y_top += ref_strd;
    pu1_ref_half_xy += ref_strd;
    pu1_ref_half_xy_left += ref_strd;
    pu1_ref_half_xy_top += ref_strd;
    pu1_ref_half_xy_top_left += ref_strd;

    // Row 5 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));

    res_r0 = _mm_sad_epu8(src, ref_half_x);
    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
    res_r2 = _mm_sad_epu8(src, ref_half_y);
    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
    res_r4 = _mm_sad_epu8(src, ref_half_xy);
    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
    sad_r7 = _mm_add_epi64(sad_r7, res_r7);

    pu1_src += src_strd;
    pu1_ref_half_x += ref_strd;
    pu1_ref_half_x_left += ref_strd;
    pu1_ref_half_y += ref_strd;
    pu1_ref_half_y_top += ref_strd;
    pu1_ref_half_xy += ref_strd;
    pu1_ref_half_xy_left += ref_strd;
    pu1_ref_half_xy_top += ref_strd;
    pu1_ref_half_xy_top_left += ref_strd;

    // Row 6 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));

    res_r0 = _mm_sad_epu8(src, ref_half_x);
    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
    res_r2 = _mm_sad_epu8(src, ref_half_y);
    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
    res_r4 = _mm_sad_epu8(src, ref_half_xy);
    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
    sad_r7 = _mm_add_epi64(sad_r7, res_r7);

    pu1_src += src_strd;
    pu1_ref_half_x += ref_strd;
    pu1_ref_half_x_left += ref_strd;
    pu1_ref_half_y += ref_strd;
    pu1_ref_half_y_top += ref_strd;
    pu1_ref_half_xy += ref_strd;
    pu1_ref_half_xy_left += ref_strd;
    pu1_ref_half_xy_top += ref_strd;
    pu1_ref_half_xy_top_left += ref_strd;

    // Row 7 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));

    res_r0 = _mm_sad_epu8(src, ref_half_x);
    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
    res_r2 = _mm_sad_epu8(src, ref_half_y);
    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
    res_r4 = _mm_sad_epu8(src, ref_half_xy);
    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
    sad_r7 = _mm_add_epi64(sad_r7, res_r7);

    pu1_src += src_strd;
    pu1_ref_half_x += ref_strd;
    pu1_ref_half_x_left += ref_strd;
    pu1_ref_half_y += ref_strd;
    pu1_ref_half_y_top += ref_strd;
    pu1_ref_half_xy += ref_strd;
    pu1_ref_half_xy_left += ref_strd;
    pu1_ref_half_xy_top += ref_strd;
    pu1_ref_half_xy_top_left += ref_strd;

    // Row 8 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));

    res_r0 = _mm_sad_epu8(src, ref_half_x);
    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
    res_r2 = _mm_sad_epu8(src, ref_half_y);
    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
    res_r4 = _mm_sad_epu8(src, ref_half_xy);
    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
    sad_r7 = _mm_add_epi64(sad_r7, res_r7);

    pu1_src += src_strd;
    pu1_ref_half_x += ref_strd;
    pu1_ref_half_x_left += ref_strd;
    pu1_ref_half_y += ref_strd;
    pu1_ref_half_y_top += ref_strd;
    pu1_ref_half_xy += ref_strd;
    pu1_ref_half_xy_left += ref_strd;
    pu1_ref_half_xy_top += ref_strd;
    pu1_ref_half_xy_top_left += ref_strd;

    // Row 9 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));

    res_r0 = _mm_sad_epu8(src, ref_half_x);
    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
    res_r2 = _mm_sad_epu8(src, ref_half_y);
    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
    res_r4 = _mm_sad_epu8(src, ref_half_xy);
    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
    sad_r7 = _mm_add_epi64(sad_r7, res_r7);

    pu1_src += src_strd;
    pu1_ref_half_x += ref_strd;
    pu1_ref_half_x_left += ref_strd;
    pu1_ref_half_y += ref_strd;
    pu1_ref_half_y_top += ref_strd;
    pu1_ref_half_xy += ref_strd;
    pu1_ref_half_xy_left += ref_strd;
    pu1_ref_half_xy_top += ref_strd;
    pu1_ref_half_xy_top_left += ref_strd;

    // Row 10 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));

    res_r0 = _mm_sad_epu8(src, ref_half_x);
    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
    res_r2 = _mm_sad_epu8(src, ref_half_y);
    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
    res_r4 = _mm_sad_epu8(src, ref_half_xy);
    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
    sad_r7 = _mm_add_epi64(sad_r7, res_r7);

    pu1_src += src_strd;
    pu1_ref_half_x += ref_strd;
    pu1_ref_half_x_left += ref_strd;
    pu1_ref_half_y += ref_strd;
    pu1_ref_half_y_top += ref_strd;
    pu1_ref_half_xy += ref_strd;
    pu1_ref_half_xy_left += ref_strd;
    pu1_ref_half_xy_top += ref_strd;
    pu1_ref_half_xy_top_left += ref_strd;

    // Row 11 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));

    res_r0 = _mm_sad_epu8(src, ref_half_x);
    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
    res_r2 = _mm_sad_epu8(src, ref_half_y);
    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
    res_r4 = _mm_sad_epu8(src, ref_half_xy);
    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
    sad_r7 = _mm_add_epi64(sad_r7, res_r7);

    pu1_src += src_strd;
    pu1_ref_half_x += ref_strd;
    pu1_ref_half_x_left += ref_strd;
    pu1_ref_half_y += ref_strd;
    pu1_ref_half_y_top += ref_strd;
    pu1_ref_half_xy += ref_strd;
    pu1_ref_half_xy_left += ref_strd;
    pu1_ref_half_xy_top += ref_strd;
    pu1_ref_half_xy_top_left += ref_strd;

    // Row 12 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));

    res_r0 = _mm_sad_epu8(src, ref_half_x);
    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
    res_r2 = _mm_sad_epu8(src, ref_half_y);
    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
    res_r4 = _mm_sad_epu8(src, ref_half_xy);
    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);

    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
    sad_r7 = _mm_add_epi64(sad_r7, res_r7);

    pu1_src += src_strd;
    pu1_ref_half_x += ref_strd;
    pu1_ref_half_x_left += ref_strd;
    pu1_ref_half_y += ref_strd;
    pu1_ref_half_y_top += ref_strd;
    pu1_ref_half_xy += ref_strd;
    pu1_ref_half_xy_left += ref_strd;
    pu1_ref_half_xy_top += ref_strd;
    pu1_ref_half_xy_top_left += ref_strd;

    // Row 13 sad calculation
    src = _mm_loadu_si128((__m128i *) (pu1_src));
    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));

    res_r0 = _mm_sad_epu8(src, ref_half_x);
    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
    res_r2 = _mm_sad_epu8(src, ref_half_y);
    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
    res_r4 = _mm_sad_epu8(src, ref_half_xy);
    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1567
1568 sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1569 sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1570 sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1571 sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1572 sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1573 sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1574 sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1575 sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1576
1577 pu1_src += src_strd;
1578 pu1_ref_half_x += ref_strd;
1579 pu1_ref_half_x_left += ref_strd;
1580 pu1_ref_half_y += ref_strd;
1581 pu1_ref_half_y_top += ref_strd;
1582 pu1_ref_half_xy += ref_strd;
1583 pu1_ref_half_xy_left += ref_strd;
1584 pu1_ref_half_xy_top += ref_strd;
1585 pu1_ref_half_xy_top_left += ref_strd;
1586
1587 // Row 14 sad calculation
1588 src = _mm_loadu_si128((__m128i *) (pu1_src));
1589 ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1590 ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1591 ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1592 ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1593 ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1594 ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1595 ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1596 ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1597
1598 res_r0 = _mm_sad_epu8(src, ref_half_x);
1599 res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1600 res_r2 = _mm_sad_epu8(src, ref_half_y);
1601 res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1602 res_r4 = _mm_sad_epu8(src, ref_half_xy);
1603 res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1604 res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1605 res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1606
1607 sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1608 sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1609 sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1610 sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1611 sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1612 sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1613 sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1614 sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1615
1616 pu1_src += src_strd;
1617 pu1_ref_half_x += ref_strd;
1618 pu1_ref_half_x_left += ref_strd;
1619 pu1_ref_half_y += ref_strd;
1620 pu1_ref_half_y_top += ref_strd;
1621 pu1_ref_half_xy += ref_strd;
1622 pu1_ref_half_xy_left += ref_strd;
1623 pu1_ref_half_xy_top += ref_strd;
1624 pu1_ref_half_xy_top_left += ref_strd;
1625
1626 // Row 15 sad calculation
1627 src = _mm_loadu_si128((__m128i *) (pu1_src));
1628 ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1629 ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1630 ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1631 ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1632 ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1633 ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1634 ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1635 ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1636
1637 res_r0 = _mm_sad_epu8(src, ref_half_x);
1638 res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1639 res_r2 = _mm_sad_epu8(src, ref_half_y);
1640 res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1641 res_r4 = _mm_sad_epu8(src, ref_half_xy);
1642 res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1643 res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1644 res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1645
1646 sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1647 sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1648 sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1649 sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1650 sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1651 sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1652 sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1653 sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1654
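    /*
     * Horizontal reduction: after 16 rows, each sad_r* register holds two
     * 64-bit lanes of partial sums from _mm_sad_epu8 (pixels 0-7 in the low
     * lane, pixels 8-15 in the high lane). Adding the low dword of each lane
     * yields the complete 16x16 SAD for that half-pel candidate plane.
     */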
    val1 = _mm_extract_epi32(sad_r0, 0);
    val2 = _mm_extract_epi32(sad_r0, 2);
    pi4_sad[0] = (val1 + val2);

    val1 = _mm_extract_epi32(sad_r1, 0);
    val2 = _mm_extract_epi32(sad_r1, 2);
    pi4_sad[1] = (val1 + val2);

    val1 = _mm_extract_epi32(sad_r2, 0);
    val2 = _mm_extract_epi32(sad_r2, 2);
    pi4_sad[2] = (val1 + val2);

    val1 = _mm_extract_epi32(sad_r3, 0);
    val2 = _mm_extract_epi32(sad_r3, 2);
    pi4_sad[3] = (val1 + val2);

    val1 = _mm_extract_epi32(sad_r4, 0);
    val2 = _mm_extract_epi32(sad_r4, 2);
    pi4_sad[4] = (val1 + val2);

    val1 = _mm_extract_epi32(sad_r5, 0);
    val2 = _mm_extract_epi32(sad_r5, 2);
    pi4_sad[5] = (val1 + val2);

    val1 = _mm_extract_epi32(sad_r6, 0);
    val2 = _mm_extract_epi32(sad_r6, 2);
    pi4_sad[6] = (val1 + val2);

    val1 = _mm_extract_epi32(sad_r7, 0);
    val2 = _mm_extract_epi32(sad_r7, 2);
    pi4_sad[7] = (val1 + val2);

    return;
}
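
/*
 * A minimal scalar sketch (illustration only; pu1_ref and plane are
 * hypothetical names, not library identifiers): for each of the eight
 * half-pel candidate planes, the unrolled SSE4.2 code above accumulates
 * the same quantity as
 *
 *     WORD32 sad = 0;
 *     for(row = 0; row < 16; row++)
 *         for(col = 0; col < 16; col++)
 *         {
 *             WORD32 diff = pu1_src[row * src_strd + col] -
 *                           pu1_ref[row * ref_strd + col];
 *             sad += (diff < 0) ? -diff : diff;
 *         }
 *
 * with _mm_sad_epu8 covering one 16-pixel row per candidate per
 * instruction, and the result written to pi4_sad[plane].
 */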
/**
******************************************************************************
*
* @brief computes SAD between two 16x16 blocks and checks whether the block
* will be zero after the H.264 forward transform and quantization
* (luma inter macroblocks)
*
* @param[in] pu1_src
* UWORD8 pointer to the source
*
* @param[in] pu1_est
* UWORD8 pointer to the estimated (predicted) block
*
* @param[in] src_strd
* integer source stride
*
* @param[in] est_strd
* integer estimate stride
*
* @param[in] pu2_thrsh
* Threshold for each element of the transformed and quantized block
*
* @param[out] pi4_mb_distortion
* integer evaluated sad
*
* @param[out] pu4_is_zero
* Pointer to store whether the block is zero after transform and
* quantization
*
* @remarks
*
******************************************************************************
*/
void ime_compute_satqd_16x16_lumainter_sse42(UWORD8 *pu1_src,
                                             UWORD8 *pu1_est,
                                             WORD32 src_strd,
                                             WORD32 est_strd,
                                             UWORD16 *pu2_thrsh,
                                             WORD32 *pi4_mb_distortion,
                                             UWORD32 *pu4_is_zero)
{
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i est_r0, est_r1, est_r2, est_r3;
    __m128i temp0, temp1, temp2, temp3, temp4;
    __m128i zero = _mm_setzero_si128(); // all bits reset to zero
    __m128i all_one = _mm_set1_epi8(0xFF);
    __m128i sad_b1, sad_b2, threshold;
    WORD16 sad_1, sad_2;
    WORD32 i;
    UWORD32 flag = 0;
    WORD32 test1, test2;

    threshold = _mm_loadu_si128((__m128i *) pu2_thrsh);
    (*pi4_mb_distortion) = 0;

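    /*
     * The 16x16 MB is processed as four iterations of four rows: each
     * iteration handles the left 8 columns (two 4x4 blocks), then the
     * right 8 columns, before stepping down four rows.
     */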
    for (i = 0; i < 4; i++)
    {
        src_r0 = _mm_loadl_epi64((__m128i *) pu1_src); // Row 0 - Blocks 1 and 2
        src_r1 = _mm_loadl_epi64((__m128i *) (pu1_src + src_strd)); // Row 1 - Blocks 1 and 2
        src_r2 = _mm_loadl_epi64((__m128i *) (pu1_src + 2 * src_strd)); // Row 2 - Blocks 1 and 2
        src_r3 = _mm_loadl_epi64((__m128i *) (pu1_src + 3 * src_strd)); // Row 3 - Blocks 1 and 2

        src_r0 = _mm_cvtepu8_epi16(src_r0);
        src_r1 = _mm_cvtepu8_epi16(src_r1);
        src_r2 = _mm_cvtepu8_epi16(src_r2);
        src_r3 = _mm_cvtepu8_epi16(src_r3);

        est_r0 = _mm_loadl_epi64((__m128i *) pu1_est);
        est_r1 = _mm_loadl_epi64((__m128i *) (pu1_est + est_strd));
        est_r2 = _mm_loadl_epi64((__m128i *) (pu1_est + 2 * est_strd));
        est_r3 = _mm_loadl_epi64((__m128i *) (pu1_est + 3 * est_strd));

        est_r0 = _mm_cvtepu8_epi16(est_r0);
        est_r1 = _mm_cvtepu8_epi16(est_r1);
        est_r2 = _mm_cvtepu8_epi16(est_r2);
        est_r3 = _mm_cvtepu8_epi16(est_r3);

        src_r0 = _mm_sub_epi16(src_r0, est_r0);
        src_r1 = _mm_sub_epi16(src_r1, est_r1);
        src_r2 = _mm_sub_epi16(src_r2, est_r2);
        src_r3 = _mm_sub_epi16(src_r3, est_r3);

        src_r0 = _mm_abs_epi16(src_r0);
        src_r1 = _mm_abs_epi16(src_r1);
        src_r2 = _mm_abs_epi16(src_r2);
        src_r3 = _mm_abs_epi16(src_r3);

        src_r0 = _mm_add_epi16(src_r0, src_r3); // s1 s4 s4 s1 a1 a4 a4 a1
        src_r1 = _mm_add_epi16(src_r1, src_r2); // s2 s3 s3 s2 a2 a3 a3 a2

        // SAD calculation
        temp0 = _mm_add_epi16(src_r0, src_r1); // s1+s2 s4+s3 s4+s3 s1+s2 a1+a2 a4+a3 a4+a3 a1+a2
        temp0 = _mm_hadd_epi16(temp0, zero);
        temp0 = _mm_hadd_epi16(temp0, zero); // sad1, sad2 - 16-bit values

        sad_1 = _mm_extract_epi16(temp0, 0);
        sad_2 = _mm_extract_epi16(temp0, 1);

        (*pi4_mb_distortion) += sad_1 + sad_2;

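        /*
         * Zero-block estimation: while no earlier 4x4 block has failed the
         * test (flag == 0), form the "lsi" level sums of each 4x4 block from
         * row/column sums of the absolute differences and compare them with
         * the per-coefficient thresholds in pu2_thrsh. If every lsi is below
         * its threshold and the block SAD is below pu2_thrsh[8], the block is
         * predicted to quantize to all-zero coefficients; otherwise flag is
         * set and the check is skipped for the rest of the MB.
         */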
        if (flag == 0) {
            sad_b1 = _mm_set1_epi16((sad_1 << 1));
            sad_b2 = _mm_set1_epi16((sad_2 << 1));

            src_r0 = _mm_shufflelo_epi16(src_r0, 0x9c); // Block 0: s1 s1 s4 s4 a1 a4 a4 a1
            src_r0 = _mm_shufflehi_epi16(src_r0, 0x9c); // Block 1: s1 s1 s4 s4 a1 a1 a4 a4

            src_r1 = _mm_shufflelo_epi16(src_r1, 0x9c); // Block 0: s2 s2 s3 s3 a2 a3 a3 a2
            src_r1 = _mm_shufflehi_epi16(src_r1, 0x9c); // Block 1: s2 s2 s3 s3 a2 a2 a3 a3

            src_r0 = _mm_hadd_epi16(src_r0, zero); // s1 s4 a1 a4 0 0 0 0
            src_r1 = _mm_hadd_epi16(src_r1, zero); // s2 s3 a2 a3 0 0 0 0

            temp0 = _mm_slli_epi16(src_r0, 1); // s1<<1 s4<<1 a1<<1 a4<<1 0 0 0 0
            temp1 = _mm_slli_epi16(src_r1, 1); // s2<<1 s3<<1 a2<<1 a3<<1 0 0 0 0

            temp0 = _mm_shufflelo_epi16(temp0, 0xb1); // s4<<1 s1<<1 a4<<1 a1<<1 0 0 0 0
            temp1 = _mm_shufflelo_epi16(temp1, 0xb1); // s3<<1 s2<<1 a3<<1 a2<<1 0 0 0 0

            temp2 = _mm_sub_epi16(src_r0, temp1); // (s1-s3<<1) (s4-s2<<1) (a1-a3<<1) (a4-a2<<1) 0 0 0 0
            temp3 = _mm_sub_epi16(src_r1, temp0); // (s2-s4<<1) (s3-s1<<1) (a2-a4<<1) (a3-a1<<1) 0 0 0 0

            temp4 = _mm_add_epi16(src_r0, src_r1); // s1+s2 s4+s3 a1+a2 a4+a3 0 0 0 0

            temp0 = _mm_hadd_epi16(src_r0, zero); // s1+s4 a1+a4 0 0 0 0 0 0
            temp1 = _mm_hadd_epi16(src_r1, zero); // s2+s3 a2+a3 0 0 0 0 0 0

            temp0 = _mm_unpacklo_epi16(temp0, temp1); // s1+s4 s2+s3 a1+a4 a2+a3 0 0 0 0

            temp0 = _mm_unpacklo_epi32(temp0, temp2); // s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1)
            temp1 = _mm_unpacklo_epi32(temp4, temp3); // s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1) a1+a2 a4+a3 (a2-a4<<1) (a3-a1<<1)

            temp2 = _mm_unpacklo_epi64(temp0, temp1); // s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1)
            temp3 = _mm_unpackhi_epi64(temp0, temp1); // a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1) a1+a2 a4+a3 (a2-a4<<1) (a3-a1<<1)

            sad_b1 = _mm_sub_epi16(sad_b1, temp2); // lsi values, Block 0
            sad_b2 = _mm_sub_epi16(sad_b2, temp3); // lsi values, Block 1

            temp0 = _mm_cmpgt_epi16(threshold, sad_b1); // where threshold[i] > ls[i], the corresponding 16-bit value in temp becomes 0xffff
            temp1 = _mm_cmpgt_epi16(threshold, sad_b2);

            temp0 = _mm_xor_si128(temp0, all_one); // XOR with all-ones => bitwise NOT
            temp1 = _mm_xor_si128(temp1, all_one);

            test1 = _mm_test_all_zeros(temp0, all_one);
            test2 = _mm_test_all_zeros(temp1, all_one);

            if (test1 == 0 || test2 == 0 || pu2_thrsh[8] <= sad_1
                    || pu2_thrsh[8] <= sad_2)
                flag = 1;
        }

        pu1_src += 8;
        pu1_est += 8;

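        // Repeat the same SAD and zero-block computation for the right
        // 8 columns (the other two 4x4 blocks) of these four rows.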
        src_r0 = _mm_loadl_epi64((__m128i *) pu1_src); // Row 0 - Blocks 3 and 4
        src_r1 = _mm_loadl_epi64((__m128i *) (pu1_src + src_strd)); // Row 1 - Blocks 3 and 4
        src_r2 = _mm_loadl_epi64((__m128i *) (pu1_src + 2 * src_strd)); // Row 2 - Blocks 3 and 4
        src_r3 = _mm_loadl_epi64((__m128i *) (pu1_src + 3 * src_strd)); // Row 3 - Blocks 3 and 4

        src_r0 = _mm_cvtepu8_epi16(src_r0);
        src_r1 = _mm_cvtepu8_epi16(src_r1);
        src_r2 = _mm_cvtepu8_epi16(src_r2);
        src_r3 = _mm_cvtepu8_epi16(src_r3);

        est_r0 = _mm_loadl_epi64((__m128i *) pu1_est);
        est_r1 = _mm_loadl_epi64((__m128i *) (pu1_est + est_strd));
        est_r2 = _mm_loadl_epi64((__m128i *) (pu1_est + 2 * est_strd));
        est_r3 = _mm_loadl_epi64((__m128i *) (pu1_est + 3 * est_strd));

        est_r0 = _mm_cvtepu8_epi16(est_r0);
        est_r1 = _mm_cvtepu8_epi16(est_r1);
        est_r2 = _mm_cvtepu8_epi16(est_r2);
        est_r3 = _mm_cvtepu8_epi16(est_r3);

        src_r0 = _mm_sub_epi16(src_r0, est_r0);
        src_r1 = _mm_sub_epi16(src_r1, est_r1);
        src_r2 = _mm_sub_epi16(src_r2, est_r2);
        src_r3 = _mm_sub_epi16(src_r3, est_r3);

        src_r0 = _mm_abs_epi16(src_r0);
        src_r1 = _mm_abs_epi16(src_r1);
        src_r2 = _mm_abs_epi16(src_r2);
        src_r3 = _mm_abs_epi16(src_r3);

        src_r0 = _mm_add_epi16(src_r0, src_r3); // s1 s4 s4 s1 a1 a4 a4 a1
        src_r1 = _mm_add_epi16(src_r1, src_r2); // s2 s3 s3 s2 a2 a3 a3 a2

        // SAD calculation
        temp0 = _mm_add_epi16(src_r0, src_r1);
        temp0 = _mm_hadd_epi16(temp0, zero);
        temp0 = _mm_hadd_epi16(temp0, zero); // sad1, sad2 - 16-bit values

        sad_1 = _mm_extract_epi16(temp0, 0);
        sad_2 = _mm_extract_epi16(temp0, 1);

        (*pi4_mb_distortion) += sad_1 + sad_2;

        if (flag == 0) {
            sad_b1 = _mm_set1_epi16((sad_1 << 1));
            sad_b2 = _mm_set1_epi16((sad_2 << 1));

            src_r0 = _mm_shufflelo_epi16(src_r0, 0x9c); // Block 0: s1 s1 s4 s4 a1 a4 a4 a1
            src_r0 = _mm_shufflehi_epi16(src_r0, 0x9c); // Block 1: s1 s1 s4 s4 a1 a1 a4 a4

            src_r1 = _mm_shufflelo_epi16(src_r1, 0x9c); // Block 0: s2 s2 s3 s3 a2 a3 a3 a2
            src_r1 = _mm_shufflehi_epi16(src_r1, 0x9c); // Block 1: s2 s2 s3 s3 a2 a2 a3 a3

            src_r0 = _mm_hadd_epi16(src_r0, zero); // s1 s4 a1 a4 0 0 0 0
            src_r1 = _mm_hadd_epi16(src_r1, zero); // s2 s3 a2 a3 0 0 0 0

            temp0 = _mm_slli_epi16(src_r0, 1); // s1<<1 s4<<1 a1<<1 a4<<1 0 0 0 0
            temp1 = _mm_slli_epi16(src_r1, 1); // s2<<1 s3<<1 a2<<1 a3<<1 0 0 0 0

            temp0 = _mm_shufflelo_epi16(temp0, 0xb1); // s4<<1 s1<<1 a4<<1 a1<<1 0 0 0 0
            temp1 = _mm_shufflelo_epi16(temp1, 0xb1); // s3<<1 s2<<1 a3<<1 a2<<1 0 0 0 0

            temp2 = _mm_sub_epi16(src_r0, temp1); // (s1-s3<<1) (s4-s2<<1) (a1-a3<<1) (a4-a2<<1) 0 0 0 0
            temp3 = _mm_sub_epi16(src_r1, temp0); // (s2-s4<<1) (s3-s1<<1) (a2-a4<<1) (a3-a1<<1) 0 0 0 0

            temp4 = _mm_add_epi16(src_r0, src_r1); // s1+s2 s4+s3 a1+a2 a4+a3 0 0 0 0

            temp0 = _mm_hadd_epi16(src_r0, zero); // s1+s4 a1+a4 0 0 0 0 0 0
            temp1 = _mm_hadd_epi16(src_r1, zero); // s2+s3 a2+a3 0 0 0 0 0 0

            temp0 = _mm_unpacklo_epi16(temp0, temp1); // s1+s4 s2+s3 a1+a4 a2+a3 0 0 0 0

            temp0 = _mm_unpacklo_epi32(temp0, temp2); // s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1)
            temp1 = _mm_unpacklo_epi32(temp4, temp3); // s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1) a1+a2 a4+a3 (a2-a4<<1) (a3-a1<<1)

            temp2 = _mm_unpacklo_epi64(temp0, temp1); // s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1)
            temp3 = _mm_unpackhi_epi64(temp0, temp1); // a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1) a1+a2 a4+a3 (a2-a4<<1) (a3-a1<<1)

            sad_b1 = _mm_sub_epi16(sad_b1, temp2); // lsi values, Block 0
            sad_b2 = _mm_sub_epi16(sad_b2, temp3); // lsi values, Block 1

            temp0 = _mm_cmpgt_epi16(threshold, sad_b1); // where threshold[i] > ls[i], the corresponding 16-bit value in temp becomes 0xffff
            temp1 = _mm_cmpgt_epi16(threshold, sad_b2);

            temp0 = _mm_xor_si128(temp0, all_one); // XOR with all-ones => bitwise NOT
            temp1 = _mm_xor_si128(temp1, all_one);

            test1 = _mm_test_all_zeros(temp0, all_one);
            test2 = _mm_test_all_zeros(temp1, all_one);

            if (test1 == 0 || test2 == 0 || pu2_thrsh[8] <= sad_1
                    || pu2_thrsh[8] <= sad_2)
                flag = 1;
        }

        pu1_src += 4 * src_strd - 8;
        pu1_est += 4 * est_strd - 8;
    }

    *pu4_is_zero = flag;
}

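/*
 * Illustrative usage (a minimal sketch under assumed names; the buffers,
 * strides and threshold setup shown here are hypothetical, not library API):
 *
 *     UWORD16 au2_thrsh[9];  // 8 per-coefficient thresholds + SAD threshold
 *     WORD32 i4_distortion;
 *     UWORD32 u4_is_zero;
 *
 *     ime_compute_satqd_16x16_lumainter_sse42(pu1_cur_mb, pu1_pred_mb,
 *                                             i4_cur_strd, i4_pred_strd,
 *                                             au2_thrsh, &i4_distortion,
 *                                             &u4_is_zero);
 *
 *     // i4_distortion now holds the 16x16 SAD; u4_is_zero holds the
 *     // zero-block flag computed above.
 */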