1 /******************************************************************************
2 *
3 * Copyright (C) 2015 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20
21 /**
22 *******************************************************************************
23 * @file
24 * impeg2_inter_pred_sse42_intr.c
25 *
26 * @brief
27 * Contains Motion compensation function definitions for MPEG2 decoder
28 *
29 * @author
30 * Mohit [100664]
31 *
32 * - impeg2_copy_mb_sse42()
33 * - impeg2_interpolate_sse42()
34 * - impeg2_mc_halfx_halfy_8x8_sse42()
35 * - impeg2_mc_halfx_fully_8x8_sse42()
36 * - impeg2_mc_fullx_halfy_8x8_sse42()
37 * - impeg2_mc_fullx_fully_8x8_sse42()
38 *
39 * @remarks
40 * None
41 *
42 *******************************************************************************
43 */
44 #include <stdio.h>
45 #include <string.h>
46 #include "iv_datatypedef.h"
47 #include "impeg2_macros.h"
48 #include "impeg2_defs.h"
49 #include "impeg2_inter_pred.h"
50
51 #include <immintrin.h>
52 #include <emmintrin.h>
53 #include <smmintrin.h>
54 #include <tmmintrin.h>
55
56 /*******************************************************************************
* Function Name : impeg2_copy_mb_sse42
58 *
59 * Description : copies 3 components to the frame from mc_buf
60 *
61 * Arguments :
62 * src_buf : Source Buffer
63 * dst_buf : Destination Buffer
64 * src_wd : Source Width
65 * dst_wd : destination Width
66 *
67 * Values Returned : None
68 *******************************************************************************/
/* Copies one macroblock (16x16 luma plus two 8x8 chroma planes) from
 * src_buf to dst_buf with SSE loads/stores.
 *
 * src_wd / dst_wd are the luma strides of the source and destination
 * buffers; the chroma strides are assumed to be half of them (4:2:0). */
void impeg2_copy_mb_sse42(yuv_buf_t *src_buf,
                          yuv_buf_t *dst_buf,
                          UWORD32 src_wd,
                          UWORD32 dst_wd)
{
    UWORD8 *src;
    UWORD8 *dst;
    WORD32 i;
    __m128i src_r0, src_r1, src_r2, src_r3;

    /*******************************************************/
    /* copy Y : 16x16 block, four rows per iteration       */
    /*******************************************************/
    src = src_buf->pu1_y;
    dst = dst_buf->pu1_y;
    for(i = 0; i < 4; i++)
    {
        src_r0 = _mm_loadu_si128((__m128i *) (src));
        src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd));
        src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd));
        src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd));

        _mm_storeu_si128((__m128i *) dst, src_r0);
        _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1);
        _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2);
        _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3);

        src += 4 * src_wd;
        dst += 4 * dst_wd;
    }

    /* Chroma is subsampled 2:1 horizontally, so halve the strides. */
    src_wd >>= 1;
    dst_wd >>= 1;

    /*******************************************************/
    /* copy U : 8x8 block, four 8-byte rows per iteration  */
    /*******************************************************/
    src = src_buf->pu1_u;
    dst = dst_buf->pu1_u;
    for(i = 0; i < 2; i++)
    {
        src_r0 = _mm_loadl_epi64((__m128i *)src);
        src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd));
        src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd));
        src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd));

        _mm_storel_epi64((__m128i *)dst, src_r0);
        _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1);
        _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2);
        _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3);

        src += 4 * src_wd;
        dst += 4 * dst_wd;
    }

    /*******************************************************/
    /* copy V : 8x8 block, four 8-byte rows per iteration  */
    /*******************************************************/
    src = src_buf->pu1_v;
    dst = dst_buf->pu1_v;
    for(i = 0; i < 2; i++)
    {
        src_r0 = _mm_loadl_epi64((__m128i *)src);
        src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd));
        src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd));
        src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd));

        _mm_storel_epi64((__m128i *)dst, src_r0);
        _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1);
        _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2);
        _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3);

        src += 4 * src_wd;
        dst += 4 * dst_wd;
    }
}
197
198 /*****************************************************************************/
199 /* */
/* Function Name : impeg2_interpolate_sse42                                  */
201 /* */
202 /* Description : averages the contents of buf_src1 and buf_src2 and stores*/
203 /* result in buf_dst */
204 /* */
205 /* Inputs : buf_src1 - First Source */
206 /* buf_src2 - Second Source */
207 /* */
208 /* Globals : None */
209 /* */
210 /* Processing : Avg the values from two sources and store the result in */
211 /* destination buffer */
212 /* */
213 /* Outputs : buf_dst - Avg of contents of buf_src1 and buf_src2 */
214 /* */
215 /* Returns : None */
216 /* */
217 /* Issues : Assumes that all 3 buffers are of same size */
218 /* */
219 /*****************************************************************************/
/* Averages buf_src1 and buf_src2 (rounding up, via PAVGB) into buf_dst.
 *
 * The two source buffers are contiguous prediction buffers with a fixed
 * row stride of 16 for luma and 8 for chroma; 'stride' is the luma row
 * stride of the destination (chroma destination stride is stride/2). */
void impeg2_interpolate_sse42(yuv_buf_t *buf_src1,
                              yuv_buf_t *buf_src2,
                              yuv_buf_t *buf_dst,
                              UWORD32 stride)
{
    UWORD8 *src1, *src2;
    UWORD8 *dst;
    WORD32 i;
    __m128i src1_r0, src1_r1, src1_r2, src1_r3;
    __m128i src2_r0, src2_r1, src2_r2, src2_r3;

    /*******************************************************/
    /* interpolate Y : 16x16, four rows per iteration      */
    /*******************************************************/
    src1 = buf_src1->pu1_y;
    src2 = buf_src2->pu1_y;
    dst = buf_dst->pu1_y;
    for(i = 0; i < 4; i++)
    {
        src1_r0 = _mm_loadu_si128((__m128i *) (src1));
        src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16));
        src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16));
        src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16));

        src2_r0 = _mm_loadu_si128((__m128i *) (src2));
        src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16));
        src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16));
        src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16));

        /* PAVGB computes (a + b + 1) >> 1 per byte, the MPEG2 rounding. */
        src1_r0 = _mm_avg_epu8(src1_r0, src2_r0);
        src1_r1 = _mm_avg_epu8(src1_r1, src2_r1);
        src1_r2 = _mm_avg_epu8(src1_r2, src2_r2);
        src1_r3 = _mm_avg_epu8(src1_r3, src2_r3);

        _mm_storeu_si128((__m128i *) dst, src1_r0);
        _mm_storeu_si128((__m128i *) (dst + stride), src1_r1);
        _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2);
        _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3);

        src1 += 4 * 16;
        src2 += 4 * 16;
        dst += 4 * stride;
    }

    /* Chroma destination stride is half the luma stride (4:2:0). */
    stride >>= 1;

    /*******************************************************/
    /* interpolate U : 8x8, four rows per iteration        */
    /*******************************************************/
    src1 = buf_src1->pu1_u;
    src2 = buf_src2->pu1_u;
    dst = buf_dst->pu1_u;
    for(i = 0; i < 2; i++)
    {
        src1_r0 = _mm_loadl_epi64((__m128i *) (src1));
        src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8));
        src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8));
        src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8));

        src2_r0 = _mm_loadl_epi64((__m128i *) (src2));
        src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8));
        src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8));
        src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8));

        src1_r0 = _mm_avg_epu8(src1_r0, src2_r0);
        src1_r1 = _mm_avg_epu8(src1_r1, src2_r1);
        src1_r2 = _mm_avg_epu8(src1_r2, src2_r2);
        src1_r3 = _mm_avg_epu8(src1_r3, src2_r3);

        _mm_storel_epi64((__m128i *) dst, src1_r0);
        _mm_storel_epi64((__m128i *) (dst + stride), src1_r1);
        _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2);
        _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3);

        src1 += 4 * 8;
        src2 += 4 * 8;
        dst += 4 * stride;
    }

    /*******************************************************/
    /* interpolate V : 8x8, four rows per iteration        */
    /*******************************************************/
    src1 = buf_src1->pu1_v;
    src2 = buf_src2->pu1_v;
    dst = buf_dst->pu1_v;
    for(i = 0; i < 2; i++)
    {
        src1_r0 = _mm_loadl_epi64((__m128i *) (src1));
        src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8));
        src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8));
        src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8));

        src2_r0 = _mm_loadl_epi64((__m128i *) (src2));
        src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8));
        src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8));
        src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8));

        src1_r0 = _mm_avg_epu8(src1_r0, src2_r0);
        src1_r1 = _mm_avg_epu8(src1_r1, src2_r1);
        src1_r2 = _mm_avg_epu8(src1_r2, src2_r2);
        src1_r3 = _mm_avg_epu8(src1_r3, src2_r3);

        _mm_storel_epi64((__m128i *) dst, src1_r0);
        _mm_storel_epi64((__m128i *) (dst + stride), src1_r1);
        _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2);
        _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3);

        src1 += 4 * 8;
        src2 += 4 * 8;
        dst += 4 * stride;
    }
}
436
437 /*****************************************************************************/
438 /* */
439 /* Function Name : impeg2_mc_halfx_halfy_8x8_sse42() */
440 /* */
441 /* Description : Gets the buffer from (0.5,0.5) to (8.5,8.5) */
442 /* and the above block of size 8 x 8 will be placed as a */
443 /* block from the current position of out_buf */
444 /* */
445 /* Inputs : ref - Reference frame from which the block will be */
446 /* block will be extracted. */
447 /* ref_wid - WIdth of reference frame */
448 /* out_wid - WIdth of the output frame */
449 /* blk_width - width of the block */
/*                    blk_height - height of the block                       */
451 /* */
452 /* Globals : None */
453 /* */
454 /* Processing : Point to the (0,0),(1,0),(0,1),(1,1) position in */
455 /* the ref frame.Interpolate these four values to get the */
456 /* value at(0.5,0.5).Repeat this to get an 8 x 8 block */
457 /* using 9 x 9 block from reference frame */
458 /* */
459 /* Outputs : out - Output containing the extracted block */
460 /* */
461 /* Returns : None */
462 /* */
463 /* Issues : None */
464 /* */
465 /*****************************************************************************/
/* Half-pel prediction in both x and y: each output pixel is the rounded
 * average of the 2x2 neighbourhood
 *
 *     P0 P1
 *       Q        Q = (P0 + P1 + P2 + P3 + 2) >> 2
 *     P2 P3
 *
 * computed over a 9x9 reference window to produce an 8x8 block.
 * ref_wid / out_wid are the row strides of reference and output. */
void impeg2_mc_halfx_halfy_8x8_sse42(UWORD8 *out,
                                     UWORD8 *ref,
                                     UWORD32 ref_wid,
                                     UWORD32 out_wid)
{
    WORD32 i;
    __m128i row, row_1, cur_sum, prev_sum, res;
    __m128i round = _mm_set1_epi16(2); /* rounding term for >> 2 */

    /* Horizontal pair-sum of row 0 (ref[x] + ref[x+1]), widened to
     * 16 bits so the four-tap sum cannot overflow. */
    row      = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) ref));
    row_1    = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (ref + 1)));
    prev_sum = _mm_add_epi16(row, row_1);

    for(i = 0; i < 8; i++)
    {
        /* Horizontal pair-sum of the row below. */
        ref += ref_wid;
        row     = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) ref));
        row_1   = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (ref + 1)));
        cur_sum = _mm_add_epi16(row, row_1);

        /* Vertical interpolation: (above + below + 2) >> 2, then pack
         * back to bytes (only the low 8 bytes are stored). */
        res = _mm_add_epi16(prev_sum, cur_sum);
        res = _mm_add_epi16(res, round);
        res = _mm_srli_epi16(res, 2);
        res = _mm_packus_epi16(res, res);

        _mm_storel_epi64((__m128i *) out, res);

        out += out_wid;
        prev_sum = cur_sum; /* reuse this row's sum for the next output row */
    }

    return;
}
648
649 /*****************************************************************************/
650 /* */
651 /* Function Name : impeg2_mc_halfx_fully_8x8_sse42() */
652 /* */
653 /* Description : Gets the buffer from (0.5,0) to (8.5,8) */
654 /* and the above block of size 8 x 8 will be placed as a */
655 /* block from the current position of out_buf */
656 /* */
657 /* Inputs : ref - Reference frame from which the block will be */
658 /* block will be extracted. */
659 /* ref_wid - WIdth of reference frame */
660 /* out_wid - WIdth of the output frame */
661 /* blk_width - width of the block */
/*                    blk_height - height of the block                       */
663 /* */
664 /* Globals : None */
665 /* */
666 /* Processing : Point to the (0,0) and (1,0) position in the ref frame */
667 /* Interpolate these two values to get the value at(0.5,0) */
668 /* Repeat this to get an 8 x 8 block using 9 x 8 block from */
669 /* reference frame */
670 /* */
671 /* Outputs : out - Output containing the extracted block */
672 /* */
673 /* Returns : None */
674 /* */
675 /* Issues : None */
676 /* */
677 /*****************************************************************************/
/* Half-pel prediction in x only: each output pixel is the rounded
 * average of two horizontal neighbours,
 *
 *     P0  Q  P1      Q = (P0 + P1 + 1) >> 1  (PAVGB rounding),
 *
 * computed over a 9x8 reference window to produce an 8x8 block.
 * ref_wid / out_wid are the row strides of reference and output. */
void impeg2_mc_halfx_fully_8x8_sse42(UWORD8 *out,
                                     UWORD8 *ref,
                                     UWORD32 ref_wid,
                                     UWORD32 out_wid)
{
    WORD32 i;
    __m128i left, right;

    for(i = 0; i < 8; i++)
    {
        left  = _mm_loadl_epi64((__m128i *) ref);
        right = _mm_loadl_epi64((__m128i *) (ref + 1));

        _mm_storel_epi64((__m128i *) out, _mm_avg_epu8(left, right));

        ref += ref_wid;
        out += out_wid;
    }

    return;
}
756
757
758 /*****************************************************************************/
759 /* */
760 /* Function Name : impeg2_mc_fullx_halfy_8x8_sse42() */
761 /* */
762 /* Description : Gets the buffer from (0,0.5) to (8,8.5) */
763 /* and the above block of size 8 x 8 will be placed as a */
764 /* block from the current position of out_buf */
765 /* */
766 /* Inputs : ref - Reference frame from which the block will be */
767 /* block will be extracted. */
768 /* ref_wid - WIdth of reference frame */
769 /* out_wid - WIdth of the output frame */
770 /* blk_width - width of the block */
/*                    blk_height - height of the block                       */
772 /* */
773 /* Globals : None */
774 /* */
775 /* Processing : Point to the (0,0) and (0,1) position in the ref frame */
776 /* Interpolate these two values to get the value at(0,0.5) */
777 /* Repeat this to get an 8 x 8 block using 8 x 9 block from */
778 /* reference frame */
779 /* */
780 /* Outputs : out - Output containing the extracted block */
781 /* */
782 /* Returns : None */
783 /* */
784 /* Issues : None */
785 /* */
786 /*****************************************************************************/
/* Half-pel prediction in y only: each output pixel is the rounded
 * average of two vertical neighbours,
 *
 *     P0
 *      Q         Q = (P0 + P1 + 1) >> 1  (PAVGB rounding),
 *     P1
 *
 * computed over an 8x9 reference window to produce an 8x8 block.
 * ref_wid / out_wid are the row strides of reference and output. */
void impeg2_mc_fullx_halfy_8x8_sse42(UWORD8 *out,
                                     UWORD8 *ref,
                                     UWORD32 ref_wid,
                                     UWORD32 out_wid)
{
    WORD32 i;
    __m128i top, bot;

    top = _mm_loadl_epi64((__m128i *) ref); /* row 0 */

    for(i = 0; i < 8; i++)
    {
        ref += ref_wid;
        bot = _mm_loadl_epi64((__m128i *) ref); /* row below */

        _mm_storel_epi64((__m128i *) out, _mm_avg_epu8(top, bot));

        out += out_wid;
        top = bot; /* this row becomes the upper neighbour of the next */
    }

    return;
}
840
841 /*****************************************************************************/
842 /* */
843 /* Function Name : impeg2_mc_fullx_fully_8x8_sse42() */
844 /* */
845 /* Description : Gets the buffer from (x,y) to (x+8,y+8) */
846 /* and the above block of size 8 x 8 will be placed as a */
847 /* block from the current position of out_buf */
848 /* */
849 /* Inputs : ref - Reference frame from which the block will be */
850 /* block will be extracted. */
851 /* ref_wid - WIdth of reference frame */
852 /* out_wid - WIdth of the output frame */
853 /* blk_width - width of the block */
/*                    blk_height - height of the block                       */
855 /* */
856 /* Globals : None */
857 /* */
858 /* Processing : Point to the (0,0) position in the ref frame */
859 /* Get an 8 x 8 block from reference frame */
860 /* */
861 /* Outputs : out - Output containing the extracted block */
862 /* */
863 /* Returns : None */
864 /* */
865 /* Issues : None */
866 /* */
867 /*****************************************************************************/
/* Full-pel prediction: plain copy of an 8x8 block from ref to out.
 * ref_wid / out_wid are the row strides of reference and output. */
void impeg2_mc_fullx_fully_8x8_sse42(UWORD8 *out,
                                     UWORD8 *ref,
                                     UWORD32 ref_wid,
                                     UWORD32 out_wid)
{
    WORD32 i;
    __m128i row;

    for(i = 0; i < 8; i++)
    {
        row = _mm_loadl_epi64((__m128i *) ref);
        _mm_storel_epi64((__m128i *) out, row);

        ref += ref_wid;
        out += out_wid;
    }

    return;
}
900