• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <assert.h>
13 #include <emmintrin.h>  // SSE2
14 
15 #include "config/aom_config.h"
16 #include "config/aom_dsp_rtcd.h"
17 #include "config/aom_scale_rtcd.h"
18 
19 #include "aom/aom_integer.h"
20 #include "aom_dsp/blend.h"
21 #include "aom_dsp/x86/mem_sse2.h"
22 #include "aom_dsp/x86/synonyms.h"
23 
24 #include "av1/common/av1_common_int.h"
25 #include "av1/common/blockd.h"
26 #include "av1/common/mvref_common.h"
27 #include "av1/common/obmc.h"
28 #include "av1/common/reconinter.h"
29 #include "av1/common/reconintra.h"
30 #include "av1/encoder/reconinter_enc.h"
31 
/*
 * Builds the prediction for a width x height block located at `ref` (with an
 * optional sub-pixel offset) and writes it to comp_pred as a packed block,
 * i.e. the output row stride equals `width`.
 *
 * xd, cm, mi_row, mi_col, mv: only used when xd is non-NULL and the reference
 *     scale factors report scaling; in that case the prediction is delegated
 *     to av1_enc_build_one_inter_predictor() and the function returns early.
 * comp_pred:     output buffer, width * height bytes, written contiguously.
 * subpel_x_q3,
 * subpel_y_q3:   sub-pixel offsets; zero means no filtering along that axis.
 *                (Presumably 1/8-pel units, as suggested by the `>> 3` used
 *                for the intermediate height.)
 * ref,
 * ref_stride:    source pixels and their row stride.
 * subpel_search: selects the interpolation filter via av1_get_filter().
 */
void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
                             int mi_row, int mi_col, const MV *const mv,
                             uint8_t *comp_pred, int width, int height,
                             int subpel_x_q3, int subpel_y_q3,
                             const uint8_t *ref, int ref_stride,
                             int subpel_search) {
  // expect xd == NULL only in tests
  if (xd != NULL) {
    const MB_MODE_INFO *mi = xd->mi[0];
    const int ref_num = 0;
    const int is_intrabc = is_intrabc_block(mi);
    const struct scale_factors *const sf =
        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
    const int is_scaled = av1_is_scaled(sf);

    if (is_scaled) {
      // Scaled references cannot use the fast paths below; build the
      // predictor through the generic inter-prediction code for plane 0.
      int plane = 0;
      const int mi_x = mi_col * MI_SIZE;
      const int mi_y = mi_row * MI_SIZE;
      const struct macroblockd_plane *const pd = &xd->plane[plane];
      const struct buf_2d *const dst_buf = &pd->dst;
      const struct buf_2d *const pre_buf =
          is_intrabc ? dst_buf : &pd->pre[ref_num];

      InterPredParams inter_pred_params;
      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
      const int_interpfilters filters =
          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
      av1_init_inter_params(
          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
      av1_enc_build_one_inter_predictor(comp_pred, width, mv,
                                        &inter_pred_params);
      return;
    }
  }

  const InterpFilterParams *filter = av1_get_filter(subpel_search);
  // (TODO:yunqing) 2-tap case uses 4-tap functions since there is no SIMD for
  // 2-tap yet.
  int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS;

  if (!subpel_x_q3 && !subpel_y_q3) {
    // Full-pel position: plain copy from the strided source into the packed
    // destination, 16 bytes per store.
    if (width >= 16) {
      int i;
      assert(!(width & 15));
      /*Read 16 pixels one row at a time.*/
      for (i = 0; i < height; i++) {
        int j;
        for (j = 0; j < width; j += 16) {
          xx_storeu_128(comp_pred, xx_loadu_128(ref));
          comp_pred += 16;
          ref += 16;
        }
        ref += ref_stride - width;
      }
    } else if (width >= 8) {
      int i;
      assert(!(width & 7));
      assert(!(height & 1));
      /*Read 8 pixels two rows at a time.*/
      for (i = 0; i < height; i += 2) {
        __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
        __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
        xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
        comp_pred += 16;
        ref += 2 * ref_stride;
      }
    } else {
      int i;
      assert(!(width & 3));
      assert(!(height & 3));
      /*Read 4 pixels four rows at a time.*/
      // Each iteration consumes four source rows and emits 16 output bytes,
      // so the row counter must advance by four; stepping by one would run
      // past the end of both comp_pred and the reference block.
      for (i = 0; i < height; i += 4) {
        // Only the low 32 bits of each loaded row end up in the result.
        const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride);
        const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride);
        const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride);
        const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride);
        const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
                                               _mm_unpacklo_epi32(row2, row3));
        xx_storeu_128(comp_pred, reg);
        comp_pred += 16;
        ref += 4 * ref_stride;
      }
    }
  } else if (!subpel_y_q3) {
    // Horizontal-only interpolation straight into the output.
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
                        width, height);
  } else if (!subpel_x_q3) {
    // Vertical-only interpolation straight into the output.
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
                       width, height);
  } else {
    // Two-pass interpolation: the horizontal pass fills `temp` (row stride
    // MAX_SB_SIZE) with the extra rows the vertical filter needs, then the
    // vertical pass reads `temp` and writes the packed output.
    DECLARE_ALIGNED(16, uint8_t,
                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
    const int16_t *const kernel_x =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    const int16_t *const kernel_y =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    // Start (filter_taps / 2 - 1) rows above the block.
    const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
    // In the 4-tap case the horizontal output is placed filter_taps / 2 rows
    // into `temp` rather than at its start.
    uint8_t *temp_start_horiz = (subpel_search <= USE_4_TAPS)
                                    ? temp + (filter_taps >> 1) * MAX_SB_SIZE
                                    : temp;
    // The vertical pass start is derived from filter->taps, not filter_taps.
    uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
    int intermediate_height =
        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
    aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
                        kernel_x, 16, NULL, -1, width, intermediate_height);
    aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,
                       kernel_y, 16, width, height);
  }
}
149 
150 #if CONFIG_AV1_HIGHBITDEPTH
/*
 * High-bitdepth counterpart of the upsampled predictor: builds the prediction
 * for a width x height block located at `ref8` (with an optional sub-pixel
 * offset) and writes it to comp_pred8 as a packed block whose row stride is
 * `width` samples.
 *
 * xd, cm, mi_row, mi_col, mv: only used when xd is non-NULL and the reference
 *     scale factors report scaling; in that case the prediction is delegated
 *     to av1_enc_build_one_inter_predictor() and the function returns early.
 * comp_pred8,
 * ref8:          byte-pointer handles that are converted with
 *                CONVERT_TO_SHORTPTR() before 16-bit samples are accessed.
 * subpel_x_q3,
 * subpel_y_q3:   sub-pixel offsets; zero means no filtering along that axis.
 * ref_stride:    source row stride, in samples.
 * bd:            bit depth forwarded to the high-bitdepth convolve functions.
 * subpel_search: selects the interpolation filter via av1_get_filter().
 */
void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
                                    const struct AV1Common *const cm,
                                    int mi_row, int mi_col, const MV *const mv,
                                    uint8_t *comp_pred8, int width, int height,
                                    int subpel_x_q3, int subpel_y_q3,
                                    const uint8_t *ref8, int ref_stride, int bd,
                                    int subpel_search) {
  // expect xd == NULL only in tests
  if (xd != NULL) {
    const MB_MODE_INFO *mi = xd->mi[0];
    const int ref_num = 0;
    const int is_intrabc = is_intrabc_block(mi);
    const struct scale_factors *const sf =
        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
    const int is_scaled = av1_is_scaled(sf);

    if (is_scaled) {
      // Scaled references cannot use the fast paths below; build the
      // predictor through the generic inter-prediction code for plane 0.
      int plane = 0;
      const int mi_x = mi_col * MI_SIZE;
      const int mi_y = mi_row * MI_SIZE;
      const struct macroblockd_plane *const pd = &xd->plane[plane];
      const struct buf_2d *const dst_buf = &pd->dst;
      const struct buf_2d *const pre_buf =
          is_intrabc ? dst_buf : &pd->pre[ref_num];

      InterPredParams inter_pred_params;
      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
      const int_interpfilters filters =
          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
      av1_init_inter_params(
          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
      av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
                                        &inter_pred_params);
      return;
    }
  }

  const InterpFilterParams *filter = av1_get_filter(subpel_search);
  int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS;
  if (!subpel_x_q3 && !subpel_y_q3) {
    // Full-pel position: plain copy from the strided source into the packed
    // destination, eight 16-bit samples per store.
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
    uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
    if (width >= 8) {
      int i;
      assert(!(width & 7));
      /*Read 8 pixels one row at a time.*/
      for (i = 0; i < height; i++) {
        int j;
        for (j = 0; j < width; j += 8) {
          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
          _mm_storeu_si128((__m128i *)comp_pred, s0);
          comp_pred += 8;
          ref += 8;
        }
        ref += ref_stride - width;
      }
    } else {
      int i;
      assert(!(width & 3));
      // Two rows are consumed per iteration, so the height must be even.
      assert(!(height & 1));
      /*Read 4 pixels two rows at a time.*/
      for (i = 0; i < height; i += 2) {
        __m128i s0 = _mm_loadl_epi64((const __m128i *)ref);
        __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
        __m128i t0 = _mm_unpacklo_epi64(s0, s1);
        _mm_storeu_si128((__m128i *)comp_pred, t0);
        comp_pred += 8;
        ref += 2 * ref_stride;
      }
    }
  } else if (!subpel_y_q3) {
    // Horizontal-only interpolation straight into the output.
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16,
                               NULL, -1, width, height, bd);
  } else if (!subpel_x_q3) {
    // Vertical-only interpolation straight into the output.
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1,
                              kernel, 16, width, height, bd);
  } else {
    // Two-pass interpolation: the horizontal pass fills `temp` (row stride
    // MAX_SB_SIZE) with the extra rows the vertical filter needs, then the
    // vertical pass reads `temp` and writes the packed output.
    DECLARE_ALIGNED(16, uint16_t,
                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
    const int16_t *const kernel_x =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    const int16_t *const kernel_y =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    // Start (filter_taps / 2 - 1) rows above the block.
    const uint8_t *ref_start = ref8 - ref_stride * ((filter_taps >> 1) - 1);
    // In the 4-tap case the horizontal output is placed filter_taps / 2 rows
    // into `temp` rather than at its start.
    const int horiz_row_offset =
        (subpel_search <= USE_4_TAPS) ? (filter_taps >> 1) : 0;
    uint16_t *temp_start_horiz = temp + horiz_row_offset * MAX_SB_SIZE;
    // The vertical pass start is derived from filter->taps, not filter_taps.
    uint16_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
    const int intermediate_height =
        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
    // Bound the rows written by the horizontal pass against the number of
    // rows `temp` actually holds, including the 4-tap row offset.
    assert(horiz_row_offset + intermediate_height <= (MAX_SB_SIZE + 16) + 16);
    aom_highbd_convolve8_horiz(
        ref_start, ref_stride, CONVERT_TO_BYTEPTR(temp_start_horiz),
        MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd);
    aom_highbd_convolve8_vert(CONVERT_TO_BYTEPTR(temp_start_vert), MAX_SB_SIZE,
                              comp_pred8, width, NULL, -1, kernel_y, 16, width,
                              height, bd);
  }
}
255 
/*
 * High-bitdepth compound average: first builds the upsampled prediction into
 * comp_pred8 via aom_highbd_upsampled_pred(), then replaces every sample of
 * that packed width x height block with the _mm_avg_epu16() average of itself
 * and the corresponding sample of pred8.
 *
 * Both comp_pred8 and pred8 are converted with CONVERT_TO_SHORTPTR() and
 * traversed as contiguous 16-bit arrays of width * height samples.
 */
void aom_highbd_comp_avg_upsampled_pred_sse2(
    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
    int ref_stride, int bd, int subpel_search) {
  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
                            bd, subpel_search);

  const int num_pixels = width * height;
  /* The block is processed eight samples per vector, so the total sample
   * count has to be a multiple of 8. */
  assert((num_pixels & 7) == 0);
  const int num_vectors = num_pixels >> 3;

  const uint16_t *const other16 = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *const dst16 = CONVERT_TO_SHORTPTR(comp_pred8);

  for (int v = 0; v < num_vectors; ++v) {
    const int offset = v * 8;
    __m128i *const dst = (__m128i *)(dst16 + offset);
    const __m128i current = _mm_loadu_si128(dst);
    const __m128i other =
        _mm_loadu_si128((const __m128i *)(other16 + offset));
    _mm_storeu_si128(dst, _mm_avg_epu16(current, other));
  }
}
277 #endif  // CONFIG_AV1_HIGHBITDEPTH
278 
/*
 * 8-bit compound average: first builds the upsampled prediction into
 * comp_pred via aom_upsampled_pred(), then replaces every byte of that packed
 * width x height block with the _mm_avg_epu8() average of itself and the
 * corresponding byte of pred.
 *
 * Both comp_pred and pred are traversed as contiguous arrays of
 * width * height bytes.
 */
void aom_comp_avg_upsampled_pred_sse2(
    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
    int ref_stride, int subpel_search) {
  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);

  const int num_pixels = width * height;
  /* The block is processed sixteen bytes per vector, so the total pixel
   * count has to be a multiple of 16. */
  assert((num_pixels & 15) == 0);
  const int num_vectors = num_pixels >> 4;

  for (int v = 0; v < num_vectors; ++v) {
    const int offset = v * 16;
    uint8_t *const dst = comp_pred + offset;
    const __m128i current = xx_loadu_128(dst);
    const __m128i other = xx_loadu_128(pred + offset);
    xx_storeu_128(dst, _mm_avg_epu8(current, other));
  }
}
299