/*!
 **********************************************************************************
 * \copy
 *     Copyright (c) 2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 * \file    vaa_lasx.c
 *
 * \brief   Loongson LASX optimization of the VAA calculations
 *
 * \date    14/4/2023 Created
 *
 **********************************************************************************
 */

#include <stdint.h>
#include "loongson_intrinsics.h"

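/* Per-lane max-reduction of unsigned halfwords: shift each 128-bit lane of
 * in0 right by 8, 4 and 2 bytes, taking the element-wise max each time, so
 * halfword 0 of each lane ends up holding that lane's maximum. out0/out1
 * receive the low word of the low/high lane, respectively. */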
#define LASX_SELECT_MAX_H(in0, out0, out1) do {\
  __m256i tmp0 = __lasx_xvbsrl_v(in0, 8);\
  tmp0 = __lasx_xvmax_hu(tmp0, in0);\
  in0  = __lasx_xvbsrl_v(tmp0, 4);\
  tmp0 = __lasx_xvmax_hu(tmp0, in0);\
  in0  = __lasx_xvbsrl_v(tmp0, 2);\
  tmp0 = __lasx_xvmax_hu(tmp0, in0);\
  out0 = __lasx_xvpickve2gr_w(tmp0, 0);\
  out1 = __lasx_xvpickve2gr_w(tmp0, 4);\
} while(0)

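/* Accumulate one 16-pixel row (as halfwords) into the running per-lane
 * accumulators: vec_l_sad (sum of absolute differences), vec_l_sd (signed
 * sum of differences) and vec_l_mad (element-wise max absolute difference).
 * Uses the vec_diff/abs_diff temporaries from the enclosing scope. */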
#define CALC_SAD_SD_MAD(in0, in1) do {\
  vec_diff  = __lasx_xvsub_h(in0, in1);\
  abs_diff  = __lasx_xvabsd_hu(in0, in1);\
  vec_l_sad = __lasx_xvadd_h(vec_l_sad, abs_diff);\
  vec_l_sd  = __lasx_xvadd_h(vec_l_sd, vec_diff);\
  vec_l_mad = __lasx_xvmax_hu(abs_diff, vec_l_mad);\
} while(0)

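/* Horizontal add of the 16 unsigned halfwords in `in`: sum_ml receives the
 * 32-bit total of the low 128-bit lane, sum_mh that of the high lane. */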
#define LASX_HADD_UH_U32(in, sum_ml, sum_mh)\
{\
  __m256i res_m;\
  __m256i res0_m, res1_m;\
\
  res_m  = __lasx_xvhaddw_wu_hu(in, in);\
  res0_m = __lasx_xvhaddw_du_wu(res_m, res_m);\
  res1_m = __lasx_xvbsrl_v(res0_m, 8);\
  res0_m = __lasx_xvadd_d(res0_m, res1_m);\
  sum_ml = __lasx_xvpickve2gr_wu(res0_m, 0);\
  sum_mh = __lasx_xvpickve2gr_wu(res0_m, 4);\
}

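/* Signed counterpart of LASX_HADD_UH_U32: per-lane horizontal add of the
 * 16 signed halfwords in `in` into sum_ml (low lane) and sum_mh (high lane). */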
#define LASX_HADD_SH_S32(in, sum_ml, sum_mh)\
{\
  __m256i res_m;\
  __m256i res0_m, res1_m;\
\
  res_m  = __lasx_xvhaddw_w_h(in, in);\
  res0_m = __lasx_xvhaddw_d_w(res_m, res_m);\
  res1_m = __lasx_xvbsrl_v(res0_m, 8);\
  res0_m = __lasx_xvadd_d(res0_m, res1_m);\
  sum_ml = __lasx_xvpickve2gr_w(res0_m, 0);\
  sum_mh = __lasx_xvpickve2gr_w(res0_m, 4);\
}

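/* For every 16x16 macroblock, compute per-8x8-block SAD, signed pixel
 * difference (SD) and maximum absolute pixel difference (MAD) between the
 * current and the reference picture, plus the whole-frame SAD. Each pass
 * below covers a 16x8 half-macroblock: after widening, the low 128-bit
 * lane carries the left 8x8 block and the high lane the right one. */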
void VAACalcSadBgd_lasx (const uint8_t* pCurData, const uint8_t* pRefData,
                         int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
                         int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSd8x8,
                         uint8_t* pMad8x8) {
  uint8_t* tmp_ref = (uint8_t*)pRefData;
  uint8_t* tmp_cur = (uint8_t*)pCurData;
  int32_t iMbWidth = (iPicWidth >> 4);
  int32_t mb_height = (iPicHeight >> 4);
  int32_t mb_index = 0;
  int32_t pic_stride_x8 = iPicStride << 3;
  int32_t step = (iPicStride << 4) - iPicWidth;
  int32_t iStridex0 = 0, iStridex1 = iPicStride, iStridex2 = iStridex1 + iPicStride,
          iStridex3 = iStridex2 + iPicStride, iStridex4 = iStridex3 + iPicStride,
          iStridex5 = iStridex4 + iPicStride, iStridex6 = iStridex5 + iPicStride,
          iStridex7 = iStridex6 + iPicStride;
  uint8_t* tmp_cur_row;
  uint8_t* tmp_ref_row;
  int32_t l_sad_l, l_sd_l, l_mad_l, l_sad_h, l_sd_h, l_mad_h;
  int32_t iFrameSad = 0, index;
  __m256i zero = {0};
  __m256i src0, src1, src2, src3, src4, src5, src6, src7;
  __m256i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  __m256i vec_diff, vec_l_sd;
  __m256i abs_diff, vec_l_sad, vec_l_mad;
  for (int32_t i = 0; i < mb_height; i++) {
    for (int32_t j = 0; j < iMbWidth; j++) {
      index = mb_index << 2;
      tmp_cur_row = tmp_cur;
      tmp_ref_row = tmp_ref;
      vec_l_sad = zero;
      vec_l_sd = zero;
      vec_l_mad = zero;
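      // Load rows 0-7 of the macroblock from the current and reference frames.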
      DUP4_ARG2(__lasx_xvldx,
                tmp_cur_row, iStridex0,
                tmp_cur_row, iStridex1,
                tmp_cur_row, iStridex2,
                tmp_cur_row, iStridex3,
                src0, src1, src2, src3);
      DUP4_ARG2(__lasx_xvldx,
                tmp_cur_row, iStridex4,
                tmp_cur_row, iStridex5,
                tmp_cur_row, iStridex6,
                tmp_cur_row, iStridex7,
                src4, src5, src6, src7);
      DUP4_ARG2(__lasx_xvldx,
                tmp_ref_row, iStridex0,
                tmp_ref_row, iStridex1,
                tmp_ref_row, iStridex2,
                tmp_ref_row, iStridex3,
                vec0, vec1, vec2, vec3);
      DUP4_ARG2(__lasx_xvldx,
                tmp_ref_row, iStridex4,
                tmp_ref_row, iStridex5,
                tmp_ref_row, iStridex6,
                tmp_ref_row, iStridex7,
                vec4, vec5, vec6, vec7);
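      // Zero-extend each 16-byte row to halfwords: pixels 0-7 land in the
      // low 128-bit lane, pixels 8-15 in the high lane.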
      src0 = __lasx_vext2xv_hu_bu(src0);
      src1 = __lasx_vext2xv_hu_bu(src1);
      src2 = __lasx_vext2xv_hu_bu(src2);
      src3 = __lasx_vext2xv_hu_bu(src3);
      src4 = __lasx_vext2xv_hu_bu(src4);
      src5 = __lasx_vext2xv_hu_bu(src5);
      src6 = __lasx_vext2xv_hu_bu(src6);
      src7 = __lasx_vext2xv_hu_bu(src7);
      vec0 = __lasx_vext2xv_hu_bu(vec0);
      vec1 = __lasx_vext2xv_hu_bu(vec1);
      vec2 = __lasx_vext2xv_hu_bu(vec2);
      vec3 = __lasx_vext2xv_hu_bu(vec3);
      vec4 = __lasx_vext2xv_hu_bu(vec4);
      vec5 = __lasx_vext2xv_hu_bu(vec5);
      vec6 = __lasx_vext2xv_hu_bu(vec6);
      vec7 = __lasx_vext2xv_hu_bu(vec7);
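      // Accumulate SAD/SD/MAD over rows 0-7, then reduce each lane to the
      // scalar results for the two top 8x8 blocks.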
      CALC_SAD_SD_MAD(src0, vec0);
      CALC_SAD_SD_MAD(src1, vec1);
      CALC_SAD_SD_MAD(src2, vec2);
      CALC_SAD_SD_MAD(src3, vec3);
      CALC_SAD_SD_MAD(src4, vec4);
      CALC_SAD_SD_MAD(src5, vec5);
      CALC_SAD_SD_MAD(src6, vec6);
      CALC_SAD_SD_MAD(src7, vec7);
      LASX_HADD_UH_U32(vec_l_sad, l_sad_l, l_sad_h);
      LASX_HADD_SH_S32(vec_l_sd, l_sd_l, l_sd_h);
      LASX_SELECT_MAX_H(vec_l_mad, l_mad_l, l_mad_h);
      iFrameSad += l_sad_l + l_sad_h;
      pSad8x8[index + 0] = l_sad_l;
      pSd8x8 [index + 0] = l_sd_l;
      pMad8x8[index + 0] = l_mad_l;
      pSad8x8[index + 1] = l_sad_h;
      pSd8x8 [index + 1] = l_sd_h;
      pMad8x8[index + 1] = l_mad_h;
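      // Same again for rows 8-15: the two bottom 8x8 blocks.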
      tmp_cur_row = tmp_cur + pic_stride_x8;
      tmp_ref_row = tmp_ref + pic_stride_x8;
      vec_l_sad = zero;
      vec_l_sd = zero;
      vec_l_mad = zero;
      DUP4_ARG2(__lasx_xvldx,
                tmp_cur_row, iStridex0,
                tmp_cur_row, iStridex1,
                tmp_cur_row, iStridex2,
                tmp_cur_row, iStridex3,
                src0, src1, src2, src3);
      DUP4_ARG2(__lasx_xvldx,
                tmp_cur_row, iStridex4,
                tmp_cur_row, iStridex5,
                tmp_cur_row, iStridex6,
                tmp_cur_row, iStridex7,
                src4, src5, src6, src7);
      DUP4_ARG2(__lasx_xvldx,
                tmp_ref_row, iStridex0,
                tmp_ref_row, iStridex1,
                tmp_ref_row, iStridex2,
                tmp_ref_row, iStridex3,
                vec0, vec1, vec2, vec3);
      DUP4_ARG2(__lasx_xvldx,
                tmp_ref_row, iStridex4,
                tmp_ref_row, iStridex5,
                tmp_ref_row, iStridex6,
                tmp_ref_row, iStridex7,
                vec4, vec5, vec6, vec7);
      src0 = __lasx_vext2xv_hu_bu(src0);
      src1 = __lasx_vext2xv_hu_bu(src1);
      src2 = __lasx_vext2xv_hu_bu(src2);
      src3 = __lasx_vext2xv_hu_bu(src3);
      src4 = __lasx_vext2xv_hu_bu(src4);
      src5 = __lasx_vext2xv_hu_bu(src5);
      src6 = __lasx_vext2xv_hu_bu(src6);
      src7 = __lasx_vext2xv_hu_bu(src7);
      vec0 = __lasx_vext2xv_hu_bu(vec0);
      vec1 = __lasx_vext2xv_hu_bu(vec1);
      vec2 = __lasx_vext2xv_hu_bu(vec2);
      vec3 = __lasx_vext2xv_hu_bu(vec3);
      vec4 = __lasx_vext2xv_hu_bu(vec4);
      vec5 = __lasx_vext2xv_hu_bu(vec5);
      vec6 = __lasx_vext2xv_hu_bu(vec6);
      vec7 = __lasx_vext2xv_hu_bu(vec7);
      CALC_SAD_SD_MAD(src0, vec0);
      CALC_SAD_SD_MAD(src1, vec1);
      CALC_SAD_SD_MAD(src2, vec2);
      CALC_SAD_SD_MAD(src3, vec3);
      CALC_SAD_SD_MAD(src4, vec4);
      CALC_SAD_SD_MAD(src5, vec5);
      CALC_SAD_SD_MAD(src6, vec6);
      CALC_SAD_SD_MAD(src7, vec7);
      LASX_HADD_UH_U32(vec_l_sad, l_sad_l, l_sad_h);
      LASX_HADD_SH_S32(vec_l_sd, l_sd_l, l_sd_h);
      LASX_SELECT_MAX_H(vec_l_mad, l_mad_l, l_mad_h);
      iFrameSad += l_sad_l + l_sad_h;
      pSad8x8[index + 2] = l_sad_l;
      pSd8x8 [index + 2] = l_sd_l;
      pMad8x8[index + 2] = l_mad_l;
      pSad8x8[index + 3] = l_sad_h;
      pSd8x8 [index + 3] = l_sd_h;
      pMad8x8[index + 3] = l_mad_h;
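      // Step 16 pixels right to the next macroblock.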
      tmp_ref += 16;
      tmp_cur += 16;
      ++mb_index;
    }
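    // Jump from the end of this macroblock row to the start of the next one.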
    tmp_ref += step;
    tmp_cur += step;
  }
  *pFrameSad = iFrameSad;
}