/*!
 **********************************************************************************
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *
 * \copy
 *     Copyright (c) 2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 * \file    vaa_lsx.c
 *
 * \brief   Loongson optimization
 *
 * \date    12/10/2021 Created
 *
 **********************************************************************************
 */

#include <stdint.h>
#include "loongson_intrinsics.h"

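/* Compute background-detection statistics for one picture: each 16x16
 * macroblock is split into four 8x8 blocks, and for every block the sum of
 * absolute differences (SAD), the signed sum of differences (SD) and the
 * maximum absolute difference (MAD) between pCurData and pRefData are stored
 * into pSad8x8, pSd8x8 and pMad8x8 (four consecutive entries per macroblock).
 * *pFrameSad accumulates the SAD over the whole frame. */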
void VAACalcSadBgd_lsx (const uint8_t* pCurData, const uint8_t* pRefData,
                        int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
                        int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSd8x8,
                        uint8_t* pMad8x8) {
  const uint8_t* tmp_ref = pRefData;
  const uint8_t* tmp_cur = pCurData;
  int32_t iMbWidth = (iPicWidth >> 4);    // picture width in 16x16 macroblocks
  int32_t mb_height = (iPicHeight >> 4);  // picture height in 16x16 macroblocks
  int32_t mb_index = 0;
  int32_t pic_stride_x8 = iPicStride << 3;       // offset of the lower 8x8 blocks
  int32_t step = (iPicStride << 4) - iPicWidth;  // from end of one MB row to start of the next

  *pFrameSad = 0;
  for (int32_t i = 0; i < mb_height; i++) {
    for (int32_t j = 0; j < iMbWidth; j++) {
      int32_t k;
      int32_t l_sad, l_sd, l_mad;
      const uint8_t* tmp_cur_row;
      const uint8_t* tmp_ref_row;
      int32_t tmp_mb_index = mb_index << 2;  // four 8x8 results per macroblock
      int32_t tmp_mb_index1 = tmp_mb_index + 1;
      int32_t tmp_mb_index2 = tmp_mb_index + 2;
      int32_t tmp_mb_index3 = tmp_mb_index + 3;
      __m128i cur, ref;
      __m128i vec_diff, vec_abs_diff, tmp_l_sd, tmp_l_sad, tmp_l_mad;
      __m128i zero = __lsx_vreplgr2vr_b(0);
      __m128i vec_l_sd = zero;
      __m128i vec_l_sad = zero;
      __m128i vec_l_mad = zero;

      /* Top-left 8x8 block. */
      l_mad = l_sd = l_sad = 0;
      tmp_cur_row = tmp_cur;
      tmp_ref_row = tmp_ref;
      for (k = 0; k < 8; k++) {
        /* Load one 8-pixel row of cur/ref and zero-extend to 16-bit lanes. */
        DUP2_ARG2(__lsx_vld, tmp_cur_row, 0, tmp_ref_row, 0, cur, ref);
        DUP2_ARG2(__lsx_vilvl_b, zero, cur, zero, ref, cur, ref);

        vec_diff = __lsx_vsub_h(cur, ref);
        vec_l_sd = __lsx_vadd_h(vec_l_sd, vec_diff);
        vec_abs_diff = __lsx_vabsd_h(cur, ref);
        vec_l_sad = __lsx_vadd_h(vec_l_sad, vec_abs_diff);
        vec_l_mad = __lsx_vmax_h(vec_l_mad, vec_abs_diff);
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }

      /* Horizontal reduction: sum the eight 16-bit SD and SAD lanes. */
      DUP2_ARG2(__lsx_vhaddw_w_h, vec_l_sd, vec_l_sd, vec_l_sad, vec_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vhaddw_d_w, tmp_l_sd, tmp_l_sd, tmp_l_sad, tmp_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vhaddw_q_d, tmp_l_sd, tmp_l_sd, tmp_l_sad, tmp_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vpickve2gr_d, tmp_l_sd, 0, tmp_l_sad, 0, l_sd, l_sad);

      /* Horizontal reduction: maximum of the eight 16-bit MAD lanes. */
      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 8);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 4);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 2);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      l_mad = __lsx_vpickve2gr_h(vec_l_mad, 0);

      *pFrameSad += l_sad;
      pSad8x8[tmp_mb_index] = l_sad;
      pSd8x8[tmp_mb_index] = l_sd;
      pMad8x8[tmp_mb_index] = l_mad;

      /* Top-right 8x8 block (columns 8..15). */
      l_mad = l_sd = l_sad = 0;
      tmp_cur_row = tmp_cur + 8;
      tmp_ref_row = tmp_ref + 8;
      vec_l_sd = vec_l_sad = vec_l_mad = zero;
      for (k = 0; k < 8; k++) {
        DUP2_ARG2(__lsx_vld, tmp_cur_row, 0, tmp_ref_row, 0, cur, ref);
        DUP2_ARG2(__lsx_vilvl_b, zero, cur, zero, ref, cur, ref);

        vec_diff = __lsx_vsub_h(cur, ref);
        vec_l_sd = __lsx_vadd_h(vec_l_sd, vec_diff);
        vec_abs_diff = __lsx_vabsd_h(cur, ref);
        vec_l_sad = __lsx_vadd_h(vec_l_sad, vec_abs_diff);
        vec_l_mad = __lsx_vmax_h(vec_l_mad, vec_abs_diff);
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }

      DUP2_ARG2(__lsx_vhaddw_w_h, vec_l_sd, vec_l_sd, vec_l_sad, vec_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vhaddw_d_w, tmp_l_sd, tmp_l_sd, tmp_l_sad, tmp_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vhaddw_q_d, tmp_l_sd, tmp_l_sd, tmp_l_sad, tmp_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vpickve2gr_d, tmp_l_sd, 0, tmp_l_sad, 0, l_sd, l_sad);

      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 8);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 4);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 2);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      l_mad = __lsx_vpickve2gr_h(vec_l_mad, 0);

      *pFrameSad += l_sad;
      pSad8x8[tmp_mb_index1] = l_sad;
      pSd8x8[tmp_mb_index1] = l_sd;
      pMad8x8[tmp_mb_index1] = l_mad;

      /* Bottom-left 8x8 block (rows 8..15). */
      l_mad = l_sd = l_sad = 0;
      tmp_cur_row = tmp_cur + pic_stride_x8;
      tmp_ref_row = tmp_ref + pic_stride_x8;
      vec_l_sd = vec_l_sad = vec_l_mad = zero;
      for (k = 0; k < 8; k++) {
        DUP2_ARG2(__lsx_vld, tmp_cur_row, 0, tmp_ref_row, 0, cur, ref);
        DUP2_ARG2(__lsx_vilvl_b, zero, cur, zero, ref, cur, ref);

        vec_diff = __lsx_vsub_h(cur, ref);
        vec_l_sd = __lsx_vadd_h(vec_l_sd, vec_diff);
        vec_abs_diff = __lsx_vabsd_h(cur, ref);
        vec_l_sad = __lsx_vadd_h(vec_l_sad, vec_abs_diff);
        vec_l_mad = __lsx_vmax_h(vec_l_mad, vec_abs_diff);
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }

      DUP2_ARG2(__lsx_vhaddw_w_h, vec_l_sd, vec_l_sd, vec_l_sad, vec_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vhaddw_d_w, tmp_l_sd, tmp_l_sd, tmp_l_sad, tmp_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vhaddw_q_d, tmp_l_sd, tmp_l_sd, tmp_l_sad, tmp_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vpickve2gr_d, tmp_l_sd, 0, tmp_l_sad, 0, l_sd, l_sad);

      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 8);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 4);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 2);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      l_mad = __lsx_vpickve2gr_h(vec_l_mad, 0);

      *pFrameSad += l_sad;
      pSad8x8[tmp_mb_index2] = l_sad;
      pSd8x8[tmp_mb_index2] = l_sd;
      pMad8x8[tmp_mb_index2] = l_mad;

      /* Bottom-right 8x8 block. */
      l_mad = l_sd = l_sad = 0;
      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
      vec_l_sd = vec_l_sad = vec_l_mad = zero;
      for (k = 0; k < 8; k++) {
        DUP2_ARG2(__lsx_vld, tmp_cur_row, 0, tmp_ref_row, 0, cur, ref);
        DUP2_ARG2(__lsx_vilvl_b, zero, cur, zero, ref, cur, ref);

        vec_diff = __lsx_vsub_h(cur, ref);
        vec_l_sd = __lsx_vadd_h(vec_l_sd, vec_diff);
        vec_abs_diff = __lsx_vabsd_h(cur, ref);
        vec_l_sad = __lsx_vadd_h(vec_l_sad, vec_abs_diff);
        vec_l_mad = __lsx_vmax_h(vec_l_mad, vec_abs_diff);
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }

      DUP2_ARG2(__lsx_vhaddw_w_h, vec_l_sd, vec_l_sd, vec_l_sad, vec_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vhaddw_d_w, tmp_l_sd, tmp_l_sd, tmp_l_sad, tmp_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vhaddw_q_d, tmp_l_sd, tmp_l_sd, tmp_l_sad, tmp_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vpickve2gr_d, tmp_l_sd, 0, tmp_l_sad, 0, l_sd, l_sad);

      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 8);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 4);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 2);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      l_mad = __lsx_vpickve2gr_h(vec_l_mad, 0);

      *pFrameSad += l_sad;
      pSad8x8[tmp_mb_index3] = l_sad;
      pSd8x8[tmp_mb_index3] = l_sd;
      pMad8x8[tmp_mb_index3] = l_mad;

      tmp_ref += 16;  // advance to the next 16x16 macroblock in this row
      tmp_cur += 16;
      ++mb_index;
    }
    tmp_ref += step;  // move to the start of the next macroblock row
    tmp_cur += step;
  }
}
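
/*
 * Illustrative usage sketch (not part of the original source; buffer and
 * variable names below are hypothetical). Each 16x16 macroblock produces
 * four 8x8 results, so the per-block output arrays need
 * (iPicWidth >> 4) * (iPicHeight >> 4) * 4 entries:
 *
 *   int32_t mb_count  = (iPicWidth >> 4) * (iPicHeight >> 4);
 *   int32_t* sad8x8   = (int32_t*) malloc (mb_count * 4 * sizeof (int32_t));
 *   int32_t* sd8x8    = (int32_t*) malloc (mb_count * 4 * sizeof (int32_t));
 *   uint8_t* mad8x8   = (uint8_t*) malloc (mb_count * 4 * sizeof (uint8_t));
 *   int32_t frame_sad = 0;
 *
 *   VAACalcSadBgd_lsx (pCur, pRef, iPicWidth, iPicHeight, iPicStride,
 *                      &frame_sad, sad8x8, sd8x8, mad8x8);
 */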