/*!
 **********************************************************************************
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 * \file    vaa_lsx.c
 *
 * \brief   Loongson optimization
 *
 * \date    12/10/2021 Created
 *
 **********************************************************************************
 */

#include "stdint.h"
#include "loongson_intrinsics.h"

void VAACalcSadBgd_lsx (const uint8_t* pCurData, const uint8_t* pRefData,
                        int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
                        int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSd8x8,
                        uint8_t* pMad8x8) {
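  /* For each 16x16 macroblock, compute the SAD (sum of absolute differences),
   * SD (sum of signed differences) and MAD (maximum absolute difference)
   * between the current and reference pictures for each of its four 8x8
   * sub-blocks, and accumulate the whole-frame SAD in *pFrameSad. */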
  const uint8_t* tmp_ref = pRefData;
  const uint8_t* tmp_cur = pCurData;
  int32_t iMbWidth = (iPicWidth >> 4);
  int32_t mb_height = (iPicHeight >> 4);
  int32_t mb_index = 0;
  int32_t pic_stride_x8 = iPicStride << 3;
  int32_t step = (iPicStride << 4) - iPicWidth;

  *pFrameSad = 0;
  for (int32_t i = 0; i < mb_height; i++) {
    for (int32_t j = 0; j < iMbWidth; j++) {
      int32_t k;
      int32_t l_sad, l_sd, l_mad;
      const uint8_t* tmp_cur_row;
      const uint8_t* tmp_ref_row;
      int32_t tmp_mb_index = mb_index << 2;
      int32_t tmp_mb_index1 = tmp_mb_index + 1;
      int32_t tmp_mb_index2 = tmp_mb_index + 2;
      int32_t tmp_mb_index3 = tmp_mb_index + 3;
      __m128i cur, ref;
      __m128i vec_diff, vec_abs_diff, tmp_l_sd, tmp_l_sad, tmp_l_mad;
      __m128i zero = __lsx_vreplgr2vr_b(0);
      __m128i vec_l_sd = zero;
      __m128i vec_l_sad = zero;
      __m128i vec_l_mad = zero;

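      /* Top-left 8x8 block of the macroblock: over 8 rows, widen 8 cur/ref
       * pixels to 16-bit lanes and accumulate the signed difference (SD),
       * the absolute difference (SAD) and the running maximum absolute
       * difference (MAD) per lane. */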
      l_mad = l_sd = l_sad = 0;
      tmp_cur_row = tmp_cur;
      tmp_ref_row = tmp_ref;
      for (k = 0; k < 8; k++) {
        DUP2_ARG2(__lsx_vld, tmp_cur_row, 0, tmp_ref_row, 0, cur, ref);
        DUP2_ARG2(__lsx_vilvl_b, zero, cur, zero, ref, cur, ref);

        vec_diff = __lsx_vsub_h(cur, ref);
        vec_l_sd = __lsx_vadd_h(vec_l_sd, vec_diff);
        vec_abs_diff = __lsx_vabsd_h(cur, ref);
        vec_l_sad = __lsx_vadd_h(vec_l_sad, vec_abs_diff);
        vec_l_mad = __lsx_vmax_h(vec_l_mad, vec_abs_diff);
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }

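      /* Horizontally add the SD and SAD lane accumulators down to scalars. */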
      DUP2_ARG2(__lsx_vhaddw_w_h, vec_l_sd, vec_l_sd, vec_l_sad, vec_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vhaddw_d_w, tmp_l_sd, tmp_l_sd, tmp_l_sad, tmp_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vhaddw_q_d, tmp_l_sd, tmp_l_sd, tmp_l_sad, tmp_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vpickve2gr_d, tmp_l_sd, 0, tmp_l_sad, 0, l_sd, l_sad);

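      /* Fold the MAD accumulator with shift-and-max steps so that lane 0
       * holds the maximum over all eight lanes. */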
      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 8);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 4);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 2);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      l_mad = __lsx_vpickve2gr_h(vec_l_mad, 0);

      *pFrameSad += l_sad;
      pSad8x8[tmp_mb_index] = l_sad;
      pSd8x8[tmp_mb_index] = l_sd;
      pMad8x8[tmp_mb_index] = l_mad;

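      /* Top-right 8x8 block (columns 8..15): same accumulation and reduction
       * as for the top-left block. */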
      l_mad = l_sd = l_sad = 0;
      tmp_cur_row = tmp_cur + 8;
      tmp_ref_row = tmp_ref + 8;
      vec_l_sd = vec_l_sad = vec_l_mad = zero;
      for (k = 0; k < 8; k++) {
        DUP2_ARG2(__lsx_vld, tmp_cur_row, 0, tmp_ref_row, 0, cur, ref);
        DUP2_ARG2(__lsx_vilvl_b, zero, cur, zero, ref, cur, ref);

        vec_diff = __lsx_vsub_h(cur, ref);
        vec_l_sd = __lsx_vadd_h(vec_l_sd, vec_diff);
        vec_abs_diff = __lsx_vabsd_h(cur, ref);
        vec_l_sad = __lsx_vadd_h(vec_l_sad, vec_abs_diff);
        vec_l_mad = __lsx_vmax_h(vec_l_mad, vec_abs_diff);
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }

      DUP2_ARG2(__lsx_vhaddw_w_h, vec_l_sd, vec_l_sd, vec_l_sad, vec_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vhaddw_d_w, tmp_l_sd, tmp_l_sd, tmp_l_sad, tmp_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vhaddw_q_d, tmp_l_sd, tmp_l_sd, tmp_l_sad, tmp_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vpickve2gr_d, tmp_l_sd, 0, tmp_l_sad, 0, l_sd, l_sad);

      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 8);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 4);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 2);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      l_mad = __lsx_vpickve2gr_h(vec_l_mad, 0);

      *pFrameSad += l_sad;
      pSad8x8[tmp_mb_index1] = l_sad;
      pSd8x8[tmp_mb_index1] = l_sd;
      pMad8x8[tmp_mb_index1] = l_mad;

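      /* Bottom-left 8x8 block (rows 8..15). */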
      l_mad = l_sd = l_sad = 0;
      tmp_cur_row = tmp_cur + pic_stride_x8;
      tmp_ref_row = tmp_ref + pic_stride_x8;
      vec_l_sd = vec_l_sad = vec_l_mad = zero;
      for (k = 0; k < 8; k++) {
        DUP2_ARG2(__lsx_vld, tmp_cur_row, 0, tmp_ref_row, 0, cur, ref);
        DUP2_ARG2(__lsx_vilvl_b, zero, cur, zero, ref, cur, ref);

        vec_diff = __lsx_vsub_h(cur, ref);
        vec_l_sd = __lsx_vadd_h(vec_l_sd, vec_diff);
        vec_abs_diff = __lsx_vabsd_h(cur, ref);
        vec_l_sad = __lsx_vadd_h(vec_l_sad, vec_abs_diff);
        vec_l_mad = __lsx_vmax_h(vec_l_mad, vec_abs_diff);
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }

      DUP2_ARG2(__lsx_vhaddw_w_h, vec_l_sd, vec_l_sd, vec_l_sad, vec_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vhaddw_d_w, tmp_l_sd, tmp_l_sd, tmp_l_sad, tmp_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vhaddw_q_d, tmp_l_sd, tmp_l_sd, tmp_l_sad, tmp_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vpickve2gr_d, tmp_l_sd, 0, tmp_l_sad, 0, l_sd, l_sad);

      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 8);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 4);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 2);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      l_mad = __lsx_vpickve2gr_h(vec_l_mad, 0);

      *pFrameSad += l_sad;
      pSad8x8[tmp_mb_index2] = l_sad;
      pSd8x8[tmp_mb_index2] = l_sd;
      pMad8x8[tmp_mb_index2] = l_mad;

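      /* Bottom-right 8x8 block (rows 8..15, columns 8..15). */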
      l_mad = l_sd = l_sad = 0;
      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
      vec_l_sd = vec_l_sad = vec_l_mad = zero;
      for (k = 0; k < 8; k++) {
        DUP2_ARG2(__lsx_vld, tmp_cur_row, 0, tmp_ref_row, 0, cur, ref);
        DUP2_ARG2(__lsx_vilvl_b, zero, cur, zero, ref, cur, ref);

        vec_diff = __lsx_vsub_h(cur, ref);
        vec_l_sd = __lsx_vadd_h(vec_l_sd, vec_diff);
        vec_abs_diff = __lsx_vabsd_h(cur, ref);
        vec_l_sad = __lsx_vadd_h(vec_l_sad, vec_abs_diff);
        vec_l_mad = __lsx_vmax_h(vec_l_mad, vec_abs_diff);
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }

      DUP2_ARG2(__lsx_vhaddw_w_h, vec_l_sd, vec_l_sd, vec_l_sad, vec_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vhaddw_d_w, tmp_l_sd, tmp_l_sd, tmp_l_sad, tmp_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vhaddw_q_d, tmp_l_sd, tmp_l_sd, tmp_l_sad, tmp_l_sad,
                tmp_l_sd, tmp_l_sad);
      DUP2_ARG2(__lsx_vpickve2gr_d, tmp_l_sd, 0, tmp_l_sad, 0, l_sd, l_sad);

      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 8);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 4);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      tmp_l_mad = __lsx_vbsrl_v(vec_l_mad, 2);
      vec_l_mad = __lsx_vmax_h(vec_l_mad, tmp_l_mad);
      l_mad = __lsx_vpickve2gr_h(vec_l_mad, 0);

      *pFrameSad += l_sad;
      pSad8x8[tmp_mb_index3] = l_sad;
      pSd8x8[tmp_mb_index3] = l_sd;
      pMad8x8[tmp_mb_index3] = l_mad;

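      /* Advance to the next 16x16 macroblock in this row. */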
      tmp_ref += 16;
      tmp_cur += 16;
      ++mb_index;
    }
    tmp_ref += step;
    tmp_cur += step;
  }
}