/*!
 **********************************************************************************
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 * \file    vaa_lasx.c
 *
 * \brief   Loongson optimization
 *
 * \date    14/4/2023 Created
 *
 **********************************************************************************
 */

#include <stdint.h>
#include "loongson_intrinsics.h"

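/* Reduce each 128-bit lane of in0 to the maximum of its eight unsigned
 * halfwords: out0 receives the low-lane maximum, out1 the high-lane maximum.
 * in0 is clobbered, and only the low 16 bits of each extracted word are
 * meaningful (the callers store the results into uint8_t, so the extra
 * high bits are truncated away). */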
#define LASX_SELECT_MAX_H(in0, out0, out1) do {\
  __m256i tmp0 = __lasx_xvbsrl_v(in0, 8);\
  tmp0 = __lasx_xvmax_hu(tmp0, in0);\
  in0 = __lasx_xvbsrl_v(tmp0, 4);\
  tmp0 = __lasx_xvmax_hu(tmp0, in0);\
  in0 = __lasx_xvbsrl_v(tmp0, 2);\
  tmp0 = __lasx_xvmax_hu(tmp0, in0);\
  out0 = __lasx_xvpickve2gr_w(tmp0, 0);\
  out1 = __lasx_xvpickve2gr_w(tmp0, 4);\
} while(0)

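/* Update the enclosing-scope accumulators from one row of halfword pixels:
 * vec_l_sad accumulates absolute differences, vec_l_sd signed differences
 * (cur - ref), and vec_l_mad the running per-element maximum absolute
 * difference. */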
#define CALC_SAD_SD_MAD(in0, in1) do {\
  vec_diff = __lasx_xvsub_h(in0, in1);\
  abs_diff = __lasx_xvabsd_hu(in0, in1);\
  vec_l_sad = __lasx_xvadd_h(vec_l_sad, abs_diff);\
  vec_l_sd = __lasx_xvadd_h(vec_l_sd, vec_diff);\
  vec_l_mad = __lasx_xvmax_hu(abs_diff, vec_l_mad);\
} while(0)

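/* Horizontally add the unsigned halfwords of in into two 32-bit sums,
 * one per 128-bit lane (sum_ml = low lane, sum_mh = high lane). */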
#define LASX_HADD_UH_U32(in, sum_ml, sum_mh)\
{\
  __m256i res_m;\
  __m256i res0_m, res1_m;\
\
  res_m = __lasx_xvhaddw_wu_hu(in, in);\
  res0_m = __lasx_xvhaddw_du_wu(res_m, res_m);\
  res1_m = __lasx_xvbsrl_v(res0_m, 8);\
  res0_m = __lasx_xvadd_d(res0_m, res1_m);\
  sum_ml = __lasx_xvpickve2gr_wu(res0_m, 0);\
  sum_mh = __lasx_xvpickve2gr_wu(res0_m, 4);\
}

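/* Signed counterpart of LASX_HADD_UH_U32: per-lane horizontal sums of
 * signed halfwords. */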
#define LASX_HADD_SH_S32(in, sum_ml, sum_mh)\
{\
  __m256i res_m;\
  __m256i res0_m, res1_m;\
\
  res_m = __lasx_xvhaddw_w_h(in, in);\
  res0_m = __lasx_xvhaddw_d_w(res_m, res_m);\
  res1_m = __lasx_xvbsrl_v(res0_m, 8);\
  res0_m = __lasx_xvadd_d(res0_m, res1_m);\
  sum_ml = __lasx_xvpickve2gr_w(res0_m, 0);\
  sum_mh = __lasx_xvpickve2gr_w(res0_m, 4);\
}

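/* Compute, for each 8x8 block of the frame (four per 16x16 macroblock),
 * the SAD, the signed difference (SD) and the maximum absolute pixel
 * difference (MAD) between the current and the reference picture, and
 * accumulate the whole-frame SAD into *pFrameSad. Only full macroblocks
 * (iPicWidth >> 4 by iPicHeight >> 4 of them) are processed. */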
void VAACalcSadBgd_lasx (const uint8_t* pCurData, const uint8_t* pRefData,
                         int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
                         int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSd8x8,
                         uint8_t* pMad8x8) {
  uint8_t* tmp_ref = (uint8_t*)pRefData;
  uint8_t* tmp_cur = (uint8_t*)pCurData;
  int32_t iMbWidth = (iPicWidth >> 4);
  int32_t mb_height = (iPicHeight >> 4);
  int32_t mb_index = 0;
  int32_t pic_stride_x8 = iPicStride << 3;
  int32_t step = (iPicStride << 4) - iPicWidth;
  int32_t iStridex0 = 0, iStridex1 = iPicStride, iStridex2 = iStridex1 + iPicStride,
          iStridex3 = iStridex2 + iPicStride, iStridex4 = iStridex3 + iPicStride,
          iStridex5 = iStridex4 + iPicStride, iStridex6 = iStridex5 + iPicStride,
          iStridex7 = iStridex6 + iPicStride;
  uint8_t* tmp_cur_row;
  uint8_t* tmp_ref_row;
  int32_t l_sad_l, l_sd_l, l_mad_l, l_sad_h, l_sd_h, l_mad_h;
  int32_t iFrameSad = 0, index;
  __m256i zero = {0};
  __m256i src0, src1, src2, src3, src4, src5, src6, src7;
  __m256i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  __m256i vec_diff, vec_l_sd;
  __m256i abs_diff, vec_l_sad, vec_l_mad;
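  /* Each macroblock is processed in two passes: rows 0-7 yield the top two
   * 8x8 blocks (index + 0, index + 1) and rows 8-15 the bottom two
   * (index + 2, index + 3). */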
  for (int32_t i = 0; i < mb_height; i++) {
    for (int32_t j = 0; j < iMbWidth; j++) {
      index = mb_index << 2;
      tmp_cur_row = tmp_cur;
      tmp_ref_row = tmp_ref;
      vec_l_sad = zero;
      vec_l_sd  = zero;
      vec_l_mad = zero;
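      /* Load eight rows of 16 current and 16 reference pixels. */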
      DUP4_ARG2(__lasx_xvldx,
                tmp_cur_row, iStridex0,
                tmp_cur_row, iStridex1,
                tmp_cur_row, iStridex2,
                tmp_cur_row, iStridex3,
                src0, src1, src2, src3);
      DUP4_ARG2(__lasx_xvldx,
                tmp_cur_row, iStridex4,
                tmp_cur_row, iStridex5,
                tmp_cur_row, iStridex6,
                tmp_cur_row, iStridex7,
                src4, src5, src6, src7);
      DUP4_ARG2(__lasx_xvldx,
                tmp_ref_row, iStridex0,
                tmp_ref_row, iStridex1,
                tmp_ref_row, iStridex2,
                tmp_ref_row, iStridex3,
                vec0, vec1, vec2, vec3);
      DUP4_ARG2(__lasx_xvldx,
                tmp_ref_row, iStridex4,
                tmp_ref_row, iStridex5,
                tmp_ref_row, iStridex6,
                tmp_ref_row, iStridex7,
                vec4, vec5, vec6, vec7);
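      /* Widen each row's 16 bytes to unsigned halfwords; the low 128-bit
       * lane holds the left 8 pixels and the high lane the right 8, so the
       * 16-bit accumulators cannot overflow within an 8x8 block. */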
      src0 = __lasx_vext2xv_hu_bu(src0);
      src1 = __lasx_vext2xv_hu_bu(src1);
      src2 = __lasx_vext2xv_hu_bu(src2);
      src3 = __lasx_vext2xv_hu_bu(src3);
      src4 = __lasx_vext2xv_hu_bu(src4);
      src5 = __lasx_vext2xv_hu_bu(src5);
      src6 = __lasx_vext2xv_hu_bu(src6);
      src7 = __lasx_vext2xv_hu_bu(src7);
      vec0 = __lasx_vext2xv_hu_bu(vec0);
      vec1 = __lasx_vext2xv_hu_bu(vec1);
      vec2 = __lasx_vext2xv_hu_bu(vec2);
      vec3 = __lasx_vext2xv_hu_bu(vec3);
      vec4 = __lasx_vext2xv_hu_bu(vec4);
      vec5 = __lasx_vext2xv_hu_bu(vec5);
      vec6 = __lasx_vext2xv_hu_bu(vec6);
      vec7 = __lasx_vext2xv_hu_bu(vec7);
      CALC_SAD_SD_MAD(src0, vec0);
      CALC_SAD_SD_MAD(src1, vec1);
      CALC_SAD_SD_MAD(src2, vec2);
      CALC_SAD_SD_MAD(src3, vec3);
      CALC_SAD_SD_MAD(src4, vec4);
      CALC_SAD_SD_MAD(src5, vec5);
      CALC_SAD_SD_MAD(src6, vec6);
      CALC_SAD_SD_MAD(src7, vec7);
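      /* Reduce the accumulators to per-8x8 scalars: low lane = left block,
       * high lane = right block. */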
      LASX_HADD_UH_U32(vec_l_sad, l_sad_l, l_sad_h);
      LASX_HADD_SH_S32(vec_l_sd, l_sd_l, l_sd_h);
      LASX_SELECT_MAX_H(vec_l_mad, l_mad_l, l_mad_h);
      iFrameSad += l_sad_l + l_sad_h;
      pSad8x8[index + 0] = l_sad_l;
      pSd8x8 [index + 0] = l_sd_l;
      pMad8x8[index + 0] = l_mad_l;
      pSad8x8[index + 1] = l_sad_h;
      pSd8x8 [index + 1] = l_sd_h;
      pMad8x8[index + 1] = l_mad_h;
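      /* Repeat for the bottom 16x8 half of the macroblock. */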
      tmp_cur_row = tmp_cur + pic_stride_x8;
      tmp_ref_row = tmp_ref + pic_stride_x8;
      vec_l_sad = zero;
      vec_l_sd  = zero;
      vec_l_mad = zero;
      DUP4_ARG2(__lasx_xvldx,
                tmp_cur_row, iStridex0,
                tmp_cur_row, iStridex1,
                tmp_cur_row, iStridex2,
                tmp_cur_row, iStridex3,
                src0, src1, src2, src3);
      DUP4_ARG2(__lasx_xvldx,
                tmp_cur_row, iStridex4,
                tmp_cur_row, iStridex5,
                tmp_cur_row, iStridex6,
                tmp_cur_row, iStridex7,
                src4, src5, src6, src7);
      DUP4_ARG2(__lasx_xvldx,
                tmp_ref_row, iStridex0,
                tmp_ref_row, iStridex1,
                tmp_ref_row, iStridex2,
                tmp_ref_row, iStridex3,
                vec0, vec1, vec2, vec3);
      DUP4_ARG2(__lasx_xvldx,
                tmp_ref_row, iStridex4,
                tmp_ref_row, iStridex5,
                tmp_ref_row, iStridex6,
                tmp_ref_row, iStridex7,
                vec4, vec5, vec6, vec7);
      src0 = __lasx_vext2xv_hu_bu(src0);
      src1 = __lasx_vext2xv_hu_bu(src1);
      src2 = __lasx_vext2xv_hu_bu(src2);
      src3 = __lasx_vext2xv_hu_bu(src3);
      src4 = __lasx_vext2xv_hu_bu(src4);
      src5 = __lasx_vext2xv_hu_bu(src5);
      src6 = __lasx_vext2xv_hu_bu(src6);
      src7 = __lasx_vext2xv_hu_bu(src7);
      vec0 = __lasx_vext2xv_hu_bu(vec0);
      vec1 = __lasx_vext2xv_hu_bu(vec1);
      vec2 = __lasx_vext2xv_hu_bu(vec2);
      vec3 = __lasx_vext2xv_hu_bu(vec3);
      vec4 = __lasx_vext2xv_hu_bu(vec4);
      vec5 = __lasx_vext2xv_hu_bu(vec5);
      vec6 = __lasx_vext2xv_hu_bu(vec6);
      vec7 = __lasx_vext2xv_hu_bu(vec7);
      CALC_SAD_SD_MAD(src0, vec0);
      CALC_SAD_SD_MAD(src1, vec1);
      CALC_SAD_SD_MAD(src2, vec2);
      CALC_SAD_SD_MAD(src3, vec3);
      CALC_SAD_SD_MAD(src4, vec4);
      CALC_SAD_SD_MAD(src5, vec5);
      CALC_SAD_SD_MAD(src6, vec6);
      CALC_SAD_SD_MAD(src7, vec7);
      LASX_HADD_UH_U32(vec_l_sad, l_sad_l, l_sad_h);
      LASX_HADD_SH_S32(vec_l_sd, l_sd_l, l_sd_h);
      LASX_SELECT_MAX_H(vec_l_mad, l_mad_l, l_mad_h);
      iFrameSad += l_sad_l + l_sad_h;
      pSad8x8[index + 2] = l_sad_l;
      pSd8x8 [index + 2] = l_sd_l;
      pMad8x8[index + 2] = l_mad_l;
      pSad8x8[index + 3] = l_sad_h;
      pSd8x8 [index + 3] = l_sd_h;
      pMad8x8[index + 3] = l_mad_h;
      tmp_ref += 16;  /* advance to the next macroblock column */
      tmp_cur += 16;
      ++mb_index;
    }
    tmp_ref += step;  /* wrap to the start of the next macroblock row */
    tmp_cur += step;
  }
  *pFrameSad = iFrameSad;
}
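/* A minimal usage sketch (not part of the original file); every name below
 * except VAACalcSadBgd_lasx is hypothetical. The caller must supply one
 * output slot per 8x8 block, i.e. four per 16x16 macroblock, and picture
 * dimensions that are multiples of 16:
 *
 *   enum { kWidth = 320, kHeight = 192, kStride = 320 };
 *   enum { kBlocks = (kWidth >> 4) * (kHeight >> 4) * 4 };
 *   uint8_t cur[kStride * kHeight], ref[kStride * kHeight];  // filled elsewhere
 *   int32_t frame_sad, sad8x8[kBlocks], sd8x8[kBlocks];
 *   uint8_t mad8x8[kBlocks];
 *   VAACalcSadBgd_lasx (cur, ref, kWidth, kHeight, kStride,
 *                       &frame_sad, sad8x8, sd8x8, mad8x8);
 */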