• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*!
2  * \copy
3  *     Copyright (c)  2009-2018, Cisco Systems
4  *     All rights reserved.
5  *
6  *     Redistribution and use in source and binary forms, with or without
7  *     modification, are permitted provided that the following conditions
8  *     are met:
9  *
10  *        * Redistributions of source code must retain the above copyright
11  *          notice, this list of conditions and the following disclaimer.
12  *
13  *        * Redistributions in binary form must reproduce the above copyright
14  *          notice, this list of conditions and the following disclaimer in
15  *          the documentation and/or other materials provided with the
16  *          distribution.
17  *
18  *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22  *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26  *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28  *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  *     POSSIBILITY OF SUCH DAMAGE.
30  *
31  *
32  * \file    copy_mb_lsx.c
33  *
34  * \brief   Loongson optimization
35  *
36  * \date    12/18/2021 Created
37  *
38  *************************************************************************************
39  */
40 
41 #include <stdint.h>
42 #include "loongson_intrinsics.h"
43 
WelsCopy8x8_lsx(uint8_t * pDst,int32_t iStrideD,uint8_t * pSrc,int32_t iStrideS)44 void WelsCopy8x8_lsx (uint8_t* pDst, int32_t iStrideD,
45                       uint8_t* pSrc, int32_t iStrideS) {
46 
47   int32_t iStride0 = 0;
48   int32_t iStride1 = iStrideS << 1;
49   int32_t iStride2 = iStride1 << 1;
50 
51   __m128i src0, src1, src2, src3, src4 ,src5, src6, src7;
52 
53   DUP4_ARG2(__lsx_vldx,
54             pSrc, iStride0,
55             pSrc, iStrideS,
56             pSrc, iStride1,
57             pSrc, iStride1 + iStrideS,
58             src0, src1, src2, src3);
59   pSrc += iStride2;
60   DUP4_ARG2(__lsx_vldx,
61             pSrc, iStride0,
62             pSrc, iStrideS,
63             pSrc, iStride1,
64             pSrc, iStride1 + iStrideS,
65             src4, src5, src6, src7);
66 
67   iStride1 = iStrideD << 1;
68 
69   __lsx_vstelm_d(src0, pDst, 0, 0);
70   __lsx_vstelm_d(src1, pDst + iStrideD, 0, 0);
71   pDst += iStride1;
72   __lsx_vstelm_d(src2, pDst, 0, 0);
73   __lsx_vstelm_d(src3, pDst + iStrideD, 0, 0);
74   pDst += iStride1;
75   __lsx_vstelm_d(src4, pDst, 0, 0);
76   __lsx_vstelm_d(src5, pDst + iStrideD, 0, 0);
77   pDst += iStride1;
78   __lsx_vstelm_d(src6, pDst, 0, 0);
79   __lsx_vstelm_d(src7, pDst + iStrideD, 0, 0);
80 }
81 
WelsCopy16x16_lsx(uint8_t * pDst,int32_t iStrideD,uint8_t * pSrc,int32_t iStrideS)82 void WelsCopy16x16_lsx (uint8_t* pDst, int32_t iStrideD,
83                         uint8_t* pSrc, int32_t iStrideS) {
84   int32_t iStride0 = 0;
85   int32_t iStride1 = iStrideS;
86   int32_t iStride2 = iStrideS << 1;
87   int32_t iStride3 = iStride2 + iStrideS;
88   int32_t iStride4 = iStrideS << 2;
89 
90   __m128i src0, src1, src2, src3, src4, src5, src6, src7;
91   __m128i src8, src9, src10, src11, src12, src13, src14, src15;
92 
93   DUP4_ARG2(__lsx_vldx,
94             pSrc, iStride0, pSrc, iStride1,
95             pSrc, iStride2, pSrc, iStride3,
96             src0, src1, src2, src3);
97   pSrc += iStride4;
98   DUP4_ARG2(__lsx_vldx,
99             pSrc, iStride0, pSrc, iStride1,
100             pSrc, iStride2, pSrc, iStride3,
101             src4, src5, src6, src7);
102   pSrc += iStride4;
103   DUP4_ARG2(__lsx_vldx,
104             pSrc, iStride0, pSrc, iStride1,
105             pSrc, iStride2, pSrc, iStride3,
106             src8, src9, src10, src11);
107   pSrc += iStride4;
108   DUP4_ARG2(__lsx_vldx,
109             pSrc, iStride0, pSrc, iStride1,
110             pSrc, iStride2, pSrc, iStride3,
111             src12, src13, src14, src15);
112 
113   iStride1 = iStrideD;
114   iStride2 = iStrideD << 1;
115   iStride3 = iStride2 + iStrideD;
116   iStride4 = iStrideD << 2;
117 
118   __lsx_vstx(src0, pDst, iStride0);
119   __lsx_vstx(src1, pDst, iStride1);
120   __lsx_vstx(src2, pDst, iStride2);
121   __lsx_vstx(src3, pDst, iStride3);
122   pDst += iStride4;
123   __lsx_vstx(src4, pDst, iStride0);
124   __lsx_vstx(src5, pDst, iStride1);
125   __lsx_vstx(src6, pDst, iStride2);
126   __lsx_vstx(src7, pDst, iStride3);
127   pDst += iStride4;
128   __lsx_vstx(src8, pDst, iStride0);
129   __lsx_vstx(src9, pDst, iStride1);
130   __lsx_vstx(src10, pDst, iStride2);
131   __lsx_vstx(src11, pDst, iStride3);
132   pDst += iStride4;
133   __lsx_vstx(src12, pDst, iStride0);
134   __lsx_vstx(src13, pDst, iStride1);
135   __lsx_vstx(src14, pDst, iStride2);
136   __lsx_vstx(src15, pDst, iStride3);
137 }
138 
WelsCopy16x16NotAligned_lsx(uint8_t * pDst,int32_t iStrideD,uint8_t * pSrc,int32_t iStrideS)139 void WelsCopy16x16NotAligned_lsx (uint8_t* pDst, int32_t iStrideD,
140                                   uint8_t* pSrc, int32_t iStrideS) {
141   int32_t iStride0 = 0;
142   int32_t iStride1 = iStrideS;
143   int32_t iStride2 = iStrideS << 1;
144   int32_t iStride3 = iStride2 + iStrideS;
145   int32_t iStride4 = iStrideS << 2;
146 
147   v16u8_b src0, src1, src2, src3, src4, src5, src6, src7;
148   v16u8_b src8, src9, src10, src11, src12, src13, src14, src15;
149 
150   DUP4_ARG2((v16u8_b)__lsx_vldx,
151             pSrc, iStride0, pSrc, iStride1,
152             pSrc, iStride2, pSrc, iStride3,
153             src0, src1, src2, src3);
154   pSrc += iStride4;
155   DUP4_ARG2((v16u8_b)__lsx_vldx,
156             pSrc, iStride0, pSrc, iStride1,
157             pSrc, iStride2, pSrc, iStride3,
158             src4, src5, src6, src7);
159   pSrc += iStride4;
160   DUP4_ARG2((v16u8_b)__lsx_vldx,
161             pSrc, iStride0, pSrc, iStride1,
162             pSrc, iStride2, pSrc, iStride3,
163             src8, src9, src10, src11);
164   pSrc += iStride4;
165   DUP4_ARG2((v16u8_b)__lsx_vldx,
166             pSrc, iStride0, pSrc, iStride1,
167             pSrc, iStride2, pSrc, iStride3,
168             src12, src13, src14, src15);
169 
170   iStride1 = iStrideD;
171   iStride2 = iStrideD << 1;
172   iStride3 = iStride2 + iStrideD;
173   iStride4 = iStrideD << 2;
174 
175   __lsx_vstx((__m128i)src0, pDst, iStride0);
176   __lsx_vstx((__m128i)src1, pDst, iStride1);
177   __lsx_vstx((__m128i)src2, pDst, iStride2);
178   __lsx_vstx((__m128i)src3, pDst, iStride3);
179   pDst += iStride4;
180   __lsx_vstx((__m128i)src4, pDst, iStride0);
181   __lsx_vstx((__m128i)src5, pDst, iStride1);
182   __lsx_vstx((__m128i)src6, pDst, iStride2);
183   __lsx_vstx((__m128i)src7, pDst, iStride3);
184   pDst += iStride4;
185   __lsx_vstx((__m128i)src8, pDst, iStride0);
186   __lsx_vstx((__m128i)src9, pDst, iStride1);
187   __lsx_vstx((__m128i)src10, pDst, iStride2);
188   __lsx_vstx((__m128i)src11, pDst, iStride3);
189   pDst += iStride4;
190   __lsx_vstx((__m128i)src12, pDst, iStride0);
191   __lsx_vstx((__m128i)src13, pDst, iStride1);
192   __lsx_vstx((__m128i)src14, pDst, iStride2);
193   __lsx_vstx((__m128i)src15, pDst, iStride3);
194 }
195