1 /*!
2 * \copy
3 * Copyright (c) 2009-2018, Cisco Systems
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * * Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 *
31 *
32 * \file copy_mb_lsx.c
33 *
34 * \brief Loongson optimization
35 *
36 * \date 12/18/2021 Created
37 *
38 *************************************************************************************
39 */
40
41 #include <stdint.h>
42 #include "loongson_intrinsics.h"
43
WelsCopy8x8_lsx(uint8_t * pDst,int32_t iStrideD,uint8_t * pSrc,int32_t iStrideS)44 void WelsCopy8x8_lsx (uint8_t* pDst, int32_t iStrideD,
45 uint8_t* pSrc, int32_t iStrideS) {
46
47 int32_t iStride0 = 0;
48 int32_t iStride1 = iStrideS << 1;
49 int32_t iStride2 = iStride1 << 1;
50
51 __m128i src0, src1, src2, src3, src4 ,src5, src6, src7;
52
53 DUP4_ARG2(__lsx_vldx,
54 pSrc, iStride0,
55 pSrc, iStrideS,
56 pSrc, iStride1,
57 pSrc, iStride1 + iStrideS,
58 src0, src1, src2, src3);
59 pSrc += iStride2;
60 DUP4_ARG2(__lsx_vldx,
61 pSrc, iStride0,
62 pSrc, iStrideS,
63 pSrc, iStride1,
64 pSrc, iStride1 + iStrideS,
65 src4, src5, src6, src7);
66
67 iStride1 = iStrideD << 1;
68
69 __lsx_vstelm_d(src0, pDst, 0, 0);
70 __lsx_vstelm_d(src1, pDst + iStrideD, 0, 0);
71 pDst += iStride1;
72 __lsx_vstelm_d(src2, pDst, 0, 0);
73 __lsx_vstelm_d(src3, pDst + iStrideD, 0, 0);
74 pDst += iStride1;
75 __lsx_vstelm_d(src4, pDst, 0, 0);
76 __lsx_vstelm_d(src5, pDst + iStrideD, 0, 0);
77 pDst += iStride1;
78 __lsx_vstelm_d(src6, pDst, 0, 0);
79 __lsx_vstelm_d(src7, pDst + iStrideD, 0, 0);
80 }
81
WelsCopy16x16_lsx(uint8_t * pDst,int32_t iStrideD,uint8_t * pSrc,int32_t iStrideS)82 void WelsCopy16x16_lsx (uint8_t* pDst, int32_t iStrideD,
83 uint8_t* pSrc, int32_t iStrideS) {
84 int32_t iStride0 = 0;
85 int32_t iStride1 = iStrideS;
86 int32_t iStride2 = iStrideS << 1;
87 int32_t iStride3 = iStride2 + iStrideS;
88 int32_t iStride4 = iStrideS << 2;
89
90 __m128i src0, src1, src2, src3, src4, src5, src6, src7;
91 __m128i src8, src9, src10, src11, src12, src13, src14, src15;
92
93 DUP4_ARG2(__lsx_vldx,
94 pSrc, iStride0, pSrc, iStride1,
95 pSrc, iStride2, pSrc, iStride3,
96 src0, src1, src2, src3);
97 pSrc += iStride4;
98 DUP4_ARG2(__lsx_vldx,
99 pSrc, iStride0, pSrc, iStride1,
100 pSrc, iStride2, pSrc, iStride3,
101 src4, src5, src6, src7);
102 pSrc += iStride4;
103 DUP4_ARG2(__lsx_vldx,
104 pSrc, iStride0, pSrc, iStride1,
105 pSrc, iStride2, pSrc, iStride3,
106 src8, src9, src10, src11);
107 pSrc += iStride4;
108 DUP4_ARG2(__lsx_vldx,
109 pSrc, iStride0, pSrc, iStride1,
110 pSrc, iStride2, pSrc, iStride3,
111 src12, src13, src14, src15);
112
113 iStride1 = iStrideD;
114 iStride2 = iStrideD << 1;
115 iStride3 = iStride2 + iStrideD;
116 iStride4 = iStrideD << 2;
117
118 __lsx_vstx(src0, pDst, iStride0);
119 __lsx_vstx(src1, pDst, iStride1);
120 __lsx_vstx(src2, pDst, iStride2);
121 __lsx_vstx(src3, pDst, iStride3);
122 pDst += iStride4;
123 __lsx_vstx(src4, pDst, iStride0);
124 __lsx_vstx(src5, pDst, iStride1);
125 __lsx_vstx(src6, pDst, iStride2);
126 __lsx_vstx(src7, pDst, iStride3);
127 pDst += iStride4;
128 __lsx_vstx(src8, pDst, iStride0);
129 __lsx_vstx(src9, pDst, iStride1);
130 __lsx_vstx(src10, pDst, iStride2);
131 __lsx_vstx(src11, pDst, iStride3);
132 pDst += iStride4;
133 __lsx_vstx(src12, pDst, iStride0);
134 __lsx_vstx(src13, pDst, iStride1);
135 __lsx_vstx(src14, pDst, iStride2);
136 __lsx_vstx(src15, pDst, iStride3);
137 }
138
WelsCopy16x16NotAligned_lsx(uint8_t * pDst,int32_t iStrideD,uint8_t * pSrc,int32_t iStrideS)139 void WelsCopy16x16NotAligned_lsx (uint8_t* pDst, int32_t iStrideD,
140 uint8_t* pSrc, int32_t iStrideS) {
141 int32_t iStride0 = 0;
142 int32_t iStride1 = iStrideS;
143 int32_t iStride2 = iStrideS << 1;
144 int32_t iStride3 = iStride2 + iStrideS;
145 int32_t iStride4 = iStrideS << 2;
146
147 v16u8_b src0, src1, src2, src3, src4, src5, src6, src7;
148 v16u8_b src8, src9, src10, src11, src12, src13, src14, src15;
149
150 DUP4_ARG2((v16u8_b)__lsx_vldx,
151 pSrc, iStride0, pSrc, iStride1,
152 pSrc, iStride2, pSrc, iStride3,
153 src0, src1, src2, src3);
154 pSrc += iStride4;
155 DUP4_ARG2((v16u8_b)__lsx_vldx,
156 pSrc, iStride0, pSrc, iStride1,
157 pSrc, iStride2, pSrc, iStride3,
158 src4, src5, src6, src7);
159 pSrc += iStride4;
160 DUP4_ARG2((v16u8_b)__lsx_vldx,
161 pSrc, iStride0, pSrc, iStride1,
162 pSrc, iStride2, pSrc, iStride3,
163 src8, src9, src10, src11);
164 pSrc += iStride4;
165 DUP4_ARG2((v16u8_b)__lsx_vldx,
166 pSrc, iStride0, pSrc, iStride1,
167 pSrc, iStride2, pSrc, iStride3,
168 src12, src13, src14, src15);
169
170 iStride1 = iStrideD;
171 iStride2 = iStrideD << 1;
172 iStride3 = iStride2 + iStrideD;
173 iStride4 = iStrideD << 2;
174
175 __lsx_vstx((__m128i)src0, pDst, iStride0);
176 __lsx_vstx((__m128i)src1, pDst, iStride1);
177 __lsx_vstx((__m128i)src2, pDst, iStride2);
178 __lsx_vstx((__m128i)src3, pDst, iStride3);
179 pDst += iStride4;
180 __lsx_vstx((__m128i)src4, pDst, iStride0);
181 __lsx_vstx((__m128i)src5, pDst, iStride1);
182 __lsx_vstx((__m128i)src6, pDst, iStride2);
183 __lsx_vstx((__m128i)src7, pDst, iStride3);
184 pDst += iStride4;
185 __lsx_vstx((__m128i)src8, pDst, iStride0);
186 __lsx_vstx((__m128i)src9, pDst, iStride1);
187 __lsx_vstx((__m128i)src10, pDst, iStride2);
188 __lsx_vstx((__m128i)src11, pDst, iStride3);
189 pDst += iStride4;
190 __lsx_vstx((__m128i)src12, pDst, iStride0);
191 __lsx_vstx((__m128i)src13, pDst, iStride1);
192 __lsx_vstx((__m128i)src14, pDst, iStride2);
193 __lsx_vstx((__m128i)src15, pDst, iStride3);
194 }
195