/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 #include <stdlib.h>
11
12 #include "./vpx_dsp_rtcd.h"
13 #include "vpx_dsp/mips/macros_msa.h"
14
vpx_avg_8x8_msa(const uint8_t * src,int32_t src_stride)15 uint32_t vpx_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
16 uint32_t sum_out;
17 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
18 v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
19 v4u32 sum = { 0 };
20
21 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
22 HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3);
23 HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7);
24 ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6);
25 ADD2(sum0, sum2, sum4, sum6, sum0, sum4);
26 sum0 += sum4;
27
28 sum = __msa_hadd_u_w(sum0, sum0);
29 sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum);
30 sum = __msa_hadd_u_w(sum0, sum0);
31 sum = (v4u32)__msa_srari_w((v4i32)sum, 6);
32 sum_out = __msa_copy_u_w((v4i32)sum, 0);
33
34 return sum_out;
35 }
36
vpx_avg_4x4_msa(const uint8_t * src,int32_t src_stride)37 uint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
38 uint32_t sum_out;
39 uint32_t src0, src1, src2, src3;
40 v16u8 vec = { 0 };
41 v8u16 sum0;
42 v4u32 sum1;
43 v2u64 sum2;
44
45 LW4(src, src_stride, src0, src1, src2, src3);
46 INSERT_W4_UB(src0, src1, src2, src3, vec);
47
48 sum0 = __msa_hadd_u_h(vec, vec);
49 sum1 = __msa_hadd_u_w(sum0, sum0);
50 sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1);
51 sum1 = __msa_hadd_u_w(sum0, sum0);
52 sum2 = __msa_hadd_u_d(sum1, sum1);
53 sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4);
54 sum_out = __msa_copy_u_w((v4i32)sum1, 0);
55
56 return sum_out;
57 }
58
/* 8x8 Hadamard transform of int16 residuals at `src` (row stride
 * `src_stride`, in elements); the 64 output coefficients are written to
 * `dst` with a fixed stride of 8. */
void vpx_hadamard_8x8_msa(const int16_t *src, ptrdiff_t src_stride,
                          int16_t *dst) {
  v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  /* First pass (rows): three butterfly stages form an 8-point Hadamard. */
  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);
  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
              tmp4, tmp5, tmp1, tmp6, tmp2);
  /* Transpose so the second pass works on what were the columns. */
  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
                     src2, src3, src4, src5, src6, src7);
  /* Second pass (columns): same three butterfly stages. */
  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);
  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
              tmp4, tmp5, tmp1, tmp6, tmp2);
  /* Transpose back to row order and store the coefficients. */
  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
                     src2, src3, src4, src5, src6, src7);
  ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst, 8);
}
83
/* 16x16 Hadamard transform: computes four 8x8 Hadamard transforms (one per
 * quadrant of the 16x16 residual block) into dst[0..255], then combines the
 * four 64-coefficient groups with a final butterfly stage, halving (>> 1)
 * to keep the 16-bit range. */
void vpx_hadamard_16x16_msa(const int16_t *src, ptrdiff_t src_stride,
                            int16_t *dst) {
  v8i16 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v8i16 src11, src12, src13, src14, src15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  v8i16 tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;

  /* Load the top 16x8 half: src0-7 = left 8 columns, src8-15 = right 8. */
  LD_SH2(src, 8, src0, src8);
  src += src_stride;
  LD_SH2(src, 8, src1, src9);
  src += src_stride;
  LD_SH2(src, 8, src2, src10);
  src += src_stride;
  LD_SH2(src, 8, src3, src11);
  src += src_stride;
  LD_SH2(src, 8, src4, src12);
  src += src_stride;
  LD_SH2(src, 8, src5, src13);
  src += src_stride;
  LD_SH2(src, 8, src6, src14);
  src += src_stride;
  LD_SH2(src, 8, src7, src15);
  src += src_stride;

  /* First butterfly stage for both top quadrants. */
  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
              tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);

  /* Top-left quadrant: finish row pass, transpose, do column pass. */
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);
  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
              tmp4, tmp5, tmp1, tmp6, tmp2);
  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
                     src2, src3, src4, src5, src6, src7);
  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);
  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
              tmp4, tmp5, tmp1, tmp6, tmp2);
  /* NOTE: src11 is used here as scratch in place of src3; it is
   * overwritten by the next butterfly before its old value is read. */
  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
                     src2, src11, src4, src5, src6, src7);
  ST_SH8(src0, src1, src2, src11, src4, src5, src6, src7, dst, 8);

  /* Top-right quadrant: same pipeline on src8-15 / tmp8-15. */
  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
              src12, src13, src15, src14, src11, src10);
  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
              tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8,
                     src9, src10, src11, src12, src13, src14, src15);
  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
              tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
              src12, src13, src15, src14, src11, src10);
  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
              tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0,
                     res1, res2, res3, res4, res5, res6, res7);

  /* Load the bottom 16x8 half (interleaved with the store below to
   * overlap memory traffic). */
  LD_SH2(src, 8, src0, src8);
  src += src_stride;
  LD_SH2(src, 8, src1, src9);
  src += src_stride;
  LD_SH2(src, 8, src2, src10);
  src += src_stride;
  LD_SH2(src, 8, src3, src11);
  src += src_stride;

  ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 64, 8);

  LD_SH2(src, 8, src4, src12);
  src += src_stride;
  LD_SH2(src, 8, src5, src13);
  src += src_stride;
  LD_SH2(src, 8, src6, src14);
  src += src_stride;
  LD_SH2(src, 8, src7, src15);
  src += src_stride;

  /* First butterfly stage for both bottom quadrants. */
  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
              tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);

  /* Bottom-left quadrant. */
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);
  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
              tmp4, tmp5, tmp1, tmp6, tmp2);
  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
                     src2, src3, src4, src5, src6, src7);
  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);
  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
              tmp4, tmp5, tmp1, tmp6, tmp2);
  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
                     src2, src3, src4, src5, src6, src7);
  ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst + 2 * 64, 8);

  /* Bottom-right quadrant. */
  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
              src12, src13, src15, src14, src11, src10);
  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
              tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8,
                     src9, src10, src11, src12, src13, src14, src15);
  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
              tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
              src12, src13, src15, src14, src11, src10);
  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
              tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0,
                     res1, res2, res3, res4, res5, res6, res7);
  ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 3 * 64, 8);

  /* Combine the four quadrant transforms: for each group of 16 columns,
   * butterfly across the four 64-coefficient blocks with a >> 1
   * normalization, writing the results back in place. */
  LD_SH4(dst, 64, src0, src1, src2, src3);
  LD_SH4(dst + 8, 64, src4, src5, src6, src7);

  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);

  ST_SH4(src0, src1, src2, src3, dst, 64);
  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
  dst += 16;

  LD_SH4(dst, 64, src0, src1, src2, src3);
  LD_SH4(dst + 8, 64, src4, src5, src6, src7);

  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);

  ST_SH4(src0, src1, src2, src3, dst, 64);
  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
  dst += 16;

  LD_SH4(dst, 64, src0, src1, src2, src3);
  LD_SH4(dst + 8, 64, src4, src5, src6, src7);

  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);

  ST_SH4(src0, src1, src2, src3, dst, 64);
  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
  dst += 16;

  LD_SH4(dst, 64, src0, src1, src2, src3);
  LD_SH4(dst + 8, 64, src4, src5, src6, src7);

  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
              tmp6, tmp7, tmp5, tmp3, tmp1);
  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
              src5, src7, src6, src3, src2);

  ST_SH4(src0, src1, src2, src3, dst, 64);
  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
}
256
vpx_satd_msa(const int16_t * data,int length)257 int vpx_satd_msa(const int16_t *data, int length) {
258 int i, satd;
259 v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
260 v8i16 src8, src9, src10, src11, src12, src13, src14, src15;
261 v8i16 zero = { 0 };
262 v8u16 tmp0_h, tmp1_h, tmp2_h, tmp3_h, tmp4_h, tmp5_h, tmp6_h, tmp7_h;
263 v4u32 tmp0_w = { 0 };
264
265 if (16 == length) {
266 LD_SH2(data, 8, src0, src1);
267 tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
268 tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
269 tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h);
270 tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
271 satd = HADD_UW_U32(tmp0_w);
272 } else if (64 == length) {
273 LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7);
274
275 tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
276 tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
277 tmp2_h = (v8u16)__msa_asub_s_h(src2, zero);
278 tmp3_h = (v8u16)__msa_asub_s_h(src3, zero);
279 tmp4_h = (v8u16)__msa_asub_s_h(src4, zero);
280 tmp5_h = (v8u16)__msa_asub_s_h(src5, zero);
281 tmp6_h = (v8u16)__msa_asub_s_h(src6, zero);
282 tmp7_h = (v8u16)__msa_asub_s_h(src7, zero);
283
284 tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h);
285 tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
286 tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
287 tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
288 tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
289 tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
290 tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
291 tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
292
293 satd = HADD_UW_U32(tmp0_w);
294 } else if (256 == length) {
295 for (i = 0; i < 2; ++i) {
296 LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7);
297 data += 8 * 8;
298 LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15);
299 data += 8 * 8;
300
301 tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
302 tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
303 tmp2_h = (v8u16)__msa_asub_s_h(src2, zero);
304 tmp3_h = (v8u16)__msa_asub_s_h(src3, zero);
305 tmp4_h = (v8u16)__msa_asub_s_h(src4, zero);
306 tmp5_h = (v8u16)__msa_asub_s_h(src5, zero);
307 tmp6_h = (v8u16)__msa_asub_s_h(src6, zero);
308 tmp7_h = (v8u16)__msa_asub_s_h(src7, zero);
309
310 tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
311 tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
312 tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
313 tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
314 tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
315 tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
316 tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
317 tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
318
319 tmp0_h = (v8u16)__msa_asub_s_h(src8, zero);
320 tmp1_h = (v8u16)__msa_asub_s_h(src9, zero);
321 tmp2_h = (v8u16)__msa_asub_s_h(src10, zero);
322 tmp3_h = (v8u16)__msa_asub_s_h(src11, zero);
323 tmp4_h = (v8u16)__msa_asub_s_h(src12, zero);
324 tmp5_h = (v8u16)__msa_asub_s_h(src13, zero);
325 tmp6_h = (v8u16)__msa_asub_s_h(src14, zero);
326 tmp7_h = (v8u16)__msa_asub_s_h(src15, zero);
327
328 tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
329 tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
330 tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
331 tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
332 tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
333 tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
334 tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
335 tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
336 }
337
338 satd = HADD_UW_U32(tmp0_w);
339 } else if (1024 == length) {
340 for (i = 0; i < 8; ++i) {
341 LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7);
342 data += 8 * 8;
343 LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15);
344 data += 8 * 8;
345
346 tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
347 tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
348 tmp2_h = (v8u16)__msa_asub_s_h(src2, zero);
349 tmp3_h = (v8u16)__msa_asub_s_h(src3, zero);
350 tmp4_h = (v8u16)__msa_asub_s_h(src4, zero);
351 tmp5_h = (v8u16)__msa_asub_s_h(src5, zero);
352 tmp6_h = (v8u16)__msa_asub_s_h(src6, zero);
353 tmp7_h = (v8u16)__msa_asub_s_h(src7, zero);
354
355 tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
356 tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
357 tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
358 tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
359 tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
360 tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
361 tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
362 tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
363
364 tmp0_h = (v8u16)__msa_asub_s_h(src8, zero);
365 tmp1_h = (v8u16)__msa_asub_s_h(src9, zero);
366 tmp2_h = (v8u16)__msa_asub_s_h(src10, zero);
367 tmp3_h = (v8u16)__msa_asub_s_h(src11, zero);
368 tmp4_h = (v8u16)__msa_asub_s_h(src12, zero);
369 tmp5_h = (v8u16)__msa_asub_s_h(src13, zero);
370 tmp6_h = (v8u16)__msa_asub_s_h(src14, zero);
371 tmp7_h = (v8u16)__msa_asub_s_h(src15, zero);
372
373 tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
374 tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
375 tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
376 tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
377 tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
378 tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
379 tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
380 tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
381 }
382
383 satd = HADD_UW_U32(tmp0_w);
384 } else {
385 satd = 0;
386
387 for (i = 0; i < length; ++i) {
388 satd += abs(data[i]);
389 }
390 }
391
392 return satd;
393 }
394
/* For each of the 16 columns of a 16-pixel-wide reference block, sums the
 * column over `height` rows and stores the sum divided by (height / 2)
 * into hbuf[0..15]. Vector paths handle heights 16/32/64 (divide done as
 * an arithmetic shift by 3/4/5); any other height uses the scalar loop. */
void vpx_int_pro_row_msa(int16_t hbuf[16], const uint8_t *ref,
                         const int ref_stride, const int height) {
  int i;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  /* Running column sums: hbuf_r = columns 0-7, hbuf_l = columns 8-15. */
  v8i16 hbuf_r = { 0 };
  v8i16 hbuf_l = { 0 };
  v8i16 ref0_r, ref0_l, ref1_r, ref1_l, ref2_r, ref2_l, ref3_r, ref3_l;
  v8i16 ref4_r, ref4_l, ref5_r, ref5_l, ref6_r, ref6_l, ref7_r, ref7_l;

  if (16 == height) {
    /* Two passes of 8 rows each. */
    for (i = 2; i--;) {
      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
      ref += 8 * ref_stride;
      /* Widen each row of bytes to two vectors of int16. */
      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
    }

    /* Normalize: divide by height/2 = 8. */
    SRA_2V(hbuf_r, hbuf_l, 3);
    ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
  } else if (32 == height) {
    /* Two iterations, each accumulating 16 rows (two 8-row groups). */
    for (i = 2; i--;) {
      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
      ref += 8 * ref_stride;
      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
      ref += 8 * ref_stride;
      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
    }

    /* Normalize: divide by height/2 = 16. */
    SRA_2V(hbuf_r, hbuf_l, 4);
    ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
  } else if (64 == height) {
    /* Four iterations of 16 rows. */
    for (i = 4; i--;) {
      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
      ref += 8 * ref_stride;
      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
      ref += 8 * ref_stride;
      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
    }

    /* Normalize: divide by height/2 = 32. */
    SRA_2V(hbuf_r, hbuf_l, 5);
    ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
  } else {
    /* Scalar fallback: sum each column, then true division by height/2. */
    const int norm_factor = height >> 1;
    int cnt;

    for (cnt = 0; cnt < 16; cnt++) {
      hbuf[cnt] = 0;
    }

    for (i = 0; i < height; ++i) {
      for (cnt = 0; cnt < 16; cnt++) {
        hbuf[cnt] += ref[cnt];
      }

      ref += ref_stride;
    }

    for (cnt = 0; cnt < 16; cnt++) {
      hbuf[cnt] /= norm_factor;
    }
  }
}
533
vpx_int_pro_col_msa(const uint8_t * ref,const int width)534 int16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width) {
535 int16_t sum;
536 v16u8 ref0, ref1, ref2, ref3;
537 v8u16 ref0_h;
538
539 if (16 == width) {
540 ref0 = LD_UB(ref);
541 ref0_h = __msa_hadd_u_h(ref0, ref0);
542 sum = HADD_UH_U32(ref0_h);
543 } else if (32 == width) {
544 LD_UB2(ref, 16, ref0, ref1);
545 ref0_h = __msa_hadd_u_h(ref0, ref0);
546 ref0_h += __msa_hadd_u_h(ref1, ref1);
547 sum = HADD_UH_U32(ref0_h);
548 } else if (64 == width) {
549 LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
550 ref0_h = __msa_hadd_u_h(ref0, ref0);
551 ref0_h += __msa_hadd_u_h(ref1, ref1);
552 ref0_h += __msa_hadd_u_h(ref2, ref2);
553 ref0_h += __msa_hadd_u_h(ref3, ref3);
554 sum = HADD_UH_U32(ref0_h);
555 } else {
556 int idx;
557
558 sum = 0;
559 for (idx = 0; idx < width; ++idx) {
560 sum += ref[idx];
561 }
562 }
563
564 return sum;
565 }
566
/* Variance of the element-wise difference between two int16 vectors of
 * length (4 << bwl): var = sse - mean^2 / length, where mean is the sum
 * of the differences and sse the sum of their squares. Vector paths
 * handle bwl 2/3/4 (lengths 16/32/64); other values use the scalar loop.
 * NOTE(review): the vector path's difference appears to be src - ref
 * while the scalar fallback uses ref - src; var is unaffected since mean
 * is squared — confirm against the C reference. */
int vpx_vector_var_msa(const int16_t *ref, const int16_t *src, const int bwl) {
  int sse, mean, var;
  v8i16 src0, src1, src2, src3, src4, src5, src6, src7, ref0, ref1, ref2;
  v8i16 ref3, ref4, ref5, ref6, ref7, src_l0_m, src_l1_m, src_l2_m, src_l3_m;
  v8i16 src_l4_m, src_l5_m, src_l6_m, src_l7_m;
  v4i32 res_l0_m, res_l1_m, res_l2_m, res_l3_m, res_l4_m, res_l5_m, res_l6_m;
  v4i32 res_l7_m, mean_v;
  v2i64 sse_v;

  if (2 == bwl) {
    LD_SH2(src, 8, src0, src1);
    LD_SH2(ref, 8, ref0, ref1);

    /* Interleave src/ref pairs, then horizontal-subtract to get the
     * per-element differences widened to 32 bits. */
    ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m);
    ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m);
    HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
    HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
    /* sse: dot-product of each difference vector with itself,
     * accumulated into 64-bit lanes. */
    sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m);
    sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m);
    DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
    mean_v = res_l0_m + res_l1_m;
    mean_v += res_l2_m + res_l3_m;

    /* Fold the two 64-bit sse lanes together; low 32 bits hold the sum. */
    sse_v += __msa_splati_d(sse_v, 1);
    sse = __msa_copy_s_w((v4i32)sse_v, 0);

    mean = HADD_SW_S32(mean_v);
  } else if (3 == bwl) {
    LD_SH4(src, 8, src0, src1, src2, src3);
    LD_SH4(ref, 8, ref0, ref1, ref2, ref3);

    ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m);
    ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m);
    ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m);
    ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m);
    HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
    HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
    HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m);
    HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m);
    sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m);
    sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m);
    DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
    DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v);
    DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v);
    mean_v = res_l0_m + res_l1_m;
    mean_v += res_l2_m + res_l3_m;
    mean_v += res_l4_m + res_l5_m;
    mean_v += res_l6_m + res_l7_m;

    sse_v += __msa_splati_d(sse_v, 1);
    sse = __msa_copy_s_w((v4i32)sse_v, 0);

    mean = HADD_SW_S32(mean_v);
  } else if (4 == bwl) {
    LD_SH8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_SH8(ref, 8, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);

    /* First half (src0-3 / ref0-3). */
    ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m);
    ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m);
    ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m);
    ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m);
    HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
    HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
    HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m);
    HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m);
    sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m);
    sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m);
    DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
    DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v);
    DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v);
    mean_v = res_l0_m + res_l1_m;
    mean_v += res_l2_m + res_l3_m;
    mean_v += res_l4_m + res_l5_m;
    mean_v += res_l6_m + res_l7_m;

    /* Second half (src4-7 / ref4-7), reusing the same temporaries. */
    ILVRL_H2_SH(src4, ref4, src_l0_m, src_l1_m);
    ILVRL_H2_SH(src5, ref5, src_l2_m, src_l3_m);
    ILVRL_H2_SH(src6, ref6, src_l4_m, src_l5_m);
    ILVRL_H2_SH(src7, ref7, src_l6_m, src_l7_m);
    HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
    HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
    HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m);
    HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m);
    DPADD_SD2_SD(res_l0_m, res_l1_m, sse_v, sse_v);
    DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
    DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v);
    DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v);
    mean_v += res_l0_m + res_l1_m;
    mean_v += res_l2_m + res_l3_m;
    mean_v += res_l4_m + res_l5_m;
    mean_v += res_l6_m + res_l7_m;

    sse_v += __msa_splati_d(sse_v, 1);
    sse = __msa_copy_s_w((v4i32)sse_v, 0);

    mean = HADD_SW_S32(mean_v);
  } else {
    /* Scalar fallback for other bwl values. */
    int i;
    const int width = 4 << bwl;

    sse = 0;
    mean = 0;

    for (i = 0; i < width; ++i) {
      const int diff = ref[i] - src[i];

      mean += diff;
      sse += diff * diff;
    }
  }

  /* mean^2 >> (bwl + 2) == mean^2 / (4 << bwl) == mean^2 / length. */
  var = sse - ((mean * mean) >> (bwl + 2));

  return var;
}
682
vpx_minmax_8x8_msa(const uint8_t * s,int p,const uint8_t * d,int dp,int * min,int * max)683 void vpx_minmax_8x8_msa(const uint8_t *s, int p, const uint8_t *d, int dp,
684 int *min, int *max) {
685 v16u8 s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7;
686 v16u8 diff0, diff1, diff2, diff3, min0, min1, max0, max1;
687
688 LD_UB8(s, p, s0, s1, s2, s3, s4, s5, s6, s7);
689 LD_UB8(d, dp, d0, d1, d2, d3, d4, d5, d6, d7);
690 PCKEV_D4_UB(s1, s0, s3, s2, s5, s4, s7, s6, s0, s1, s2, s3);
691 PCKEV_D4_UB(d1, d0, d3, d2, d5, d4, d7, d6, d0, d1, d2, d3);
692
693 diff0 = __msa_asub_u_b(s0, d0);
694 diff1 = __msa_asub_u_b(s1, d1);
695 diff2 = __msa_asub_u_b(s2, d2);
696 diff3 = __msa_asub_u_b(s3, d3);
697
698 min0 = __msa_min_u_b(diff0, diff1);
699 min1 = __msa_min_u_b(diff2, diff3);
700 min0 = __msa_min_u_b(min0, min1);
701
702 max0 = __msa_max_u_b(diff0, diff1);
703 max1 = __msa_max_u_b(diff2, diff3);
704 max0 = __msa_max_u_b(max0, max1);
705
706 min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 8);
707 min0 = __msa_min_u_b(min0, min1);
708 max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 8);
709 max0 = __msa_max_u_b(max0, max1);
710
711 min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 4);
712 min0 = __msa_min_u_b(min0, min1);
713 max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 4);
714 max0 = __msa_max_u_b(max0, max1);
715
716 min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 2);
717 min0 = __msa_min_u_b(min0, min1);
718 max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 2);
719 max0 = __msa_max_u_b(max0, max1);
720
721 min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 1);
722 min0 = __msa_min_u_b(min0, min1);
723 max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 1);
724 max0 = __msa_max_u_b(max0, max1);
725
726 *min = min0[0];
727 *max = max0[0];
728 }
729