1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vpx_dsp_rtcd.h"
12 #include "vpx_dsp/mips/macros_msa.h"
13
sub_blk_4x4_msa(const uint8_t * src_ptr,int32_t src_stride,const uint8_t * pred_ptr,int32_t pred_stride,int16_t * diff_ptr,int32_t diff_stride)14 static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
15 const uint8_t *pred_ptr, int32_t pred_stride,
16 int16_t *diff_ptr, int32_t diff_stride) {
17 uint32_t src0, src1, src2, src3;
18 uint32_t pred0, pred1, pred2, pred3;
19 v16i8 src = { 0 };
20 v16i8 pred = { 0 };
21 v16u8 src_l0, src_l1;
22 v8i16 diff0, diff1;
23
24 LW4(src_ptr, src_stride, src0, src1, src2, src3);
25 LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3);
26 INSERT_W4_SB(src0, src1, src2, src3, src);
27 INSERT_W4_SB(pred0, pred1, pred2, pred3, pred);
28 ILVRL_B2_UB(src, pred, src_l0, src_l1);
29 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
30 ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride));
31 }
32
sub_blk_8x8_msa(const uint8_t * src_ptr,int32_t src_stride,const uint8_t * pred_ptr,int32_t pred_stride,int16_t * diff_ptr,int32_t diff_stride)33 static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
34 const uint8_t *pred_ptr, int32_t pred_stride,
35 int16_t *diff_ptr, int32_t diff_stride) {
36 uint32_t loop_cnt;
37 uint64_t src0, src1, pred0, pred1;
38 v16i8 src = { 0 };
39 v16i8 pred = { 0 };
40 v16u8 src_l0, src_l1;
41 v8i16 diff0, diff1;
42
43 for (loop_cnt = 4; loop_cnt--;) {
44 LD2(src_ptr, src_stride, src0, src1);
45 src_ptr += (2 * src_stride);
46 LD2(pred_ptr, pred_stride, pred0, pred1);
47 pred_ptr += (2 * pred_stride);
48
49 INSERT_D2_SB(src0, src1, src);
50 INSERT_D2_SB(pred0, pred1, pred);
51 ILVRL_B2_UB(src, pred, src_l0, src_l1);
52 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
53 ST_SH2(diff0, diff1, diff_ptr, diff_stride);
54 diff_ptr += (2 * diff_stride);
55 }
56 }
57
sub_blk_16x16_msa(const uint8_t * src,int32_t src_stride,const uint8_t * pred,int32_t pred_stride,int16_t * diff,int32_t diff_stride)58 static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
59 const uint8_t *pred, int32_t pred_stride,
60 int16_t *diff, int32_t diff_stride) {
61 int8_t count;
62 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
63 v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
64 v16u8 src_l0, src_l1;
65 v8i16 diff0, diff1;
66
67 for (count = 2; count--;) {
68 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
69 src += (8 * src_stride);
70
71 LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6,
72 pred7);
73 pred += (8 * pred_stride);
74
75 ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
76 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
77 ST_SH2(diff0, diff1, diff, 8);
78 diff += diff_stride;
79
80 ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
81 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
82 ST_SH2(diff0, diff1, diff, 8);
83 diff += diff_stride;
84
85 ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
86 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
87 ST_SH2(diff0, diff1, diff, 8);
88 diff += diff_stride;
89
90 ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
91 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
92 ST_SH2(diff0, diff1, diff, 8);
93 diff += diff_stride;
94
95 ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
96 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
97 ST_SH2(diff0, diff1, diff, 8);
98 diff += diff_stride;
99
100 ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
101 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
102 ST_SH2(diff0, diff1, diff, 8);
103 diff += diff_stride;
104
105 ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
106 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
107 ST_SH2(diff0, diff1, diff, 8);
108 diff += diff_stride;
109
110 ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
111 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
112 ST_SH2(diff0, diff1, diff, 8);
113 diff += diff_stride;
114 }
115 }
116
sub_blk_32x32_msa(const uint8_t * src,int32_t src_stride,const uint8_t * pred,int32_t pred_stride,int16_t * diff,int32_t diff_stride)117 static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride,
118 const uint8_t *pred, int32_t pred_stride,
119 int16_t *diff, int32_t diff_stride) {
120 uint32_t loop_cnt;
121 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
122 v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
123 v16u8 src_l0, src_l1;
124 v8i16 diff0, diff1;
125
126 for (loop_cnt = 8; loop_cnt--;) {
127 LD_SB2(src, 16, src0, src1);
128 src += src_stride;
129 LD_SB2(src, 16, src2, src3);
130 src += src_stride;
131 LD_SB2(src, 16, src4, src5);
132 src += src_stride;
133 LD_SB2(src, 16, src6, src7);
134 src += src_stride;
135
136 LD_SB2(pred, 16, pred0, pred1);
137 pred += pred_stride;
138 LD_SB2(pred, 16, pred2, pred3);
139 pred += pred_stride;
140 LD_SB2(pred, 16, pred4, pred5);
141 pred += pred_stride;
142 LD_SB2(pred, 16, pred6, pred7);
143 pred += pred_stride;
144
145 ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
146 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
147 ST_SH2(diff0, diff1, diff, 8);
148 ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
149 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
150 ST_SH2(diff0, diff1, diff + 16, 8);
151 diff += diff_stride;
152
153 ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
154 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
155 ST_SH2(diff0, diff1, diff, 8);
156 ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
157 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
158 ST_SH2(diff0, diff1, diff + 16, 8);
159 diff += diff_stride;
160
161 ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
162 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
163 ST_SH2(diff0, diff1, diff, 8);
164 ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
165 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
166 ST_SH2(diff0, diff1, diff + 16, 8);
167 diff += diff_stride;
168
169 ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
170 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
171 ST_SH2(diff0, diff1, diff, 8);
172 ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
173 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
174 ST_SH2(diff0, diff1, diff + 16, 8);
175 diff += diff_stride;
176 }
177 }
178
/* Pixel-wise difference of a 64x64 block: diff = src - pred.
 * Each 64-byte row needs four 16-byte vectors; the loop handles 2 rows
 * per iteration (32 iterations total). Per vector pair: interleave src
 * and pred bytes, horizontally subtract into signed halfwords, store the
 * two 8-lane results contiguously at the matching 16-halfword offset. */
static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *pred, int32_t pred_stride,
                              int16_t *diff, int32_t diff_stride) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  for (loop_cnt = 32; loop_cnt--;) {
    /* Load two full 64-byte rows (4 vectors each) of src and pred. */
    LD_SB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_SB4(src, 16, src4, src5, src6, src7);
    src += src_stride;

    LD_SB4(pred, 16, pred0, pred1, pred2, pred3);
    pred += pred_stride;
    LD_SB4(pred, 16, pred4, pred5, pred6, pred7);
    pred += pred_stride;

    /* First row: quarters at halfword offsets 0, 16, 32, 48. */
    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 32, 8);
    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 48, 8);
    diff += diff_stride;

    /* Second row: same layout. */
    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 32, 8);
    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 48, 8);
    diff += diff_stride;
  }
}
228
/* Public entry point: compute diff = src - pred for a rows x cols block.
 * Square power-of-two sizes from 4x4 to 64x64 dispatch to the MSA
 * kernels above; every other geometry falls back to the C reference
 * implementation. */
void vpx_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr,
                            ptrdiff_t diff_stride, const uint8_t *src_ptr,
                            ptrdiff_t src_stride, const uint8_t *pred_ptr,
                            ptrdiff_t pred_stride) {
  /* Non-square blocks have no MSA kernel; use the C path and return. */
  if (rows != cols) {
    vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
                         pred_ptr, pred_stride);
    return;
  }

  switch (rows) {
    case 4:
      sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                      diff_stride);
      break;
    case 8:
      sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                      diff_stride);
      break;
    case 16:
      sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                        diff_stride);
      break;
    case 32:
      sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                        diff_stride);
      break;
    case 64:
      sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                        diff_stride);
      break;
    default:
      /* Square but unsupported size: C fallback. */
      vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
                           src_stride, pred_ptr, pred_stride);
      break;
  }
}
265