/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/mips/macros_msa.h"
15
sub_blk_4x4_msa(const uint8_t * src_ptr,int32_t src_stride,const uint8_t * pred_ptr,int32_t pred_stride,int16_t * diff_ptr,int32_t diff_stride)16 static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
17 const uint8_t *pred_ptr, int32_t pred_stride,
18 int16_t *diff_ptr, int32_t diff_stride) {
19 uint32_t src0, src1, src2, src3;
20 uint32_t pred0, pred1, pred2, pred3;
21 v16i8 src = { 0 };
22 v16i8 pred = { 0 };
23 v16u8 src_l0, src_l1;
24 v8i16 diff0, diff1;
25
26 LW4(src_ptr, src_stride, src0, src1, src2, src3);
27 LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3);
28 INSERT_W4_SB(src0, src1, src2, src3, src);
29 INSERT_W4_SB(pred0, pred1, pred2, pred3, pred);
30 ILVRL_B2_UB(src, pred, src_l0, src_l1);
31 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
32 ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride));
33 }
sub_blk_8x8_msa(const uint8_t * src_ptr,int32_t src_stride,const uint8_t * pred_ptr,int32_t pred_stride,int16_t * diff_ptr,int32_t diff_stride)35 static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
36 const uint8_t *pred_ptr, int32_t pred_stride,
37 int16_t *diff_ptr, int32_t diff_stride) {
38 uint32_t loop_cnt;
39 uint64_t src0, src1, pred0, pred1;
40 v16i8 src = { 0 };
41 v16i8 pred = { 0 };
42 v16u8 src_l0, src_l1;
43 v8i16 diff0, diff1;
44
45 for (loop_cnt = 4; loop_cnt--;) {
46 LD2(src_ptr, src_stride, src0, src1);
47 src_ptr += (2 * src_stride);
48 LD2(pred_ptr, pred_stride, pred0, pred1);
49 pred_ptr += (2 * pred_stride);
50
51 INSERT_D2_SB(src0, src1, src);
52 INSERT_D2_SB(pred0, pred1, pred);
53 ILVRL_B2_UB(src, pred, src_l0, src_l1);
54 HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
55 ST_SH2(diff0, diff1, diff_ptr, diff_stride);
56 diff_ptr += (2 * diff_stride);
57 }
58 }
/* Computes a 16x16 block of pixel differences (src - pred) as int16 values.
 * Each outer iteration loads 8 full 16-byte rows from both surfaces, then
 * for every row: interleaves src/pred bytes (low and high halves),
 * horizontally subtracts to two 8-element signed 16-bit vectors, and
 * stores them contiguously (inner stride 8 = one full 16-element row). */
static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *pred, int32_t pred_stride,
                              int16_t *diff, int32_t diff_stride) {
  int8_t count;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  /* Two passes of 8 rows each = 16 rows. */
  for (count = 2; count--;) {
    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);

    LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6,
           pred7);
    pred += (8 * pred_stride);

    /* Row 0: interleave bytes, widen by horizontal subtract, store the
     * two 8-element halves back-to-back. Rows 1-7 repeat this pattern. */
    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;
  }
}
/* Computes a 32x32 block of pixel differences (src - pred) as int16 values.
 * A 32-pixel row spans two 16-byte vectors; each outer iteration processes
 * four rows (8 iterations x 4 rows = 32 rows). Per row, the left and right
 * vector halves are each interleaved with the prediction, horizontally
 * subtracted to 16-bit, and stored at diff and diff + 16 respectively. */
static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *pred, int32_t pred_stride,
                              int16_t *diff, int32_t diff_stride) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  for (loop_cnt = 8; loop_cnt--;) {
    /* Load 4 source rows, two 16-byte vectors per row. */
    LD_SB2(src, 16, src0, src1);
    src += src_stride;
    LD_SB2(src, 16, src2, src3);
    src += src_stride;
    LD_SB2(src, 16, src4, src5);
    src += src_stride;
    LD_SB2(src, 16, src6, src7);
    src += src_stride;

    /* Load the matching 4 prediction rows. */
    LD_SB2(pred, 16, pred0, pred1);
    pred += pred_stride;
    LD_SB2(pred, 16, pred2, pred3);
    pred += pred_stride;
    LD_SB2(pred, 16, pred4, pred5);
    pred += pred_stride;
    LD_SB2(pred, 16, pred6, pred7);
    pred += pred_stride;

    /* Row 0: left half to diff[0..15], right half to diff[16..31]. */
    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    diff += diff_stride;

    /* Rows 1-3 repeat the same left/right pattern. */
    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    diff += diff_stride;
  }
}
/* Computes a 64x64 block of pixel differences (src - pred) as int16 values.
 * A 64-pixel row spans four 16-byte vectors; each outer iteration processes
 * two rows (32 iterations x 2 rows = 64 rows). Per row, the four vector
 * quarters are interleaved with the prediction, horizontally subtracted to
 * 16-bit, and stored at diff, diff + 16, diff + 32, and diff + 48. */
static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *pred, int32_t pred_stride,
                              int16_t *diff, int32_t diff_stride) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  for (loop_cnt = 32; loop_cnt--;) {
    /* Load 2 source rows, four 16-byte vectors per row. */
    LD_SB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_SB4(src, 16, src4, src5, src6, src7);
    src += src_stride;

    /* Load the matching 2 prediction rows. */
    LD_SB4(pred, 16, pred0, pred1, pred2, pred3);
    pred += pred_stride;
    LD_SB4(pred, 16, pred4, pred5, pred6, pred7);
    pred += pred_stride;

    /* First row: four 16-pixel quarters at offsets 0/16/32/48. */
    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 32, 8);
    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 48, 8);
    diff += diff_stride;

    /* Second row: same quarter pattern. */
    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 32, 8);
    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 48, 8);
    diff += diff_stride;
  }
}
/* Public entry point: computes diff = src - pred for a rows x cols block.
 * Square blocks of size 4/8/16/32/64 are dispatched to the MSA-optimized
 * kernels above; every other shape falls back to the portable C version. */
void aom_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr,
                            ptrdiff_t diff_stride, const uint8_t *src_ptr,
                            ptrdiff_t src_stride, const uint8_t *pred_ptr,
                            ptrdiff_t pred_stride) {
  if (rows == cols) {
    if (rows == 4) {
      sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                      diff_stride);
      return;
    }
    if (rows == 8) {
      sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                      diff_stride);
      return;
    }
    if (rows == 16) {
      sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                        diff_stride);
      return;
    }
    if (rows == 32) {
      sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                        diff_stride);
      return;
    }
    if (rows == 64) {
      sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                        diff_stride);
      return;
    }
  }
  /* Non-square or unsupported square size: portable fallback. */
  aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
                       pred_ptr, pred_stride);
}