• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include "config/aom_dsp_rtcd.h"
13 
14 #include "aom_dsp/mips/macros_msa.h"
15 
/* Compute the 4x4 residual block diff = src - pred using MIPS MSA.
 *
 * src_ptr/src_stride:   source pixels, 4 bytes per row.
 * pred_ptr/pred_stride: prediction pixels, 4 bytes per row.
 * diff_ptr/diff_stride: output signed 16-bit residuals, 4 per row.
 */
static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
                            const uint8_t *pred_ptr, int32_t pred_stride,
                            int16_t *diff_ptr, int32_t diff_stride) {
  uint32_t src0, src1, src2, src3;
  uint32_t pred0, pred1, pred2, pred3;
  v16i8 src = { 0 };
  v16i8 pred = { 0 };
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  /* Load four 4-byte rows of source and prediction as scalar words. */
  LW4(src_ptr, src_stride, src0, src1, src2, src3);
  LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3);
  /* Pack the four words of each block into one 128-bit vector. */
  INSERT_W4_SB(src0, src1, src2, src3, src);
  INSERT_W4_SB(pred0, pred1, pred2, pred3, pred);
  /* Interleave src/pred bytes (right then left halves) so each adjacent
     byte pair is (src, pred), ready for a horizontal subtract. */
  ILVRL_B2_UB(src, pred, src_l0, src_l1);
  /* Horizontal unsigned-byte subtract, widening to signed 16-bit lanes. */
  HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  /* Store as four 8-byte (4 x int16) rows. NOTE(review): the stride is
     doubled here, presumably because the store macro steps in byte units
     while diff_stride counts int16 elements — confirm against
     macros_msa.h. */
  ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride));
}
34 
/* Compute the 8x8 residual block diff = src - pred using MIPS MSA.
 * Processes two 8-byte rows per loop iteration, four iterations total.
 *
 * diff_stride is in int16 elements.
 */
static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                            const uint8_t *pred_ptr, int32_t pred_stride,
                            int16_t *diff_ptr, int32_t diff_stride) {
  uint32_t loop_cnt;
  uint64_t src0, src1, pred0, pred1;
  v16i8 src = { 0 };
  v16i8 pred = { 0 };
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  for (loop_cnt = 4; loop_cnt--;) {
    /* Load two 8-byte rows of source and of prediction. */
    LD2(src_ptr, src_stride, src0, src1);
    src_ptr += (2 * src_stride);
    LD2(pred_ptr, pred_stride, pred0, pred1);
    pred_ptr += (2 * pred_stride);

    /* Pack the two 64-bit rows of each block into one 128-bit vector. */
    INSERT_D2_SB(src0, src1, src);
    INSERT_D2_SB(pred0, pred1, pred);
    /* Interleave bytes and horizontally subtract into 16-bit residuals;
       diff0 holds the first row, diff1 the second. */
    ILVRL_B2_UB(src, pred, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    /* Store the two 8-element int16 rows, then advance two output rows. */
    ST_SH2(diff0, diff1, diff_ptr, diff_stride);
    diff_ptr += (2 * diff_stride);
  }
}
59 
/* Compute the 16x16 residual block diff = src - pred using MIPS MSA.
 * Each outer iteration handles 8 full rows (one 16-byte vector per row);
 * two iterations cover all 16 rows. The stanza repeated for src0..src7
 * is identical per row: interleave src/pred bytes, horizontal subtract
 * into two 8-lane int16 vectors, store 16 residuals contiguously
 * (inner offset 8), then advance one output row.
 */
static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *pred, int32_t pred_stride,
                              int16_t *diff, int32_t diff_stride) {
  int8_t count;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  for (count = 2; count--;) {
    /* Load 8 rows of 16 source bytes and 16 prediction bytes. */
    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);

    LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6,
           pred7);
    pred += (8 * pred_stride);

    /* Row 0 */
    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    /* Row 1 */
    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    /* Row 2 */
    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    /* Row 3 */
    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    /* Row 4 */
    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    /* Row 5 */
    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    /* Row 6 */
    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    /* Row 7 */
    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;
  }
}
118 
/* Compute the 32x32 residual block diff = src - pred using MIPS MSA.
 * Each row is 32 bytes wide, i.e. two 16-byte vectors (offset 0 and 16).
 * Each outer iteration handles 4 rows; 8 iterations cover all 32 rows.
 * Per row: the low 16 columns are processed from srcN/predN (even index)
 * and the high 16 columns from srcN+1/predN+1 (odd index), stored at
 * diff and diff + 16 respectively.
 */
static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *pred, int32_t pred_stride,
                              int16_t *diff, int32_t diff_stride) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  for (loop_cnt = 8; loop_cnt--;) {
    /* Load 4 rows of source, two 16-byte vectors per row. */
    LD_SB2(src, 16, src0, src1);
    src += src_stride;
    LD_SB2(src, 16, src2, src3);
    src += src_stride;
    LD_SB2(src, 16, src4, src5);
    src += src_stride;
    LD_SB2(src, 16, src6, src7);
    src += src_stride;

    /* Load the matching 4 rows of prediction. */
    LD_SB2(pred, 16, pred0, pred1);
    pred += pred_stride;
    LD_SB2(pred, 16, pred2, pred3);
    pred += pred_stride;
    LD_SB2(pred, 16, pred4, pred5);
    pred += pred_stride;
    LD_SB2(pred, 16, pred6, pred7);
    pred += pred_stride;

    /* Row 0: columns 0-15, then columns 16-31. */
    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    diff += diff_stride;

    /* Row 1 */
    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    diff += diff_stride;

    /* Row 2 */
    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    diff += diff_stride;

    /* Row 3 */
    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    diff += diff_stride;
  }
}
180 
/* Compute the 64x64 residual block diff = src - pred using MIPS MSA.
 * Each row is 64 bytes wide, i.e. four 16-byte vectors (offsets 0, 16,
 * 32, 48). Each outer iteration handles 2 rows; 32 iterations cover all
 * 64 rows. src0..src3/pred0..pred3 form the first row of the pair,
 * src4..src7/pred4..pred7 the second.
 */
static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *pred, int32_t pred_stride,
                              int16_t *diff, int32_t diff_stride) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  for (loop_cnt = 32; loop_cnt--;) {
    /* Load 2 rows of source, four 16-byte vectors per row. */
    LD_SB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_SB4(src, 16, src4, src5, src6, src7);
    src += src_stride;

    /* Load the matching 2 rows of prediction. */
    LD_SB4(pred, 16, pred0, pred1, pred2, pred3);
    pred += pred_stride;
    LD_SB4(pred, 16, pred4, pred5, pred6, pred7);
    pred += pred_stride;

    /* First row: four 16-column segments at offsets 0, 16, 32, 48. */
    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 32, 8);
    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 48, 8);
    diff += diff_stride;

    /* Second row: same four segments. */
    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 32, 8);
    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 48, 8);
    diff += diff_stride;
  }
}
230 
/* Compute the residual diff = src - pred for a rows x cols block.
 *
 * Square blocks of size 4, 8, 16, 32 and 64 are dispatched to the
 * dedicated MSA kernels above; every other geometry falls back to the
 * portable C implementation. The signature mirrors aom_subtract_block_c.
 *
 * rows, cols:            block dimensions in pixels.
 * diff_ptr, diff_stride: int16 residual output and its stride.
 * src_ptr, src_stride:   source pixels and stride.
 * pred_ptr, pred_stride: prediction pixels and stride.
 */
void aom_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr,
                            ptrdiff_t diff_stride, const uint8_t *src_ptr,
                            ptrdiff_t src_stride, const uint8_t *pred_ptr,
                            ptrdiff_t pred_stride) {
  if (rows == cols) {
    /* Square block: pick the matching fixed-size MSA kernel. */
    if (rows == 4) {
      sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                      diff_stride);
      return;
    }
    if (rows == 8) {
      sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                      diff_stride);
      return;
    }
    if (rows == 16) {
      sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                        diff_stride);
      return;
    }
    if (rows == 32) {
      sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                        diff_stride);
      return;
    }
    if (rows == 64) {
      sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                        diff_stride);
      return;
    }
  }
  /* Non-square or unsupported square size: use the C reference path. */
  aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
                       pred_ptr, pred_stride);
}
267