/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <stdio.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/mips/convolve_common_dspr2.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_ports/mem.h"

#if HAVE_DSPR2
convolve_vert_4_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_y,int32_t w,int32_t h)23 static void convolve_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
24                                   uint8_t *dst, int32_t dst_stride,
25                                   const int16_t *filter_y, int32_t w,
26                                   int32_t h) {
27   int32_t x, y;
28   const uint8_t *src_ptr;
29   uint8_t *dst_ptr;
30   uint8_t *cm = aom_ff_cropTbl;
31   uint32_t vector4a = 64;
32   uint32_t load1, load2, load3, load4;
33   uint32_t p1, p2;
34   uint32_t n1, n2;
35   uint32_t scratch1, scratch2;
36   uint32_t store1, store2;
37   int32_t vector1b, vector2b, vector3b, vector4b;
38   int32_t Temp1, Temp2;
39 
40   vector1b = ((const int32_t *)filter_y)[0];
41   vector2b = ((const int32_t *)filter_y)[1];
42   vector3b = ((const int32_t *)filter_y)[2];
43   vector4b = ((const int32_t *)filter_y)[3];
44 
45   src -= 3 * src_stride;
46 
47   for (y = h; y--;) {
48     /* prefetch data to cache memory */
49     prefetch_store(dst + dst_stride);
50 
51     for (x = 0; x < w; x += 4) {
52       src_ptr = src + x;
53       dst_ptr = dst + x;
54 
55       __asm__ __volatile__(
56           "ulw              %[load1],     0(%[src_ptr])                   \n\t"
57           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
58           "ulw              %[load2],     0(%[src_ptr])                   \n\t"
59           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
60           "ulw              %[load3],     0(%[src_ptr])                   \n\t"
61           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
62           "ulw              %[load4],     0(%[src_ptr])                   \n\t"
63 
64           "mtlo             %[vector4a],  $ac0                            \n\t"
65           "mtlo             %[vector4a],  $ac1                            \n\t"
66           "mtlo             %[vector4a],  $ac2                            \n\t"
67           "mtlo             %[vector4a],  $ac3                            \n\t"
68           "mthi             $zero,        $ac0                            \n\t"
69           "mthi             $zero,        $ac1                            \n\t"
70           "mthi             $zero,        $ac2                            \n\t"
71           "mthi             $zero,        $ac3                            \n\t"
72 
73           "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
74           "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
75           "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
76           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
77           "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
78           "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
79           "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
80           "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
81 
82           "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
83           "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
84           "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
85           "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
86 
87           "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
88           "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
89           "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
90           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
91           "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
92           "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
93           "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
94           "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
95 
96           "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
97           "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
98           "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
99           "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
100 
101           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
102           "ulw              %[load1],     0(%[src_ptr])                   \n\t"
103           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
104           "ulw              %[load2],     0(%[src_ptr])                   \n\t"
105           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
106           "ulw              %[load3],     0(%[src_ptr])                   \n\t"
107           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
108           "ulw              %[load4],     0(%[src_ptr])                   \n\t"
109 
110           "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
111           "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
112           "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
113           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
114           "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
115           "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
116           "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
117           "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
118 
119           "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
120           "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
121           "extp             %[Temp1],     $ac0,           31              \n\t"
122           "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
123           "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
124           "extp             %[Temp2],     $ac1,           31              \n\t"
125 
126           "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
127           "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
128           "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
129           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
130           "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
131           "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
132           "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
133           "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
134 
135           "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
136           "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
137           "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
138           "extp             %[Temp1],     $ac2,           31              \n\t"
139 
140           "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
141           "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
142           "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
143           "extp             %[Temp2],     $ac3,           31              \n\t"
144 
145           "sb               %[store1],    0(%[dst_ptr])                   \n\t"
146           "sb               %[store2],    1(%[dst_ptr])                   \n\t"
147 
148           "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
149           "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
150 
151           "sb               %[store1],    2(%[dst_ptr])                   \n\t"
152           "sb               %[store2],    3(%[dst_ptr])                   \n\t"
153 
154           : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
155             [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
156             [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
157             [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
158             [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
159             [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
160           : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
161             [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
162             [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
163             [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
164     }
165 
166     /* Next row... */
167     src += src_stride;
168     dst += dst_stride;
169   }
170 }
convolve_vert_64_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_y,int32_t h)172 static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
173                                    uint8_t *dst, int32_t dst_stride,
174                                    const int16_t *filter_y, int32_t h) {
175   int32_t x, y;
176   const uint8_t *src_ptr;
177   uint8_t *dst_ptr;
178   uint8_t *cm = aom_ff_cropTbl;
179   uint32_t vector4a = 64;
180   uint32_t load1, load2, load3, load4;
181   uint32_t p1, p2;
182   uint32_t n1, n2;
183   uint32_t scratch1, scratch2;
184   uint32_t store1, store2;
185   int32_t vector1b, vector2b, vector3b, vector4b;
186   int32_t Temp1, Temp2;
187 
188   vector1b = ((const int32_t *)filter_y)[0];
189   vector2b = ((const int32_t *)filter_y)[1];
190   vector3b = ((const int32_t *)filter_y)[2];
191   vector4b = ((const int32_t *)filter_y)[3];
192 
193   src -= 3 * src_stride;
194 
195   for (y = h; y--;) {
196     /* prefetch data to cache memory */
197     prefetch_store(dst + dst_stride);
198     prefetch_store(dst + dst_stride + 32);
199 
200     for (x = 0; x < 64; x += 4) {
201       src_ptr = src + x;
202       dst_ptr = dst + x;
203 
204       __asm__ __volatile__(
205           "ulw              %[load1],     0(%[src_ptr])                   \n\t"
206           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
207           "ulw              %[load2],     0(%[src_ptr])                   \n\t"
208           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
209           "ulw              %[load3],     0(%[src_ptr])                   \n\t"
210           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
211           "ulw              %[load4],     0(%[src_ptr])                   \n\t"
212 
213           "mtlo             %[vector4a],  $ac0                            \n\t"
214           "mtlo             %[vector4a],  $ac1                            \n\t"
215           "mtlo             %[vector4a],  $ac2                            \n\t"
216           "mtlo             %[vector4a],  $ac3                            \n\t"
217           "mthi             $zero,        $ac0                            \n\t"
218           "mthi             $zero,        $ac1                            \n\t"
219           "mthi             $zero,        $ac2                            \n\t"
220           "mthi             $zero,        $ac3                            \n\t"
221 
222           "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
223           "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
224           "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
225           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
226           "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
227           "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
228           "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
229           "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
230 
231           "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
232           "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
233           "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
234           "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
235 
236           "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
237           "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
238           "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
239           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
240           "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
241           "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
242           "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
243           "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
244 
245           "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
246           "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
247           "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
248           "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
249 
250           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
251           "ulw              %[load1],     0(%[src_ptr])                   \n\t"
252           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
253           "ulw              %[load2],     0(%[src_ptr])                   \n\t"
254           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
255           "ulw              %[load3],     0(%[src_ptr])                   \n\t"
256           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
257           "ulw              %[load4],     0(%[src_ptr])                   \n\t"
258 
259           "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
260           "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
261           "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
262           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
263           "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
264           "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
265           "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
266           "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
267 
268           "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
269           "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
270           "extp             %[Temp1],     $ac0,           31              \n\t"
271           "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
272           "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
273           "extp             %[Temp2],     $ac1,           31              \n\t"
274 
275           "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
276           "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
277           "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
278           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
279           "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
280           "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
281           "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
282           "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
283 
284           "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
285           "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
286           "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
287           "extp             %[Temp1],     $ac2,           31              \n\t"
288 
289           "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
290           "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
291           "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
292           "extp             %[Temp2],     $ac3,           31              \n\t"
293 
294           "sb               %[store1],    0(%[dst_ptr])                   \n\t"
295           "sb               %[store2],    1(%[dst_ptr])                   \n\t"
296 
297           "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
298           "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
299 
300           "sb               %[store1],    2(%[dst_ptr])                   \n\t"
301           "sb               %[store2],    3(%[dst_ptr])                   \n\t"
302 
303           : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
304             [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
305             [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
306             [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
307             [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
308             [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
309           : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
310             [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
311             [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
312             [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
313     }
314 
315     /* Next row... */
316     src += src_stride;
317     dst += dst_stride;
318   }
319 }
/*
 * Public DSPR2 entry point for 8-tap vertical convolution.
 *
 * Requires an unscaled filter step (y_step_q4 == 16).  If the first pair
 * of taps packs to zero, the cheaper 2-tap path is taken; otherwise the
 * DSP extract position is programmed via wrdsp and the work is dispatched
 * by width: 4/8/16/32 -> convolve_vert_4_dspr2, 64 -> the specialized
 * convolve_vert_64_dspr2, anything else -> the C reference implementation.
 * filter_x/x_step_q4 are only forwarded to the fallback paths.
 */
void aom_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4, int w,
                              int h) {
  assert(y_step_q4 == 16);
  assert(((const int32_t *)filter_y)[1] != 0x800000);

  if (((const int32_t *)filter_y)[0] == 0) {
    /* First tap pair is zero: the 2-tap (bilinear-style) path suffices. */
    aom_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
                             x_step_q4, filter_y, y_step_q4, w, h);
  } else {
    uint32_t pos = 38;

    /* bit position for extract from acc */
    __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                         :
                         : [pos] "r"(pos));

    prefetch_store(dst);

    switch (w) {
      case 4:
      case 8:
      case 16:
      case 32:
        convolve_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h);
        break;
      case 64:
        prefetch_store(dst + 32);
        convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
        break;
      default:
        aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
                             x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  }
}

#endif