/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <stdio.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/mips/convolve_common_dspr2.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_ports/mem.h"

#if HAVE_DSPR2
/*
 * Straight block copy (no filtering) of a w x h region from src to dst,
 * specialized for the MIPS DSPr2 pipeline.
 *
 * The filter arguments exist only to match the aom_convolve_* prototype;
 * a "copy" convolution ignores them, so they are explicitly voided below.
 *
 * Widths 4/8/16/32/64 are handled with unrolled inline assembly: rows are
 * read with ulw (unaligned word load, since src may be unaligned) and
 * written with sw. NOTE(review): the sw stores assume dst is 4-byte
 * aligned, as is conventional for frame buffers in this codebase — confirm
 * against callers if dst alignment is ever relaxed. Any other width falls
 * back to a plain scalar byte copy.
 *
 * Parameters:
 *   src, src_stride  source pixels and row pitch in bytes
 *   dst, dst_stride  destination pixels and row pitch in bytes
 *   filter_x, filter_x_stride, filter_y, filter_y_stride  unused
 *   w, h             block width and height in pixels
 */
void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int filter_x_stride,
                             const int16_t *filter_y, int filter_y_stride,
                             int w, int h) {
  int x, y;

  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4: {
      uint32_t tp1;

      /* 1 word storage */
      for (y = h; y--;) {
        /* prefetch the next row while copying the current one */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         (%[src])      \n\t"
            "sw               %[tp1],         (%[dst])      \n\t" /* store */

            : [tp1] "=&r"(tp1)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 8: {
      uint32_t tp1, tp2;

      /* 2 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 16: {
      uint32_t tp1, tp2, tp3, tp4;

      /* 4 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 32: {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* 8 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
            "sw               %[tp5],         16(%[dst])     \n\t" /* store */
            "sw               %[tp6],         20(%[dst])     \n\t" /* store */
            "sw               %[tp7],         24(%[dst])     \n\t" /* store */
            "sw               %[tp8],         28(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 64: {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* 64-byte rows span two cache lines; prefetch the second half too */
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      /* 16 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_load(src + src_stride + 64);
        prefetch_store(dst + dst_stride);
        prefetch_store(dst + dst_stride + 32);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
            "sw               %[tp5],         16(%[dst])     \n\t" /* store */
            "sw               %[tp6],         20(%[dst])     \n\t" /* store */
            "sw               %[tp7],         24(%[dst])     \n\t" /* store */
            "sw               %[tp8],         28(%[dst])     \n\t" /* store */

            "ulw              %[tp1],         32(%[src])     \n\t"
            "ulw              %[tp2],         36(%[src])     \n\t"
            "ulw              %[tp3],         40(%[src])     \n\t"
            "ulw              %[tp4],         44(%[src])     \n\t"
            "ulw              %[tp5],         48(%[src])     \n\t"
            "ulw              %[tp6],         52(%[src])     \n\t"
            "ulw              %[tp7],         56(%[src])     \n\t"
            "ulw              %[tp8],         60(%[src])     \n\t"

            "sw               %[tp1],         32(%[dst])     \n\t" /* store */
            "sw               %[tp2],         36(%[dst])     \n\t" /* store */
            "sw               %[tp3],         40(%[dst])     \n\t" /* store */
            "sw               %[tp4],         44(%[dst])     \n\t" /* store */
            "sw               %[tp5],         48(%[dst])     \n\t" /* store */
            "sw               %[tp6],         52(%[dst])     \n\t" /* store */
            "sw               %[tp7],         56(%[dst])     \n\t" /* store */
            "sw               %[tp8],         60(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    default:
      /* Non-specialized widths: plain scalar copy, one byte at a time. */
      for (y = h; y--;) {
        for (x = 0; x < w; ++x) {
          dst[x] = src[x];
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
#endif