1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <assert.h>
13 #include <stdio.h>
14
15 #include "config/aom_dsp_rtcd.h"
16
17 #include "aom_dsp/mips/convolve_common_dspr2.h"
18 #include "aom_dsp/aom_dsp_common.h"
19 #include "aom_dsp/aom_filter.h"
20 #include "aom_ports/mem.h"
21
22 #if HAVE_DSPR2
/*
 * Copy a w x h block of bytes from src to dst, row by row.
 *
 * src, src_stride: source pointer and the byte distance between source rows.
 * dst, dst_stride: destination pointer and the byte distance between
 *                  destination rows.
 * filter_x, filter_x_stride, filter_y, filter_y_stride: unused; they are
 *                  accepted only so the signature matches the other convolve
 *                  entry points.
 * w, h:            block width and height in bytes / rows.
 *
 * Widths 4, 8, 16, 32 and 64 are copied with inline assembly: each row is
 * read with unaligned word loads (ulw) and written with word stores (sw).
 * Any other width falls back to a plain byte loop.
 *
 * NOTE(review): the fast paths store with `sw`, so dst (and dst_stride) are
 * presumably expected to be 4-byte aligned by the callers -- confirm.
 *
 * Every asm statement carries a "memory" clobber: the assembly reads *src and
 * writes *dst, but only the pointer values are passed as operands, so the
 * compiler must be told explicitly that memory is accessed.
 */
void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int filter_x_stride,
                             const int16_t *filter_y, int filter_y_stride,
                             int w, int h) {
  int x, y;

  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4: {
      uint32_t tp1;

      /* 1 word storage */
      for (y = h; y--;) {
        /* Prefetch the next row while the current one is copied. */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         (%[src])      \n\t"
            "sw               %[tp1],         (%[dst])      \n\t" /* store */

            : [tp1] "=&r"(tp1)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 8: {
      uint32_t tp1, tp2;

      /* 2 word storage */
      for (y = h; y--;) {
        /* Prefetch the next row while the current one is copied. */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 16: {
      uint32_t tp1, tp2, tp3, tp4;

      /* 4 word storage */
      for (y = h; y--;) {
        /* Prefetch the next row while the current one is copied. */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 32: {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* 8 word storage */
      for (y = h; y--;) {
        /* Prefetch the next row while the current one is copied. */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
            "sw               %[tp5],         16(%[dst])     \n\t" /* store */
            "sw               %[tp6],         20(%[dst])     \n\t" /* store */
            "sw               %[tp7],         24(%[dst])     \n\t" /* store */
            "sw               %[tp8],         28(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 64: {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* A 64-byte row reaches further than the prefetches issued above. */
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      /* 16 word storage */
      for (y = h; y--;) {
        /* Prefetch the next row while the current one is copied. */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_load(src + src_stride + 64);
        prefetch_store(dst + dst_stride);
        prefetch_store(dst + dst_stride + 32);

        /* The row is copied as two 32-byte halves; the eight temporaries are
         * reused for the second half once the first has been stored. */
        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
            "sw               %[tp5],         16(%[dst])     \n\t" /* store */
            "sw               %[tp6],         20(%[dst])     \n\t" /* store */
            "sw               %[tp7],         24(%[dst])     \n\t" /* store */
            "sw               %[tp8],         28(%[dst])     \n\t" /* store */

            "ulw              %[tp1],         32(%[src])     \n\t"
            "ulw              %[tp2],         36(%[src])     \n\t"
            "ulw              %[tp3],         40(%[src])     \n\t"
            "ulw              %[tp4],         44(%[src])     \n\t"
            "ulw              %[tp5],         48(%[src])     \n\t"
            "ulw              %[tp6],         52(%[src])     \n\t"
            "ulw              %[tp7],         56(%[src])     \n\t"
            "ulw              %[tp8],         60(%[src])     \n\t"

            "sw               %[tp1],         32(%[dst])     \n\t" /* store */
            "sw               %[tp2],         36(%[dst])     \n\t" /* store */
            "sw               %[tp3],         40(%[dst])     \n\t" /* store */
            "sw               %[tp4],         44(%[dst])     \n\t" /* store */
            "sw               %[tp5],         48(%[dst])     \n\t" /* store */
            "sw               %[tp6],         52(%[dst])     \n\t" /* store */
            "sw               %[tp7],         56(%[dst])     \n\t" /* store */
            "sw               %[tp8],         60(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    default:
      /* Generic width: byte-wise copy of each row. */
      for (y = h; y--;) {
        for (x = 0; x < w; ++x) {
          dst[x] = src[x];
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
222 #endif
223