1 /*
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <stdio.h>
13
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/mips/convolve_common_dspr2.h"
16 #include "vpx_dsp/vpx_dsp_common.h"
17 #include "vpx_dsp/vpx_filter.h"
18 #include "vpx_ports/mem.h"
19
20 #if HAVE_DSPR2
convolve_vert_4_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_y,int32_t w,int32_t h)21 static void convolve_vert_4_dspr2(const uint8_t *src,
22 int32_t src_stride,
23 uint8_t *dst,
24 int32_t dst_stride,
25 const int16_t *filter_y,
26 int32_t w,
27 int32_t h) {
28 int32_t x, y;
29 const uint8_t *src_ptr;
30 uint8_t *dst_ptr;
31 uint8_t *cm = vpx_ff_cropTbl;
32 uint32_t vector4a = 64;
33 uint32_t load1, load2, load3, load4;
34 uint32_t p1, p2;
35 uint32_t n1, n2;
36 uint32_t scratch1, scratch2;
37 uint32_t store1, store2;
38 int32_t vector1b, vector2b, vector3b, vector4b;
39 int32_t Temp1, Temp2;
40
41 vector1b = ((const int32_t *)filter_y)[0];
42 vector2b = ((const int32_t *)filter_y)[1];
43 vector3b = ((const int32_t *)filter_y)[2];
44 vector4b = ((const int32_t *)filter_y)[3];
45
46 src -= 3 * src_stride;
47
48 for (y = h; y--;) {
49 /* prefetch data to cache memory */
50 prefetch_store(dst + dst_stride);
51
52 for (x = 0; x < w; x += 4) {
53 src_ptr = src + x;
54 dst_ptr = dst + x;
55
56 __asm__ __volatile__ (
57 "ulw %[load1], 0(%[src_ptr]) \n\t"
58 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
59 "ulw %[load2], 0(%[src_ptr]) \n\t"
60 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
61 "ulw %[load3], 0(%[src_ptr]) \n\t"
62 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
63 "ulw %[load4], 0(%[src_ptr]) \n\t"
64
65 "mtlo %[vector4a], $ac0 \n\t"
66 "mtlo %[vector4a], $ac1 \n\t"
67 "mtlo %[vector4a], $ac2 \n\t"
68 "mtlo %[vector4a], $ac3 \n\t"
69 "mthi $zero, $ac0 \n\t"
70 "mthi $zero, $ac1 \n\t"
71 "mthi $zero, $ac2 \n\t"
72 "mthi $zero, $ac3 \n\t"
73
74 "preceu.ph.qbr %[scratch1], %[load1] \n\t"
75 "preceu.ph.qbr %[p1], %[load2] \n\t"
76 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
77 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
78 "preceu.ph.qbr %[scratch2], %[load3] \n\t"
79 "preceu.ph.qbr %[p2], %[load4] \n\t"
80 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
81 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
82
83 "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
84 "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
85 "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
86 "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
87
88 "preceu.ph.qbl %[scratch1], %[load1] \n\t"
89 "preceu.ph.qbl %[p1], %[load2] \n\t"
90 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
91 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
92 "preceu.ph.qbl %[scratch2], %[load3] \n\t"
93 "preceu.ph.qbl %[p2], %[load4] \n\t"
94 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
95 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
96
97 "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
98 "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
99 "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
100 "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
101
102 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
103 "ulw %[load1], 0(%[src_ptr]) \n\t"
104 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
105 "ulw %[load2], 0(%[src_ptr]) \n\t"
106 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
107 "ulw %[load3], 0(%[src_ptr]) \n\t"
108 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
109 "ulw %[load4], 0(%[src_ptr]) \n\t"
110
111 "preceu.ph.qbr %[scratch1], %[load1] \n\t"
112 "preceu.ph.qbr %[p1], %[load2] \n\t"
113 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
114 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
115 "preceu.ph.qbr %[scratch2], %[load3] \n\t"
116 "preceu.ph.qbr %[p2], %[load4] \n\t"
117 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
118 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
119
120 "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
121 "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
122 "extp %[Temp1], $ac0, 31 \n\t"
123 "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
124 "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
125 "extp %[Temp2], $ac1, 31 \n\t"
126
127 "preceu.ph.qbl %[scratch1], %[load1] \n\t"
128 "preceu.ph.qbl %[p1], %[load2] \n\t"
129 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
130 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
131 "preceu.ph.qbl %[scratch2], %[load3] \n\t"
132 "preceu.ph.qbl %[p2], %[load4] \n\t"
133 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
134 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
135
136 "lbux %[store1], %[Temp1](%[cm]) \n\t"
137 "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
138 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
139 "extp %[Temp1], $ac2, 31 \n\t"
140
141 "lbux %[store2], %[Temp2](%[cm]) \n\t"
142 "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
143 "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
144 "extp %[Temp2], $ac3, 31 \n\t"
145
146 "sb %[store1], 0(%[dst_ptr]) \n\t"
147 "sb %[store2], 1(%[dst_ptr]) \n\t"
148
149 "lbux %[store1], %[Temp1](%[cm]) \n\t"
150 "lbux %[store2], %[Temp2](%[cm]) \n\t"
151
152 "sb %[store1], 2(%[dst_ptr]) \n\t"
153 "sb %[store2], 3(%[dst_ptr]) \n\t"
154
155 : [load1] "=&r" (load1), [load2] "=&r" (load2),
156 [load3] "=&r" (load3), [load4] "=&r" (load4),
157 [p1] "=&r" (p1), [p2] "=&r" (p2),
158 [n1] "=&r" (n1), [n2] "=&r" (n2),
159 [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
160 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
161 [store1] "=&r" (store1), [store2] "=&r" (store2),
162 [src_ptr] "+r" (src_ptr)
163 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
164 [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
165 [vector4a] "r" (vector4a), [src_stride] "r" (src_stride),
166 [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
167 );
168 }
169
170 /* Next row... */
171 src += src_stride;
172 dst += dst_stride;
173 }
174 }
175
convolve_vert_64_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_y,int32_t h)176 static void convolve_vert_64_dspr2(const uint8_t *src,
177 int32_t src_stride,
178 uint8_t *dst,
179 int32_t dst_stride,
180 const int16_t *filter_y,
181 int32_t h) {
182 int32_t x, y;
183 const uint8_t *src_ptr;
184 uint8_t *dst_ptr;
185 uint8_t *cm = vpx_ff_cropTbl;
186 uint32_t vector4a = 64;
187 uint32_t load1, load2, load3, load4;
188 uint32_t p1, p2;
189 uint32_t n1, n2;
190 uint32_t scratch1, scratch2;
191 uint32_t store1, store2;
192 int32_t vector1b, vector2b, vector3b, vector4b;
193 int32_t Temp1, Temp2;
194
195 vector1b = ((const int32_t *)filter_y)[0];
196 vector2b = ((const int32_t *)filter_y)[1];
197 vector3b = ((const int32_t *)filter_y)[2];
198 vector4b = ((const int32_t *)filter_y)[3];
199
200 src -= 3 * src_stride;
201
202 for (y = h; y--;) {
203 /* prefetch data to cache memory */
204 prefetch_store(dst + dst_stride);
205 prefetch_store(dst + dst_stride + 32);
206
207 for (x = 0; x < 64; x += 4) {
208 src_ptr = src + x;
209 dst_ptr = dst + x;
210
211 __asm__ __volatile__ (
212 "ulw %[load1], 0(%[src_ptr]) \n\t"
213 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
214 "ulw %[load2], 0(%[src_ptr]) \n\t"
215 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
216 "ulw %[load3], 0(%[src_ptr]) \n\t"
217 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
218 "ulw %[load4], 0(%[src_ptr]) \n\t"
219
220 "mtlo %[vector4a], $ac0 \n\t"
221 "mtlo %[vector4a], $ac1 \n\t"
222 "mtlo %[vector4a], $ac2 \n\t"
223 "mtlo %[vector4a], $ac3 \n\t"
224 "mthi $zero, $ac0 \n\t"
225 "mthi $zero, $ac1 \n\t"
226 "mthi $zero, $ac2 \n\t"
227 "mthi $zero, $ac3 \n\t"
228
229 "preceu.ph.qbr %[scratch1], %[load1] \n\t"
230 "preceu.ph.qbr %[p1], %[load2] \n\t"
231 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
232 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
233 "preceu.ph.qbr %[scratch2], %[load3] \n\t"
234 "preceu.ph.qbr %[p2], %[load4] \n\t"
235 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
236 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
237
238 "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
239 "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
240 "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
241 "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
242
243 "preceu.ph.qbl %[scratch1], %[load1] \n\t"
244 "preceu.ph.qbl %[p1], %[load2] \n\t"
245 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
246 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
247 "preceu.ph.qbl %[scratch2], %[load3] \n\t"
248 "preceu.ph.qbl %[p2], %[load4] \n\t"
249 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
250 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
251
252 "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
253 "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
254 "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
255 "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
256
257 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
258 "ulw %[load1], 0(%[src_ptr]) \n\t"
259 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
260 "ulw %[load2], 0(%[src_ptr]) \n\t"
261 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
262 "ulw %[load3], 0(%[src_ptr]) \n\t"
263 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
264 "ulw %[load4], 0(%[src_ptr]) \n\t"
265
266 "preceu.ph.qbr %[scratch1], %[load1] \n\t"
267 "preceu.ph.qbr %[p1], %[load2] \n\t"
268 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
269 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
270 "preceu.ph.qbr %[scratch2], %[load3] \n\t"
271 "preceu.ph.qbr %[p2], %[load4] \n\t"
272 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
273 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
274
275 "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
276 "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
277 "extp %[Temp1], $ac0, 31 \n\t"
278 "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
279 "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
280 "extp %[Temp2], $ac1, 31 \n\t"
281
282 "preceu.ph.qbl %[scratch1], %[load1] \n\t"
283 "preceu.ph.qbl %[p1], %[load2] \n\t"
284 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
285 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
286 "preceu.ph.qbl %[scratch2], %[load3] \n\t"
287 "preceu.ph.qbl %[p2], %[load4] \n\t"
288 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
289 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
290
291 "lbux %[store1], %[Temp1](%[cm]) \n\t"
292 "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
293 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
294 "extp %[Temp1], $ac2, 31 \n\t"
295
296 "lbux %[store2], %[Temp2](%[cm]) \n\t"
297 "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
298 "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
299 "extp %[Temp2], $ac3, 31 \n\t"
300
301 "sb %[store1], 0(%[dst_ptr]) \n\t"
302 "sb %[store2], 1(%[dst_ptr]) \n\t"
303
304 "lbux %[store1], %[Temp1](%[cm]) \n\t"
305 "lbux %[store2], %[Temp2](%[cm]) \n\t"
306
307 "sb %[store1], 2(%[dst_ptr]) \n\t"
308 "sb %[store2], 3(%[dst_ptr]) \n\t"
309
310 : [load1] "=&r" (load1), [load2] "=&r" (load2),
311 [load3] "=&r" (load3), [load4] "=&r" (load4),
312 [p1] "=&r" (p1), [p2] "=&r" (p2),
313 [n1] "=&r" (n1), [n2] "=&r" (n2),
314 [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
315 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
316 [store1] "=&r" (store1), [store2] "=&r" (store2),
317 [src_ptr] "+r" (src_ptr)
318 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
319 [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
320 [vector4a] "r" (vector4a), [src_stride] "r" (src_stride),
321 [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
322 );
323 }
324
325 /* Next row... */
326 src += src_stride;
327 dst += dst_stride;
328 }
329 }
330
/*
 * Vertical convolution entry point for MIPS DSPr2.
 *
 * Only unscaled filtering is handled (y_step_q4 must be 16). When the first
 * packed pair of vertical taps is zero the work is handed to
 * vpx_convolve2_vert_dspr2(); otherwise widths 4, 8, 16 and 32 go to
 * convolve_vert_4_dspr2(), width 64 goes to convolve_vert_64_dspr2(), and any
 * other width falls back to vpx_convolve8_vert_c().
 *
 * filter_x and x_step_q4 are not used here beyond being forwarded to the
 * routines that take them.
 */
void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4, int w,
                              int h) {
  /* The eight 16-bit taps viewed as four 32-bit words of two taps each. */
  const int32_t *const packed_taps = (const int32_t *)filter_y;
  /* Value written to the DSPControl pos field for the extp instructions. */
  const uint32_t extract_pos = 38;

  assert(y_step_q4 == 16);
  assert(packed_taps[1] != 0x800000);

  if (packed_taps[0] == 0) {
    vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
                             x_step_q4, filter_y, y_step_q4, w, h);
    return;
  }

  /* bit position for extract from acc */
  __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(extract_pos));

  prefetch_store(dst);

  if (w == 64) {
    prefetch_store(dst + 32);
    convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
  } else if (w == 4 || w == 8 || w == 16 || w == 32) {
    convolve_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h);
  } else {
    vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                         filter_y, y_step_q4, w, h);
  }
}
382
383 #endif
384