1 /*
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
#include <assert.h>
#include <stdio.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"
19
20 #if HAVE_DSPR2
convolve_bi_vert_4_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_y,int32_t w,int32_t h)21 static void convolve_bi_vert_4_dspr2(const uint8_t *src,
22 int32_t src_stride,
23 uint8_t *dst,
24 int32_t dst_stride,
25 const int16_t *filter_y,
26 int32_t w,
27 int32_t h) {
28 int32_t x, y;
29 const uint8_t *src_ptr;
30 uint8_t *dst_ptr;
31 uint8_t *cm = vpx_ff_cropTbl;
32 uint32_t vector4a = 64;
33 uint32_t load1, load2;
34 uint32_t p1, p2;
35 uint32_t scratch1;
36 uint32_t store1, store2;
37 int32_t Temp1, Temp2;
38 const int16_t *filter = &filter_y[3];
39 uint32_t filter45;
40
41 filter45 = ((const int32_t *)filter)[0];
42
43 for (y = h; y--;) {
44 /* prefetch data to cache memory */
45 prefetch_store(dst + dst_stride);
46
47 for (x = 0; x < w; x += 4) {
48 src_ptr = src + x;
49 dst_ptr = dst + x;
50
51 __asm__ __volatile__ (
52 "ulw %[load1], 0(%[src_ptr]) \n\t"
53 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
54 "ulw %[load2], 0(%[src_ptr]) \n\t"
55
56 "mtlo %[vector4a], $ac0 \n\t"
57 "mtlo %[vector4a], $ac1 \n\t"
58 "mtlo %[vector4a], $ac2 \n\t"
59 "mtlo %[vector4a], $ac3 \n\t"
60 "mthi $zero, $ac0 \n\t"
61 "mthi $zero, $ac1 \n\t"
62 "mthi $zero, $ac2 \n\t"
63 "mthi $zero, $ac3 \n\t"
64
65 "preceu.ph.qbr %[scratch1], %[load1] \n\t"
66 "preceu.ph.qbr %[p1], %[load2] \n\t"
67
68 "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
69 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
70
71 "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
72 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
73
74 "preceu.ph.qbl %[scratch1], %[load1] \n\t"
75 "preceu.ph.qbl %[p1], %[load2] \n\t"
76
77 "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
78 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
79
80 "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
81 "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
82
83 "extp %[Temp1], $ac0, 31 \n\t"
84 "extp %[Temp2], $ac1, 31 \n\t"
85
86 "lbux %[store1], %[Temp1](%[cm]) \n\t"
87 "extp %[Temp1], $ac2, 31 \n\t"
88
89 "lbux %[store2], %[Temp2](%[cm]) \n\t"
90 "extp %[Temp2], $ac3, 31 \n\t"
91
92 "sb %[store1], 0(%[dst_ptr]) \n\t"
93 "sb %[store2], 1(%[dst_ptr]) \n\t"
94
95 "lbux %[store1], %[Temp1](%[cm]) \n\t"
96 "lbux %[store2], %[Temp2](%[cm]) \n\t"
97
98 "sb %[store1], 2(%[dst_ptr]) \n\t"
99 "sb %[store2], 3(%[dst_ptr]) \n\t"
100
101 : [load1] "=&r" (load1), [load2] "=&r" (load2),
102 [p1] "=&r" (p1), [p2] "=&r" (p2),
103 [scratch1] "=&r" (scratch1),
104 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
105 [store1] "=&r" (store1), [store2] "=&r" (store2),
106 [src_ptr] "+r" (src_ptr)
107 : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
108 [src_stride] "r" (src_stride),
109 [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
110 );
111 }
112
113 /* Next row... */
114 src += src_stride;
115 dst += dst_stride;
116 }
117 }
118
convolve_bi_vert_64_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_y,int32_t h)119 static void convolve_bi_vert_64_dspr2(const uint8_t *src,
120 int32_t src_stride,
121 uint8_t *dst,
122 int32_t dst_stride,
123 const int16_t *filter_y,
124 int32_t h) {
125 int32_t x, y;
126 const uint8_t *src_ptr;
127 uint8_t *dst_ptr;
128 uint8_t *cm = vpx_ff_cropTbl;
129 uint32_t vector4a = 64;
130 uint32_t load1, load2;
131 uint32_t p1, p2;
132 uint32_t scratch1;
133 uint32_t store1, store2;
134 int32_t Temp1, Temp2;
135 const int16_t *filter = &filter_y[3];
136 uint32_t filter45;
137
138 filter45 = ((const int32_t *)filter)[0];
139
140 for (y = h; y--;) {
141 /* prefetch data to cache memory */
142 prefetch_store(dst + dst_stride);
143
144 for (x = 0; x < 64; x += 4) {
145 src_ptr = src + x;
146 dst_ptr = dst + x;
147
148 __asm__ __volatile__ (
149 "ulw %[load1], 0(%[src_ptr]) \n\t"
150 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
151 "ulw %[load2], 0(%[src_ptr]) \n\t"
152
153 "mtlo %[vector4a], $ac0 \n\t"
154 "mtlo %[vector4a], $ac1 \n\t"
155 "mtlo %[vector4a], $ac2 \n\t"
156 "mtlo %[vector4a], $ac3 \n\t"
157 "mthi $zero, $ac0 \n\t"
158 "mthi $zero, $ac1 \n\t"
159 "mthi $zero, $ac2 \n\t"
160 "mthi $zero, $ac3 \n\t"
161
162 "preceu.ph.qbr %[scratch1], %[load1] \n\t"
163 "preceu.ph.qbr %[p1], %[load2] \n\t"
164
165 "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
166 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
167
168 "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
169 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
170
171 "preceu.ph.qbl %[scratch1], %[load1] \n\t"
172 "preceu.ph.qbl %[p1], %[load2] \n\t"
173
174 "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
175 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
176
177 "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
178 "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
179
180 "extp %[Temp1], $ac0, 31 \n\t"
181 "extp %[Temp2], $ac1, 31 \n\t"
182
183 "lbux %[store1], %[Temp1](%[cm]) \n\t"
184 "extp %[Temp1], $ac2, 31 \n\t"
185
186 "lbux %[store2], %[Temp2](%[cm]) \n\t"
187 "extp %[Temp2], $ac3, 31 \n\t"
188
189 "sb %[store1], 0(%[dst_ptr]) \n\t"
190 "sb %[store2], 1(%[dst_ptr]) \n\t"
191
192 "lbux %[store1], %[Temp1](%[cm]) \n\t"
193 "lbux %[store2], %[Temp2](%[cm]) \n\t"
194
195 "sb %[store1], 2(%[dst_ptr]) \n\t"
196 "sb %[store2], 3(%[dst_ptr]) \n\t"
197
198 : [load1] "=&r" (load1), [load2] "=&r" (load2),
199 [p1] "=&r" (p1), [p2] "=&r" (p2),
200 [scratch1] "=&r" (scratch1),
201 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
202 [store1] "=&r" (store1), [store2] "=&r" (store2),
203 [src_ptr] "+r" (src_ptr)
204 : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
205 [src_stride] "r" (src_stride),
206 [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
207 );
208 }
209
210 /* Next row... */
211 src += src_stride;
212 dst += dst_stride;
213 }
214 }
215
/*
 * Entry point for the DSPr2 two-tap vertical convolution.
 *
 * Programs the DSP control extract position, then dispatches on the block
 * width: 4, 8, 16 and 32 go to convolve_bi_vert_4_dspr2(), 64 goes to
 * convolve_bi_vert_64_dspr2(), and every other width is handed to
 * vpx_convolve8_vert_c() together with the untouched filter_x / step
 * arguments. Only unscaled filtering is supported (y_step_q4 must be 16,
 * checked with assert).
 */
void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
  uint32_t extract_pos = 38;

  assert(y_step_q4 == 16);

  /* bit position used when extracting results from the accumulators */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (extract_pos)
  );

  prefetch_store(dst);

  if (w == 4 || w == 8 || w == 16 || w == 32) {
    /* Widths below 64 share the generic four-pixels-per-step helper. */
    convolve_bi_vert_4_dspr2(src, src_stride,
                             dst, dst_stride,
                             filter_y, w, h);
  } else if (w == 64) {
    /* Also prefetch the second half of the first 64-byte destination row. */
    prefetch_store(dst + 32);
    convolve_bi_vert_64_dspr2(src, src_stride,
                              dst, dst_stride,
                              filter_y, h);
  } else {
    /* Any other width falls back to the C implementation. */
    vpx_convolve8_vert_c(src, src_stride,
                         dst, dst_stride,
                         filter_x, x_step_q4,
                         filter_y, y_step_q4,
                         w, h);
  }
}
258 #endif
259