1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
#include <assert.h>
#include <stdio.h>
#include <string.h>
14
15 #include "config/aom_dsp_rtcd.h"
16
17 #include "aom_dsp/mips/convolve_common_dspr2.h"
18 #include "aom_dsp/aom_dsp_common.h"
19 #include "aom_ports/mem.h"
20
21 #if HAVE_DSPR2
convolve_bi_vert_4_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_y,int32_t w,int32_t h)22 static void convolve_bi_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
23 uint8_t *dst, int32_t dst_stride,
24 const int16_t *filter_y, int32_t w,
25 int32_t h) {
26 int32_t x, y;
27 const uint8_t *src_ptr;
28 uint8_t *dst_ptr;
29 uint8_t *cm = aom_ff_cropTbl;
30 uint32_t vector4a = 64;
31 uint32_t load1, load2;
32 uint32_t p1, p2;
33 uint32_t scratch1;
34 uint32_t store1, store2;
35 int32_t Temp1, Temp2;
36 const int16_t *filter = &filter_y[3];
37 uint32_t filter45;
38
39 filter45 = ((const int32_t *)filter)[0];
40
41 for (y = h; y--;) {
42 /* prefetch data to cache memory */
43 prefetch_store(dst + dst_stride);
44
45 for (x = 0; x < w; x += 4) {
46 src_ptr = src + x;
47 dst_ptr = dst + x;
48
49 __asm__ __volatile__(
50 "ulw %[load1], 0(%[src_ptr]) \n\t"
51 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
52 "ulw %[load2], 0(%[src_ptr]) \n\t"
53
54 "mtlo %[vector4a], $ac0 \n\t"
55 "mtlo %[vector4a], $ac1 \n\t"
56 "mtlo %[vector4a], $ac2 \n\t"
57 "mtlo %[vector4a], $ac3 \n\t"
58 "mthi $zero, $ac0 \n\t"
59 "mthi $zero, $ac1 \n\t"
60 "mthi $zero, $ac2 \n\t"
61 "mthi $zero, $ac3 \n\t"
62
63 "preceu.ph.qbr %[scratch1], %[load1] \n\t"
64 "preceu.ph.qbr %[p1], %[load2] \n\t"
65
66 "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
67 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
68
69 "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
70 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
71
72 "preceu.ph.qbl %[scratch1], %[load1] \n\t"
73 "preceu.ph.qbl %[p1], %[load2] \n\t"
74
75 "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
76 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
77
78 "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
79 "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
80
81 "extp %[Temp1], $ac0, 31 \n\t"
82 "extp %[Temp2], $ac1, 31 \n\t"
83
84 "lbux %[store1], %[Temp1](%[cm]) \n\t"
85 "extp %[Temp1], $ac2, 31 \n\t"
86
87 "lbux %[store2], %[Temp2](%[cm]) \n\t"
88 "extp %[Temp2], $ac3, 31 \n\t"
89
90 "sb %[store1], 0(%[dst_ptr]) \n\t"
91 "sb %[store2], 1(%[dst_ptr]) \n\t"
92
93 "lbux %[store1], %[Temp1](%[cm]) \n\t"
94 "lbux %[store2], %[Temp2](%[cm]) \n\t"
95
96 "sb %[store1], 2(%[dst_ptr]) \n\t"
97 "sb %[store2], 3(%[dst_ptr]) \n\t"
98
99 : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
100 [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
101 [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
102 [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
103 : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
104 [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
105 }
106
107 /* Next row... */
108 src += src_stride;
109 dst += dst_stride;
110 }
111 }
112
convolve_bi_vert_64_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_y,int32_t h)113 static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
114 uint8_t *dst, int32_t dst_stride,
115 const int16_t *filter_y, int32_t h) {
116 int32_t x, y;
117 const uint8_t *src_ptr;
118 uint8_t *dst_ptr;
119 uint8_t *cm = aom_ff_cropTbl;
120 uint32_t vector4a = 64;
121 uint32_t load1, load2;
122 uint32_t p1, p2;
123 uint32_t scratch1;
124 uint32_t store1, store2;
125 int32_t Temp1, Temp2;
126 const int16_t *filter = &filter_y[3];
127 uint32_t filter45;
128
129 filter45 = ((const int32_t *)filter)[0];
130
131 for (y = h; y--;) {
132 /* prefetch data to cache memory */
133 prefetch_store(dst + dst_stride);
134
135 for (x = 0; x < 64; x += 4) {
136 src_ptr = src + x;
137 dst_ptr = dst + x;
138
139 __asm__ __volatile__(
140 "ulw %[load1], 0(%[src_ptr]) \n\t"
141 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
142 "ulw %[load2], 0(%[src_ptr]) \n\t"
143
144 "mtlo %[vector4a], $ac0 \n\t"
145 "mtlo %[vector4a], $ac1 \n\t"
146 "mtlo %[vector4a], $ac2 \n\t"
147 "mtlo %[vector4a], $ac3 \n\t"
148 "mthi $zero, $ac0 \n\t"
149 "mthi $zero, $ac1 \n\t"
150 "mthi $zero, $ac2 \n\t"
151 "mthi $zero, $ac3 \n\t"
152
153 "preceu.ph.qbr %[scratch1], %[load1] \n\t"
154 "preceu.ph.qbr %[p1], %[load2] \n\t"
155
156 "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
157 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
158
159 "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
160 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
161
162 "preceu.ph.qbl %[scratch1], %[load1] \n\t"
163 "preceu.ph.qbl %[p1], %[load2] \n\t"
164
165 "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
166 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
167
168 "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
169 "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
170
171 "extp %[Temp1], $ac0, 31 \n\t"
172 "extp %[Temp2], $ac1, 31 \n\t"
173
174 "lbux %[store1], %[Temp1](%[cm]) \n\t"
175 "extp %[Temp1], $ac2, 31 \n\t"
176
177 "lbux %[store2], %[Temp2](%[cm]) \n\t"
178 "extp %[Temp2], $ac3, 31 \n\t"
179
180 "sb %[store1], 0(%[dst_ptr]) \n\t"
181 "sb %[store2], 1(%[dst_ptr]) \n\t"
182
183 "lbux %[store1], %[Temp1](%[cm]) \n\t"
184 "lbux %[store2], %[Temp2](%[cm]) \n\t"
185
186 "sb %[store1], 2(%[dst_ptr]) \n\t"
187 "sb %[store2], 3(%[dst_ptr]) \n\t"
188
189 : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
190 [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
191 [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
192 [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
193 : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
194 [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
195 }
196
197 /* Next row... */
198 src += src_stride;
199 dst += dst_stride;
200 }
201 }
202
/* Public entry point for the DSPr2 bilinear vertical convolution.
 *
 * Configures the DSP control register, then dispatches on the block width:
 * widths 4/8/16/32 share the generic 4-column kernel, width 64 uses the
 * fixed-width kernel, and anything else falls back to the portable C
 * implementation.  filter_x/x_step_q4 are only forwarded to the fallback.
 */
void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4, int w,
                              int h) {
  uint32_t pos = 38;

  assert(y_step_q4 == 16);

  /* Set the accumulator extract position used by the extp instructions in
     the kernels below. */
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                       :
                       : [pos] "r"(pos));

  prefetch_store(dst);

  if (w == 4 || w == 8 || w == 16 || w == 32) {
    convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h);
  } else if (w == 64) {
    prefetch_store(dst + 32);
    convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
  } else {
    aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                         filter_y, y_step_q4, w, h);
  }
}
237 #endif
238