/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate_row.h"
#include "libyuv/row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
    (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)

// Transposes an 8-row x `width`-column tile of bytes: source column i becomes
// destination row i (8 bytes wide). One byte per row is gathered per
// iteration, packed into two words with precr.qb.ph, and stored.
// Two runtime paths: word stores (sw) when both dst and dst_stride are
// 4-byte aligned, otherwise unaligned store pairs (swr/swl).
// NOTE(review): unlike TransposeWx8_Fast_DSPR2/TransposeUVWx8_DSPR2 there is
// no early-out for width == 0; callers appear expected to pass width >= 1 —
// confirm against call sites.
void TransposeWx8_DSPR2(const uint8* src,
                        int src_stride,
                        uint8* dst,
                        int dst_stride,
                        int width) {
  __asm__ __volatile__(
      ".set push                                     \n"
      ".set noreorder                                \n"
      // Row offsets from src: t2/t4/t9 = stride*{2,4,8}, t3/t5/t6 = *{3,5,6}.
      "sll             $t2, %[src_stride], 0x1       \n"  // src_stride x 2
      "sll             $t4, %[src_stride], 0x2       \n"  // src_stride x 4
      "sll             $t9, %[src_stride], 0x3       \n"  // src_stride x 8
      "addu            $t3, $t2, %[src_stride]       \n"
      "addu            $t5, $t4, %[src_stride]       \n"
      "addu            $t6, $t2, $t4                 \n"
      // Branch to the unaligned path if dst or dst_stride is misaligned.
      "andi            $t0, %[dst], 0x3              \n"
      "andi            $t1, %[dst_stride], 0x3       \n"
      "or              $t0, $t0, $t1                 \n"
      "bnez            $t0, 11f                      \n"
      " subu           $t7, $t9, %[src_stride]       \n"  // delay slot: stride x 7
      // dst + dst_stride word aligned
      "1:                                            \n"
      // One byte from each of rows 0..3, packed into word $s0.
      "lbu             $t0, 0(%[src])                \n"
      "lbux            $t1, %[src_stride](%[src])    \n"
      "lbux            $t8, $t2(%[src])              \n"
      "lbux            $t9, $t3(%[src])              \n"
      "sll             $t1, $t1, 16                  \n"
      "sll             $t9, $t9, 16                  \n"
      "or              $t0, $t0, $t1                 \n"
      "or              $t8, $t8, $t9                 \n"
      "precr.qb.ph     $s0, $t8, $t0                 \n"
      // One byte from each of rows 4..7, packed into word $s1.
      "lbux            $t0, $t4(%[src])              \n"
      "lbux            $t1, $t5(%[src])              \n"
      "lbux            $t8, $t6(%[src])              \n"
      "lbux            $t9, $t7(%[src])              \n"
      "sll             $t1, $t1, 16                  \n"
      "sll             $t9, $t9, 16                  \n"
      "or              $t0, $t0, $t1                 \n"
      "or              $t8, $t8, $t9                 \n"
      "precr.qb.ph     $s1, $t8, $t0                 \n"
      "sw              $s0, 0(%[dst])                \n"
      "addiu           %[width], -1                  \n"
      "addiu           %[src], 1                     \n"
      "sw              $s1, 4(%[dst])                \n"
      "bnez            %[width], 1b                  \n"
      " addu           %[dst], %[dst], %[dst_stride] \n"  // delay slot
      "b               2f                            \n"
      // dst + dst_stride unaligned
      "11:                                           \n"
      "lbu             $t0, 0(%[src])                \n"
      "lbux            $t1, %[src_stride](%[src])    \n"
      "lbux            $t8, $t2(%[src])              \n"
      "lbux            $t9, $t3(%[src])              \n"
      "sll             $t1, $t1, 16                  \n"
      "sll             $t9, $t9, 16                  \n"
      "or              $t0, $t0, $t1                 \n"
      "or              $t8, $t8, $t9                 \n"
      "precr.qb.ph     $s0, $t8, $t0                 \n"
      "lbux            $t0, $t4(%[src])              \n"
      "lbux            $t1, $t5(%[src])              \n"
      "lbux            $t8, $t6(%[src])              \n"
      "lbux            $t9, $t7(%[src])              \n"
      "sll             $t1, $t1, 16                  \n"
      "sll             $t9, $t9, 16                  \n"
      "or              $t0, $t0, $t1                 \n"
      "or              $t8, $t8, $t9                 \n"
      "precr.qb.ph     $s1, $t8, $t0                 \n"
      // Unaligned word stores: swr writes the low part, swl the high part.
      "swr             $s0, 0(%[dst])                \n"
      "swl             $s0, 3(%[dst])                \n"
      "addiu           %[width], -1                  \n"
      "addiu           %[src], 1                     \n"
      "swr             $s1, 4(%[dst])                \n"
      "swl             $s1, 7(%[dst])                \n"
      "bnez            %[width], 11b                 \n"
      "addu            %[dst], %[dst], %[dst_stride] \n"
      "2:                                            \n"
      ".set pop                                      \n"
      : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
      : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0",
        "s1");
}
// Fast variant of TransposeWx8: processes 4 source columns x 8 rows per
// iteration using word loads (lw/lwx) and precr/precrq byte-pack
// instructions, iterating width/4 times ($AT holds the counter; .set noat
// frees the assembler temporary for this use).
// NOTE(review): the loop count is width >> 2, so any width % 4 remainder
// columns are not transposed here — presumably handled by the caller;
// confirm against call sites. Returns immediately when width == 0.
void TransposeWx8_Fast_DSPR2(const uint8* src,
                             int src_stride,
                             uint8* dst,
                             int dst_stride,
                             int width) {
  __asm__ __volatile__(
      ".set noat                                     \n"
      ".set push                                     \n"
      ".set noreorder                                \n"
      "beqz            %[width], 2f                  \n"
      " sll            $t2, %[src_stride], 0x1       \n"  // src_stride x 2
      "sll             $t4, %[src_stride], 0x2       \n"  // src_stride x 4
      "sll             $t9, %[src_stride], 0x3       \n"  // src_stride x 8
      "addu            $t3, $t2, %[src_stride]       \n"
      "addu            $t5, $t4, %[src_stride]       \n"
      "addu            $t6, $t2, $t4                 \n"

      "srl             $AT, %[width], 0x2            \n"  // iterations = width / 4
      // Branch to the unaligned path if dst or dst_stride is misaligned.
      "andi            $t0, %[dst], 0x3              \n"
      "andi            $t1, %[dst_stride], 0x3       \n"
      "or              $t0, $t0, $t1                 \n"
      "bnez            $t0, 11f                      \n"
      " subu           $t7, $t9, %[src_stride]       \n"  // delay slot: stride x 7
      // dst + dst_stride word aligned
      "1:                                            \n"
      "lw              $t0, 0(%[src])                \n"
      "lwx             $t1, %[src_stride](%[src])    \n"
      "lwx             $t8, $t2(%[src])              \n"
      "lwx             $t9, $t3(%[src])              \n"

      // t0 = | 30 | 20 | 10 | 00 |
      // t1 = | 31 | 21 | 11 | 01 |
      // t8 = | 32 | 22 | 12 | 02 |
      // t9 = | 33 | 23 | 13 | 03 |

      "precr.qb.ph     $s0, $t1, $t0                 \n"
      "precr.qb.ph     $s1, $t9, $t8                 \n"
      "precrq.qb.ph    $s2, $t1, $t0                 \n"
      "precrq.qb.ph    $s3, $t9, $t8                 \n"

      // s0 = | 21 | 01 | 20 | 00 |
      // s1 = | 23 | 03 | 22 | 02 |
      // s2 = | 31 | 11 | 30 | 10 |
      // s3 = | 33 | 13 | 32 | 12 |

      "precr.qb.ph     $s4, $s1, $s0                 \n"
      "precrq.qb.ph    $s5, $s1, $s0                 \n"
      "precr.qb.ph     $s6, $s3, $s2                 \n"
      "precrq.qb.ph    $s7, $s3, $s2                 \n"

      // s4 = | 03 | 02 | 01 | 00 |
      // s5 = | 23 | 22 | 21 | 20 |
      // s6 = | 13 | 12 | 11 | 10 |
      // s7 = | 33 | 32 | 31 | 30 |

      "lwx             $t0, $t4(%[src])              \n"
      "lwx             $t1, $t5(%[src])              \n"
      "lwx             $t8, $t6(%[src])              \n"
      "lwx             $t9, $t7(%[src])              \n"

      // t0 = | 34 | 24 | 14 | 04 |
      // t1 = | 35 | 25 | 15 | 05 |
      // t8 = | 36 | 26 | 16 | 06 |
      // t9 = | 37 | 27 | 17 | 07 |

      "precr.qb.ph     $s0, $t1, $t0                 \n"
      "precr.qb.ph     $s1, $t9, $t8                 \n"
      "precrq.qb.ph    $s2, $t1, $t0                 \n"
      "precrq.qb.ph    $s3, $t9, $t8                 \n"

      // s0 = | 25 | 05 | 24 | 04 |
      // s1 = | 27 | 07 | 26 | 06 |
      // s2 = | 35 | 15 | 34 | 14 |
      // s3 = | 37 | 17 | 36 | 16 |

      "precr.qb.ph     $t0, $s1, $s0                 \n"
      "precrq.qb.ph    $t1, $s1, $s0                 \n"
      "precr.qb.ph     $t8, $s3, $s2                 \n"
      "precrq.qb.ph    $t9, $s3, $s2                 \n"

      // t0 = | 07 | 06 | 05 | 04 |
      // t1 = | 27 | 26 | 25 | 24 |
      // t8 = | 17 | 16 | 15 | 14 |
      // t9 = | 37 | 36 | 35 | 34 |

      // s0..s2 = next three destination row addresses.
      "addu            $s0, %[dst], %[dst_stride]    \n"
      "addu            $s1, $s0, %[dst_stride]       \n"
      "addu            $s2, $s1, %[dst_stride]       \n"

      "sw              $s4, 0(%[dst])                \n"
      "sw              $t0, 4(%[dst])                \n"
      "sw              $s6, 0($s0)                   \n"
      "sw              $t8, 4($s0)                   \n"
      "sw              $s5, 0($s1)                   \n"
      "sw              $t1, 4($s1)                   \n"
      "sw              $s7, 0($s2)                   \n"
      "sw              $t9, 4($s2)                   \n"

      "addiu           $AT, -1                       \n"
      "addiu           %[src], 4                     \n"

      "bnez            $AT, 1b                       \n"
      " addu           %[dst], $s2, %[dst_stride]    \n"  // delay slot
      "b               2f                            \n"
      // dst + dst_stride unaligned
      "11:                                           \n"
      "lw              $t0, 0(%[src])                \n"
      "lwx             $t1, %[src_stride](%[src])    \n"
      "lwx             $t8, $t2(%[src])              \n"
      "lwx             $t9, $t3(%[src])              \n"

      // t0 = | 30 | 20 | 10 | 00 |
      // t1 = | 31 | 21 | 11 | 01 |
      // t8 = | 32 | 22 | 12 | 02 |
      // t9 = | 33 | 23 | 13 | 03 |

      "precr.qb.ph     $s0, $t1, $t0                 \n"
      "precr.qb.ph     $s1, $t9, $t8                 \n"
      "precrq.qb.ph    $s2, $t1, $t0                 \n"
      "precrq.qb.ph    $s3, $t9, $t8                 \n"

      // s0 = | 21 | 01 | 20 | 00 |
      // s1 = | 23 | 03 | 22 | 02 |
      // s2 = | 31 | 11 | 30 | 10 |
      // s3 = | 33 | 13 | 32 | 12 |

      "precr.qb.ph     $s4, $s1, $s0                 \n"
      "precrq.qb.ph    $s5, $s1, $s0                 \n"
      "precr.qb.ph     $s6, $s3, $s2                 \n"
      "precrq.qb.ph    $s7, $s3, $s2                 \n"

      // s4 = | 03 | 02 | 01 | 00 |
      // s5 = | 23 | 22 | 21 | 20 |
      // s6 = | 13 | 12 | 11 | 10 |
      // s7 = | 33 | 32 | 31 | 30 |

      "lwx             $t0, $t4(%[src])              \n"
      "lwx             $t1, $t5(%[src])              \n"
      "lwx             $t8, $t6(%[src])              \n"
      "lwx             $t9, $t7(%[src])              \n"

      // t0 = | 34 | 24 | 14 | 04 |
      // t1 = | 35 | 25 | 15 | 05 |
      // t8 = | 36 | 26 | 16 | 06 |
      // t9 = | 37 | 27 | 17 | 07 |

      "precr.qb.ph     $s0, $t1, $t0                 \n"
      "precr.qb.ph     $s1, $t9, $t8                 \n"
      "precrq.qb.ph    $s2, $t1, $t0                 \n"
      "precrq.qb.ph    $s3, $t9, $t8                 \n"

      // s0 = | 25 | 05 | 24 | 04 |
      // s1 = | 27 | 07 | 26 | 06 |
      // s2 = | 35 | 15 | 34 | 14 |
      // s3 = | 37 | 17 | 36 | 16 |

      "precr.qb.ph     $t0, $s1, $s0                 \n"
      "precrq.qb.ph    $t1, $s1, $s0                 \n"
      "precr.qb.ph     $t8, $s3, $s2                 \n"
      "precrq.qb.ph    $t9, $s3, $s2                 \n"

      // t0 = | 07 | 06 | 05 | 04 |
      // t1 = | 27 | 26 | 25 | 24 |
      // t8 = | 17 | 16 | 15 | 14 |
      // t9 = | 37 | 36 | 35 | 34 |

      "addu            $s0, %[dst], %[dst_stride]    \n"
      "addu            $s1, $s0, %[dst_stride]       \n"
      "addu            $s2, $s1, %[dst_stride]       \n"

      // Unaligned word stores: swr writes the low part, swl the high part.
      "swr             $s4, 0(%[dst])                \n"
      "swl             $s4, 3(%[dst])                \n"
      "swr             $t0, 4(%[dst])                \n"
      "swl             $t0, 7(%[dst])                \n"
      "swr             $s6, 0($s0)                   \n"
      "swl             $s6, 3($s0)                   \n"
      "swr             $t8, 4($s0)                   \n"
      "swl             $t8, 7($s0)                   \n"
      "swr             $s5, 0($s1)                   \n"
      "swl             $s5, 3($s1)                   \n"
      "swr             $t1, 4($s1)                   \n"
      "swl             $t1, 7($s1)                   \n"
      "swr             $s7, 0($s2)                   \n"
      "swl             $s7, 3($s2)                   \n"
      "swr             $t9, 4($s2)                   \n"
      "swl             $t9, 7($s2)                   \n"

      "addiu           $AT, -1                       \n"
      "addiu           %[src], 4                     \n"

      "bnez            $AT, 11b                      \n"
      " addu           %[dst], $s2, %[dst_stride]    \n"  // delay slot
      "2:                                            \n"
      ".set pop                                      \n"
      ".set at                                       \n"
      : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
      : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0",
        "s1", "s2", "s3", "s4", "s5", "s6", "s7");
}
// Deinterleaves and transposes an 8-row tile of interleaved U/V bytes:
// plane A (even bytes) goes to dst_a, plane B (odd bytes) to dst_b, with
// source columns becoming 8-byte destination rows. Processes two UV column
// pairs per iteration (width/2 iterations, counter in $t1), reading one word
// per source row. Word stores (sw) are used when dst_a, dst_b and both
// strides are 4-byte aligned; otherwise swr/swl pairs. Returns immediately
// when width == 0.
// NOTE(review): odd widths leave the last column pair untransposed
// (loop count is width >> 1) — presumably handled by the caller; confirm.
void TransposeUVWx8_DSPR2(const uint8* src,
                          int src_stride,
                          uint8* dst_a,
                          int dst_stride_a,
                          uint8* dst_b,
                          int dst_stride_b,
                          int width) {
  __asm__ __volatile__(
      ".set push                                        \n"
      ".set noreorder                                   \n"
      "beqz            %[width], 2f                     \n"
      " sll            $t2, %[src_stride], 0x1          \n"  // src_stride x 2
      "sll             $t4, %[src_stride], 0x2          \n"  // src_stride x 4
      "sll             $t9, %[src_stride], 0x3          \n"  // src_stride x 8
      "addu            $t3, $t2, %[src_stride]          \n"
      "addu            $t5, $t4, %[src_stride]          \n"
      "addu            $t6, $t2, $t4                    \n"
      "subu            $t7, $t9, %[src_stride]          \n"  // src_stride x 7
      "srl             $t1, %[width], 1                 \n"  // iterations = width / 2

      // Check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b.
      "andi            $t0, %[dst_a], 0x3               \n"
      "andi            $t8, %[dst_b], 0x3               \n"
      "or              $t0, $t0, $t8                    \n"
      "andi            $t8, %[dst_stride_a], 0x3        \n"
      "andi            $s5, %[dst_stride_b], 0x3        \n"
      "or              $t8, $t8, $s5                    \n"
      "or              $t0, $t0, $t8                    \n"
      "bnez            $t0, 11f                         \n"
      " nop                                             \n"
      // dst + dst_stride word aligned (both, a & b dst addresses)
      "1:                                               \n"
      "lw              $t0, 0(%[src])                   \n"  // |B0|A0|b0|a0|
      "lwx             $t8, %[src_stride](%[src])       \n"  // |B1|A1|b1|a1|
      "addu            $s5, %[dst_a], %[dst_stride_a]   \n"
      "lwx             $t9, $t2(%[src])                 \n"  // |B2|A2|b2|a2|
      "lwx             $s0, $t3(%[src])                 \n"  // |B3|A3|b3|a3|
      "addu            $s6, %[dst_b], %[dst_stride_b]   \n"

      "precrq.ph.w     $s1, $t8, $t0                    \n"  // |B1|A1|B0|A0|
      "precrq.ph.w     $s2, $s0, $t9                    \n"  // |B3|A3|B2|A2|
      "precr.qb.ph     $s3, $s2, $s1                    \n"  // |A3|A2|A1|A0|
      "precrq.qb.ph    $s4, $s2, $s1                    \n"  // |B3|B2|B1|B0|

      "sll             $t0, $t0, 16                     \n"
      "packrl.ph       $s1, $t8, $t0                    \n"  // |b1|a1|b0|a0|
      "sll             $t9, $t9, 16                     \n"
      "packrl.ph       $s2, $s0, $t9                    \n"  // |b3|a3|b2|a2|

      "sw              $s3, 0($s5)                      \n"
      "sw              $s4, 0($s6)                      \n"

      "precr.qb.ph     $s3, $s2, $s1                    \n"  // |a3|a2|a1|a0|
      "precrq.qb.ph    $s4, $s2, $s1                    \n"  // |b3|b2|b1|b0|

      "lwx             $t0, $t4(%[src])                 \n"  // |B4|A4|b4|a4|
      "lwx             $t8, $t5(%[src])                 \n"  // |B5|A5|b5|a5|
      "lwx             $t9, $t6(%[src])                 \n"  // |B6|A6|b6|a6|
      "lwx             $s0, $t7(%[src])                 \n"  // |B7|A7|b7|a7|
      "sw              $s3, 0(%[dst_a])                 \n"
      "sw              $s4, 0(%[dst_b])                 \n"

      "precrq.ph.w     $s1, $t8, $t0                    \n"  // |B5|A5|B4|A4|
      "precrq.ph.w     $s2, $s0, $t9                    \n"  // |B7|A7|B6|A6|
      "precr.qb.ph     $s3, $s2, $s1                    \n"  // |A7|A6|A5|A4|
      "precrq.qb.ph    $s4, $s2, $s1                    \n"  // |B7|B6|B5|B4|

      "sll             $t0, $t0, 16                     \n"
      "packrl.ph       $s1, $t8, $t0                    \n"  // |b5|a5|b4|a4|
      "sll             $t9, $t9, 16                     \n"
      "packrl.ph       $s2, $s0, $t9                    \n"  // |b7|a7|b6|a6|
      "sw              $s3, 4($s5)                      \n"
      "sw              $s4, 4($s6)                      \n"

      "precr.qb.ph     $s3, $s2, $s1                    \n"  // |a7|a6|a5|a4|
      "precrq.qb.ph    $s4, $s2, $s1                    \n"  // |b7|b6|b5|b4|

      "addiu           %[src], 4                        \n"
      "addiu           $t1, -1                          \n"
      "sll             $t0, %[dst_stride_a], 1          \n"
      "sll             $t8, %[dst_stride_b], 1          \n"
      "sw              $s3, 4(%[dst_a])                 \n"
      "sw              $s4, 4(%[dst_b])                 \n"
      "addu            %[dst_a], %[dst_a], $t0          \n"
      "bnez            $t1, 1b                          \n"
      " addu           %[dst_b], %[dst_b], $t8          \n"  // delay slot
      "b               2f                               \n"
      " nop                                             \n"

      // dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
      "11:                                              \n"
      "lw              $t0, 0(%[src])                   \n"  // |B0|A0|b0|a0|
      "lwx             $t8, %[src_stride](%[src])       \n"  // |B1|A1|b1|a1|
      "addu            $s5, %[dst_a], %[dst_stride_a]   \n"
      "lwx             $t9, $t2(%[src])                 \n"  // |B2|A2|b2|a2|
      "lwx             $s0, $t3(%[src])                 \n"  // |B3|A3|b3|a3|
      "addu            $s6, %[dst_b], %[dst_stride_b]   \n"

      "precrq.ph.w     $s1, $t8, $t0                    \n"  // |B1|A1|B0|A0|
      "precrq.ph.w     $s2, $s0, $t9                    \n"  // |B3|A3|B2|A2|
      "precr.qb.ph     $s3, $s2, $s1                    \n"  // |A3|A2|A1|A0|
      "precrq.qb.ph    $s4, $s2, $s1                    \n"  // |B3|B2|B1|B0|

      "sll             $t0, $t0, 16                     \n"
      "packrl.ph       $s1, $t8, $t0                    \n"  // |b1|a1|b0|a0|
      "sll             $t9, $t9, 16                     \n"
      "packrl.ph       $s2, $s0, $t9                    \n"  // |b3|a3|b2|a2|

      "swr             $s3, 0($s5)                      \n"
      "swl             $s3, 3($s5)                      \n"
      "swr             $s4, 0($s6)                      \n"
      "swl             $s4, 3($s6)                      \n"

      "precr.qb.ph     $s3, $s2, $s1                    \n"  // |a3|a2|a1|a0|
      "precrq.qb.ph    $s4, $s2, $s1                    \n"  // |b3|b2|b1|b0|

      "lwx             $t0, $t4(%[src])                 \n"  // |B4|A4|b4|a4|
      "lwx             $t8, $t5(%[src])                 \n"  // |B5|A5|b5|a5|
      "lwx             $t9, $t6(%[src])                 \n"  // |B6|A6|b6|a6|
      "lwx             $s0, $t7(%[src])                 \n"  // |B7|A7|b7|a7|
      "swr             $s3, 0(%[dst_a])                 \n"
      "swl             $s3, 3(%[dst_a])                 \n"
      "swr             $s4, 0(%[dst_b])                 \n"
      "swl             $s4, 3(%[dst_b])                 \n"

      "precrq.ph.w     $s1, $t8, $t0                    \n"  // |B5|A5|B4|A4|
      "precrq.ph.w     $s2, $s0, $t9                    \n"  // |B7|A7|B6|A6|
      "precr.qb.ph     $s3, $s2, $s1                    \n"  // |A7|A6|A5|A4|
      "precrq.qb.ph    $s4, $s2, $s1                    \n"  // |B7|B6|B5|B4|

      "sll             $t0, $t0, 16                     \n"
      "packrl.ph       $s1, $t8, $t0                    \n"  // |b5|a5|b4|a4|
      "sll             $t9, $t9, 16                     \n"
      "packrl.ph       $s2, $s0, $t9                    \n"  // |b7|a7|b6|a6|

      "swr             $s3, 4($s5)                      \n"
      "swl             $s3, 7($s5)                      \n"
      "swr             $s4, 4($s6)                      \n"
      "swl             $s4, 7($s6)                      \n"

      "precr.qb.ph     $s3, $s2, $s1                    \n"  // |a7|a6|a5|a4|
      "precrq.qb.ph    $s4, $s2, $s1                    \n"  // |b7|b6|b5|b4|

      "addiu           %[src], 4                        \n"
      "addiu           $t1, -1                          \n"
      "sll             $t0, %[dst_stride_a], 1          \n"
      "sll             $t8, %[dst_stride_b], 1          \n"
      "swr             $s3, 4(%[dst_a])                 \n"
      "swl             $s3, 7(%[dst_a])                 \n"
      "swr             $s4, 4(%[dst_b])                 \n"
      "swl             $s4, 7(%[dst_b])                 \n"
      "addu            %[dst_a], %[dst_a], $t0          \n"
      "bnez            $t1, 11b                         \n"
      " addu           %[dst_b], %[dst_b], $t8          \n"  // delay slot

      "2:                                               \n"
      ".set pop                                         \n"
      : [src] "+r"(src), [dst_a] "+r"(dst_a), [dst_b] "+r"(dst_b),
        [width] "+r"(width), [src_stride] "+r"(src_stride)
      : [dst_stride_a] "r"(dst_stride_a), [dst_stride_b] "r"(dst_stride_b)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0",
        "s1", "s2", "s3", "s4", "s5", "s6");
}

#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif