1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12 #include "libyuv/rotate_row.h"
13
14 #include "libyuv/basic_types.h"
15
16 #ifdef __cplusplus
17 namespace libyuv {
18 extern "C" {
19 #endif
20
21 #if !defined(LIBYUV_DISABLE_MIPS) && \
22 defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
23 (_MIPS_SIM == _MIPS_SIM_ABI32)
24
// Transposes a block of |width| columns by 8 rows of bytes.
//
// For every source column the eight bytes at src[0], src[src_stride], ...,
// src[7 * src_stride] are packed into two 32-bit words and stored at dst[0..3]
// and dst[4..7]; |src| then advances by one byte and |dst| by |dst_stride|.
//
//   src, src_stride  first source row and the byte distance between rows.
//   dst, dst_stride  first destination row and the byte distance between rows.
//   width            number of source columns (== destination rows) to emit.
//                    A width of zero is a no-op.
//
// Plain word stores (sw) are used when both |dst| and |dst_stride| are
// multiples of 4; otherwise every word is written with an swr/swl pair.
// NOTE(review): the row 0 byte is placed in the least significant byte of the
// stored word, which presumably assumes a little-endian target - confirm.
void TransposeWx8_DSPR2(const uint8* src, int src_stride,
                        uint8* dst, int dst_stride, int width) {
  __asm__ __volatile__ (
    ".set push                                    \n"
    ".set noreorder                               \n"
    // Nothing to do for an empty block.
    "beqz        %[width], 2f                     \n"
    " sll        $t2, %[src_stride], 0x1          \n"  // src_stride x 2
    "sll         $t4, %[src_stride], 0x2          \n"  // src_stride x 4
    "sll         $t9, %[src_stride], 0x3          \n"  // src_stride x 8
    "addu        $t3, $t2, %[src_stride]          \n"  // src_stride x 3
    "addu        $t5, $t4, %[src_stride]          \n"  // src_stride x 5
    "addu        $t6, $t2, $t4                    \n"  // src_stride x 6
    // Use the unaligned loop unless dst and dst_stride are both multiples
    // of 4.
    "andi        $t0, %[dst], 0x3                 \n"
    "andi        $t1, %[dst_stride], 0x3          \n"
    "or          $t0, $t0, $t1                    \n"
    "bnez        $t0, 11f                         \n"
    " subu       $t7, $t9, %[src_stride]          \n"  // src_stride x 7

    // dst + dst_stride word aligned
    "1:                                           \n"
    "lbu         $t0, 0(%[src])                   \n"  // row 0
    "lbux        $t1, %[src_stride](%[src])       \n"  // row 1
    "lbux        $t8, $t2(%[src])                 \n"  // row 2
    "lbux        $t9, $t3(%[src])                 \n"  // row 3
    "sll         $t1, $t1, 16                     \n"
    "sll         $t9, $t9, 16                     \n"
    "or          $t0, $t0, $t1                    \n"  // |  0 | r1 |  0 | r0 |
    "or          $t8, $t8, $t9                    \n"  // |  0 | r3 |  0 | r2 |
    "precr.qb.ph $s0, $t8, $t0                    \n"  // | r3 | r2 | r1 | r0 |
    "lbux        $t0, $t4(%[src])                 \n"  // row 4
    "lbux        $t1, $t5(%[src])                 \n"  // row 5
    "lbux        $t8, $t6(%[src])                 \n"  // row 6
    "lbux        $t9, $t7(%[src])                 \n"  // row 7
    "sll         $t1, $t1, 16                     \n"
    "sll         $t9, $t9, 16                     \n"
    "or          $t0, $t0, $t1                    \n"  // |  0 | r5 |  0 | r4 |
    "or          $t8, $t8, $t9                    \n"  // |  0 | r7 |  0 | r6 |
    "precr.qb.ph $s1, $t8, $t0                    \n"  // | r7 | r6 | r5 | r4 |
    "sw          $s0, 0(%[dst])                   \n"
    "addiu       %[width], -1                     \n"
    "addiu       %[src], 1                        \n"
    "sw          $s1, 4(%[dst])                   \n"
    "bnez        %[width], 1b                     \n"
    " addu       %[dst], %[dst], %[dst_stride]    \n"  // delay slot
    "b           2f                               \n"
    // Fill the delay slot explicitly: under .set noreorder the first load of
    // the unaligned loop would otherwise execute here and read one byte at
    // the already advanced src.
    " nop                                         \n"

    // dst + dst_stride unaligned
    "11:                                          \n"
    "lbu         $t0, 0(%[src])                   \n"  // row 0
    "lbux        $t1, %[src_stride](%[src])       \n"  // row 1
    "lbux        $t8, $t2(%[src])                 \n"  // row 2
    "lbux        $t9, $t3(%[src])                 \n"  // row 3
    "sll         $t1, $t1, 16                     \n"
    "sll         $t9, $t9, 16                     \n"
    "or          $t0, $t0, $t1                    \n"
    "or          $t8, $t8, $t9                    \n"
    "precr.qb.ph $s0, $t8, $t0                    \n"  // | r3 | r2 | r1 | r0 |
    "lbux        $t0, $t4(%[src])                 \n"  // row 4
    "lbux        $t1, $t5(%[src])                 \n"  // row 5
    "lbux        $t8, $t6(%[src])                 \n"  // row 6
    "lbux        $t9, $t7(%[src])                 \n"  // row 7
    "sll         $t1, $t1, 16                     \n"
    "sll         $t9, $t9, 16                     \n"
    "or          $t0, $t0, $t1                    \n"
    "or          $t8, $t8, $t9                    \n"
    "precr.qb.ph $s1, $t8, $t0                    \n"  // | r7 | r6 | r5 | r4 |
    "swr         $s0, 0(%[dst])                   \n"
    "swl         $s0, 3(%[dst])                   \n"
    "addiu       %[width], -1                     \n"
    "addiu       %[src], 1                        \n"
    "swr         $s1, 4(%[dst])                   \n"
    "swl         $s1, 7(%[dst])                   \n"
    "bnez        %[width], 11b                    \n"
    " addu       %[dst], %[dst], %[dst_stride]    \n"  // delay slot

    "2:                                           \n"
    ".set pop                                     \n"
    : [src] "+r" (src),
      [dst] "+r" (dst),
      [width] "+r" (width)
    : [src_stride] "r" (src_stride),
      [dst_stride] "r" (dst_stride)
    // "memory": the asm stores through dst, which the compiler cannot see.
    : "t0", "t1", "t2", "t3", "t4", "t5",
      "t6", "t7", "t8", "t9",
      "s0", "s1",
      "memory"
  );
}
108
// Transposes a block of |width| columns by 8 rows of bytes, four source
// columns per loop iteration.
//
// Each iteration loads one 32-bit word from each of the eight source rows,
// regroups the bytes with precr.qb.ph / precrq.qb.ph and writes four
// destination rows of eight bytes each; |src| then advances by 4 bytes and
// |dst| by 4 * |dst_stride|.
//
//   src, src_stride  first source row and the byte distance between rows.
//   dst, dst_stride  first destination row and the byte distance between rows.
//   width            number of source columns. The loop runs width >> 2
//                    times, so columns beyond a multiple of 4 are not
//                    processed and a width below 4 is a no-op.
//
// Plain word stores (sw) are used when both |dst| and |dst_stride| are
// multiples of 4; otherwise every word is written with an swr/swl pair.
// NOTE(review): the source is read with lw/lwx; presumably callers guarantee
// a word-aligned |src| and |src_stride| - confirm against the call site.
void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
                             uint8* dst, int dst_stride, int width) {
  __asm__ __volatile__ (
    ".set push                                    \n"
    // $AT serves as the loop counter. noat is set inside push/pop so the
    // surrounding assembler state is restored exactly by .set pop.
    ".set noat                                    \n"
    ".set noreorder                               \n"
    "srl         $AT, %[width], 0x2               \n"  // iterations = width/4
    // Leave when there is no complete group of four columns; entering the
    // loop with a zero counter would make the decrement wrap around.
    "beqz        $AT, 2f                          \n"
    " sll        $t2, %[src_stride], 0x1          \n"  // src_stride x 2
    "sll         $t4, %[src_stride], 0x2          \n"  // src_stride x 4
    "sll         $t9, %[src_stride], 0x3          \n"  // src_stride x 8
    "addu        $t3, $t2, %[src_stride]          \n"  // src_stride x 3
    "addu        $t5, $t4, %[src_stride]          \n"  // src_stride x 5
    "addu        $t6, $t2, $t4                    \n"  // src_stride x 6

    // Use the unaligned loop unless dst and dst_stride are both multiples
    // of 4.
    "andi        $t0, %[dst], 0x3                 \n"
    "andi        $t1, %[dst_stride], 0x3          \n"
    "or          $t0, $t0, $t1                    \n"
    "bnez        $t0, 11f                         \n"
    " subu       $t7, $t9, %[src_stride]          \n"  // src_stride x 7

    // dst + dst_stride word aligned
    "1:                                           \n"
    "lw          $t0, 0(%[src])                   \n"
    "lwx         $t1, %[src_stride](%[src])       \n"
    "lwx         $t8, $t2(%[src])                 \n"
    "lwx         $t9, $t3(%[src])                 \n"

    // t0 = | 30 | 20 | 10 | 00 |
    // t1 = | 31 | 21 | 11 | 01 |
    // t8 = | 32 | 22 | 12 | 02 |
    // t9 = | 33 | 23 | 13 | 03 |

    "precr.qb.ph    $s0, $t1, $t0                 \n"
    "precr.qb.ph    $s1, $t9, $t8                 \n"
    "precrq.qb.ph   $s2, $t1, $t0                 \n"
    "precrq.qb.ph   $s3, $t9, $t8                 \n"

    // s0 = | 21 | 01 | 20 | 00 |
    // s1 = | 23 | 03 | 22 | 02 |
    // s2 = | 31 | 11 | 30 | 10 |
    // s3 = | 33 | 13 | 32 | 12 |

    "precr.qb.ph    $s4, $s1, $s0                 \n"
    "precrq.qb.ph   $s5, $s1, $s0                 \n"
    "precr.qb.ph    $s6, $s3, $s2                 \n"
    "precrq.qb.ph   $s7, $s3, $s2                 \n"

    // s4 = | 03 | 02 | 01 | 00 |
    // s5 = | 23 | 22 | 21 | 20 |
    // s6 = | 13 | 12 | 11 | 10 |
    // s7 = | 33 | 32 | 31 | 30 |

    "lwx         $t0, $t4(%[src])                 \n"
    "lwx         $t1, $t5(%[src])                 \n"
    "lwx         $t8, $t6(%[src])                 \n"
    "lwx         $t9, $t7(%[src])                 \n"

    // t0 = | 34 | 24 | 14 | 04 |
    // t1 = | 35 | 25 | 15 | 05 |
    // t8 = | 36 | 26 | 16 | 06 |
    // t9 = | 37 | 27 | 17 | 07 |

    "precr.qb.ph    $s0, $t1, $t0                 \n"
    "precr.qb.ph    $s1, $t9, $t8                 \n"
    "precrq.qb.ph   $s2, $t1, $t0                 \n"
    "precrq.qb.ph   $s3, $t9, $t8                 \n"

    // s0 = | 25 | 05 | 24 | 04 |
    // s1 = | 27 | 07 | 26 | 06 |
    // s2 = | 35 | 15 | 34 | 14 |
    // s3 = | 37 | 17 | 36 | 16 |

    "precr.qb.ph    $t0, $s1, $s0                 \n"
    "precrq.qb.ph   $t1, $s1, $s0                 \n"
    "precr.qb.ph    $t8, $s3, $s2                 \n"
    "precrq.qb.ph   $t9, $s3, $s2                 \n"

    // t0 = | 07 | 06 | 05 | 04 |
    // t1 = | 27 | 26 | 25 | 24 |
    // t8 = | 17 | 16 | 15 | 14 |
    // t9 = | 37 | 36 | 35 | 34 |

    // s0..s2 = addresses of destination rows 1..3 of this group.
    "addu        $s0, %[dst], %[dst_stride]       \n"
    "addu        $s1, $s0, %[dst_stride]          \n"
    "addu        $s2, $s1, %[dst_stride]          \n"

    "sw          $s4, 0(%[dst])                   \n"
    "sw          $t0, 4(%[dst])                   \n"
    "sw          $s6, 0($s0)                      \n"
    "sw          $t8, 4($s0)                      \n"
    "sw          $s5, 0($s1)                      \n"
    "sw          $t1, 4($s1)                      \n"
    "sw          $s7, 0($s2)                      \n"
    "sw          $t9, 4($s2)                      \n"

    "addiu       $AT, -1                          \n"
    "addiu       %[src], 4                        \n"

    "bnez        $AT, 1b                          \n"
    " addu       %[dst], $s2, %[dst_stride]       \n"  // delay slot
    "b           2f                               \n"
    // Fill the delay slot explicitly: under .set noreorder the first lw of
    // the unaligned loop would otherwise execute here and read a word at the
    // already advanced src.
    " nop                                         \n"

    // dst + dst_stride unaligned
    "11:                                          \n"
    "lw          $t0, 0(%[src])                   \n"
    "lwx         $t1, %[src_stride](%[src])       \n"
    "lwx         $t8, $t2(%[src])                 \n"
    "lwx         $t9, $t3(%[src])                 \n"

    // t0 = | 30 | 20 | 10 | 00 |
    // t1 = | 31 | 21 | 11 | 01 |
    // t8 = | 32 | 22 | 12 | 02 |
    // t9 = | 33 | 23 | 13 | 03 |

    "precr.qb.ph    $s0, $t1, $t0                 \n"
    "precr.qb.ph    $s1, $t9, $t8                 \n"
    "precrq.qb.ph   $s2, $t1, $t0                 \n"
    "precrq.qb.ph   $s3, $t9, $t8                 \n"

    // s0 = | 21 | 01 | 20 | 00 |
    // s1 = | 23 | 03 | 22 | 02 |
    // s2 = | 31 | 11 | 30 | 10 |
    // s3 = | 33 | 13 | 32 | 12 |

    "precr.qb.ph    $s4, $s1, $s0                 \n"
    "precrq.qb.ph   $s5, $s1, $s0                 \n"
    "precr.qb.ph    $s6, $s3, $s2                 \n"
    "precrq.qb.ph   $s7, $s3, $s2                 \n"

    // s4 = | 03 | 02 | 01 | 00 |
    // s5 = | 23 | 22 | 21 | 20 |
    // s6 = | 13 | 12 | 11 | 10 |
    // s7 = | 33 | 32 | 31 | 30 |

    "lwx         $t0, $t4(%[src])                 \n"
    "lwx         $t1, $t5(%[src])                 \n"
    "lwx         $t8, $t6(%[src])                 \n"
    "lwx         $t9, $t7(%[src])                 \n"

    // t0 = | 34 | 24 | 14 | 04 |
    // t1 = | 35 | 25 | 15 | 05 |
    // t8 = | 36 | 26 | 16 | 06 |
    // t9 = | 37 | 27 | 17 | 07 |

    "precr.qb.ph    $s0, $t1, $t0                 \n"
    "precr.qb.ph    $s1, $t9, $t8                 \n"
    "precrq.qb.ph   $s2, $t1, $t0                 \n"
    "precrq.qb.ph   $s3, $t9, $t8                 \n"

    // s0 = | 25 | 05 | 24 | 04 |
    // s1 = | 27 | 07 | 26 | 06 |
    // s2 = | 35 | 15 | 34 | 14 |
    // s3 = | 37 | 17 | 36 | 16 |

    "precr.qb.ph    $t0, $s1, $s0                 \n"
    "precrq.qb.ph   $t1, $s1, $s0                 \n"
    "precr.qb.ph    $t8, $s3, $s2                 \n"
    "precrq.qb.ph   $t9, $s3, $s2                 \n"

    // t0 = | 07 | 06 | 05 | 04 |
    // t1 = | 27 | 26 | 25 | 24 |
    // t8 = | 17 | 16 | 15 | 14 |
    // t9 = | 37 | 36 | 35 | 34 |

    // s0..s2 = addresses of destination rows 1..3 of this group.
    "addu        $s0, %[dst], %[dst_stride]       \n"
    "addu        $s1, $s0, %[dst_stride]          \n"
    "addu        $s2, $s1, %[dst_stride]          \n"

    "swr         $s4, 0(%[dst])                   \n"
    "swl         $s4, 3(%[dst])                   \n"
    "swr         $t0, 4(%[dst])                   \n"
    "swl         $t0, 7(%[dst])                   \n"
    "swr         $s6, 0($s0)                      \n"
    "swl         $s6, 3($s0)                      \n"
    "swr         $t8, 4($s0)                      \n"
    "swl         $t8, 7($s0)                      \n"
    "swr         $s5, 0($s1)                      \n"
    "swl         $s5, 3($s1)                      \n"
    "swr         $t1, 4($s1)                      \n"
    "swl         $t1, 7($s1)                      \n"
    "swr         $s7, 0($s2)                      \n"
    "swl         $s7, 3($s2)                      \n"
    "swr         $t9, 4($s2)                      \n"
    "swl         $t9, 7($s2)                      \n"

    "addiu       $AT, -1                          \n"
    "addiu       %[src], 4                        \n"

    "bnez        $AT, 11b                         \n"
    " addu       %[dst], $s2, %[dst_stride]       \n"  // delay slot

    "2:                                           \n"
    ".set pop                                     \n"
    : [src] "+r" (src),
      [dst] "+r" (dst),
      [width] "+r" (width)
    : [src_stride] "r" (src_stride),
      [dst_stride] "r" (dst_stride)
    // "memory": the asm stores through dst, which the compiler cannot see.
    : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
      "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
      "memory"
  );
}
310
// Transposes an 8-row block of interleaved two-channel bytes and splits the
// channels into two destination planes.
//
// Each loop iteration loads one 32-bit word (two byte pairs, laid out as
// |B|A|b|a| in the register diagrams below) from each of the eight source
// rows. The eight "a" bytes are stored as one row at dst_a and the eight "A"
// bytes as the next row at dst_a + dst_stride_a; the "b" and "B" bytes go to
// dst_b and dst_b + dst_stride_b in the same way. |src| then advances by
// 4 bytes and each destination by twice its stride.
//
//   src, src_stride      first source row and the byte distance between rows.
//   dst_a, dst_stride_a  first plane: start address and row distance.
//   dst_b, dst_stride_b  second plane: start address and row distance.
//   width                number of byte pairs per source row. The loop runs
//                        width >> 1 times, so an odd trailing pair is not
//                        processed and a width below 2 is a no-op.
//
// Plain word stores (sw) are used when dst_a, dst_b, dst_stride_a and
// dst_stride_b are all multiples of 4; otherwise every word is written with
// an swr/swl pair.
// NOTE(review): the source is read with lw/lwx; presumably callers guarantee
// a word-aligned |src| and |src_stride| - confirm against the call site.
void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
                          uint8* dst_a, int dst_stride_a,
                          uint8* dst_b, int dst_stride_b,
                          int width) {
  __asm__ __volatile__ (
    ".set push                                    \n"
    ".set noreorder                               \n"
    "srl         $t1, %[width], 1                 \n"  // iterations = width/2
    // Leave when there is no complete group of two pairs; entering the loop
    // with a zero counter would make the decrement wrap around.
    "beqz        $t1, 2f                          \n"
    " sll        $t2, %[src_stride], 0x1          \n"  // src_stride x 2
    "sll         $t4, %[src_stride], 0x2          \n"  // src_stride x 4
    "sll         $t9, %[src_stride], 0x3          \n"  // src_stride x 8
    "addu        $t3, $t2, %[src_stride]          \n"  // src_stride x 3
    "addu        $t5, $t4, %[src_stride]          \n"  // src_stride x 5
    "addu        $t6, $t2, $t4                    \n"  // src_stride x 6
    "subu        $t7, $t9, %[src_stride]          \n"  // src_stride x 7

    // check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
    "andi        $t0, %[dst_a], 0x3               \n"
    "andi        $t8, %[dst_b], 0x3               \n"
    "or          $t0, $t0, $t8                    \n"
    "andi        $t8, %[dst_stride_a], 0x3        \n"
    "andi        $s5, %[dst_stride_b], 0x3        \n"
    "or          $t8, $t8, $s5                    \n"
    "or          $t0, $t0, $t8                    \n"
    "bnez        $t0, 11f                         \n"
    " nop                                         \n"

    // dst + dst_stride word aligned (both, a & b dst addresses)
    "1:                                           \n"
    "lw          $t0, 0(%[src])                   \n"  // |B0|A0|b0|a0|
    "lwx         $t8, %[src_stride](%[src])       \n"  // |B1|A1|b1|a1|
    "addu        $s5, %[dst_a], %[dst_stride_a]   \n"  // second row of dst_a
    "lwx         $t9, $t2(%[src])                 \n"  // |B2|A2|b2|a2|
    "lwx         $s0, $t3(%[src])                 \n"  // |B3|A3|b3|a3|
    "addu        $s6, %[dst_b], %[dst_stride_b]   \n"  // second row of dst_b

    "precrq.ph.w    $s1, $t8, $t0                 \n"  // |B1|A1|B0|A0|
    "precrq.ph.w    $s2, $s0, $t9                 \n"  // |B3|A3|B2|A2|
    "precr.qb.ph    $s3, $s2, $s1                 \n"  // |A3|A2|A1|A0|
    "precrq.qb.ph   $s4, $s2, $s1                 \n"  // |B3|B2|B1|B0|

    "sll         $t0, $t0, 16                     \n"
    "packrl.ph   $s1, $t8, $t0                    \n"  // |b1|a1|b0|a0|
    "sll         $t9, $t9, 16                     \n"
    "packrl.ph   $s2, $s0, $t9                    \n"  // |b3|a3|b2|a2|

    "sw          $s3, 0($s5)                      \n"
    "sw          $s4, 0($s6)                      \n"

    "precr.qb.ph    $s3, $s2, $s1                 \n"  // |a3|a2|a1|a0|
    "precrq.qb.ph   $s4, $s2, $s1                 \n"  // |b3|b2|b1|b0|

    "lwx         $t0, $t4(%[src])                 \n"  // |B4|A4|b4|a4|
    "lwx         $t8, $t5(%[src])                 \n"  // |B5|A5|b5|a5|
    "lwx         $t9, $t6(%[src])                 \n"  // |B6|A6|b6|a6|
    "lwx         $s0, $t7(%[src])                 \n"  // |B7|A7|b7|a7|
    "sw          $s3, 0(%[dst_a])                 \n"
    "sw          $s4, 0(%[dst_b])                 \n"

    "precrq.ph.w    $s1, $t8, $t0                 \n"  // |B5|A5|B4|A4|
    "precrq.ph.w    $s2, $s0, $t9                 \n"  // |B7|A7|B6|A6|
    "precr.qb.ph    $s3, $s2, $s1                 \n"  // |A7|A6|A5|A4|
    "precrq.qb.ph   $s4, $s2, $s1                 \n"  // |B7|B6|B5|B4|

    "sll         $t0, $t0, 16                     \n"
    "packrl.ph   $s1, $t8, $t0                    \n"  // |b5|a5|b4|a4|
    "sll         $t9, $t9, 16                     \n"
    "packrl.ph   $s2, $s0, $t9                    \n"  // |b7|a7|b6|a6|
    "sw          $s3, 4($s5)                      \n"
    "sw          $s4, 4($s6)                      \n"

    "precr.qb.ph    $s3, $s2, $s1                 \n"  // |a7|a6|a5|a4|
    "precrq.qb.ph   $s4, $s2, $s1                 \n"  // |b7|b6|b5|b4|

    "addiu       %[src], 4                        \n"
    "addiu       $t1, -1                          \n"
    "sll         $t0, %[dst_stride_a], 1          \n"  // dst_stride_a x 2
    "sll         $t8, %[dst_stride_b], 1          \n"  // dst_stride_b x 2
    "sw          $s3, 4(%[dst_a])                 \n"
    "sw          $s4, 4(%[dst_b])                 \n"
    "addu        %[dst_a], %[dst_a], $t0          \n"
    "bnez        $t1, 1b                          \n"
    " addu       %[dst_b], %[dst_b], $t8          \n"  // delay slot
    "b           2f                               \n"
    " nop                                         \n"

    // dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
    "11:                                          \n"
    "lw          $t0, 0(%[src])                   \n"  // |B0|A0|b0|a0|
    "lwx         $t8, %[src_stride](%[src])       \n"  // |B1|A1|b1|a1|
    "addu        $s5, %[dst_a], %[dst_stride_a]   \n"  // second row of dst_a
    "lwx         $t9, $t2(%[src])                 \n"  // |B2|A2|b2|a2|
    "lwx         $s0, $t3(%[src])                 \n"  // |B3|A3|b3|a3|
    "addu        $s6, %[dst_b], %[dst_stride_b]   \n"  // second row of dst_b

    "precrq.ph.w    $s1, $t8, $t0                 \n"  // |B1|A1|B0|A0|
    "precrq.ph.w    $s2, $s0, $t9                 \n"  // |B3|A3|B2|A2|
    "precr.qb.ph    $s3, $s2, $s1                 \n"  // |A3|A2|A1|A0|
    "precrq.qb.ph   $s4, $s2, $s1                 \n"  // |B3|B2|B1|B0|

    "sll         $t0, $t0, 16                     \n"
    "packrl.ph   $s1, $t8, $t0                    \n"  // |b1|a1|b0|a0|
    "sll         $t9, $t9, 16                     \n"
    "packrl.ph   $s2, $s0, $t9                    \n"  // |b3|a3|b2|a2|

    "swr         $s3, 0($s5)                      \n"
    "swl         $s3, 3($s5)                      \n"
    "swr         $s4, 0($s6)                      \n"
    "swl         $s4, 3($s6)                      \n"

    "precr.qb.ph    $s3, $s2, $s1                 \n"  // |a3|a2|a1|a0|
    "precrq.qb.ph   $s4, $s2, $s1                 \n"  // |b3|b2|b1|b0|

    "lwx         $t0, $t4(%[src])                 \n"  // |B4|A4|b4|a4|
    "lwx         $t8, $t5(%[src])                 \n"  // |B5|A5|b5|a5|
    "lwx         $t9, $t6(%[src])                 \n"  // |B6|A6|b6|a6|
    "lwx         $s0, $t7(%[src])                 \n"  // |B7|A7|b7|a7|
    "swr         $s3, 0(%[dst_a])                 \n"
    "swl         $s3, 3(%[dst_a])                 \n"
    "swr         $s4, 0(%[dst_b])                 \n"
    "swl         $s4, 3(%[dst_b])                 \n"

    "precrq.ph.w    $s1, $t8, $t0                 \n"  // |B5|A5|B4|A4|
    "precrq.ph.w    $s2, $s0, $t9                 \n"  // |B7|A7|B6|A6|
    "precr.qb.ph    $s3, $s2, $s1                 \n"  // |A7|A6|A5|A4|
    "precrq.qb.ph   $s4, $s2, $s1                 \n"  // |B7|B6|B5|B4|

    "sll         $t0, $t0, 16                     \n"
    "packrl.ph   $s1, $t8, $t0                    \n"  // |b5|a5|b4|a4|
    "sll         $t9, $t9, 16                     \n"
    "packrl.ph   $s2, $s0, $t9                    \n"  // |b7|a7|b6|a6|

    "swr         $s3, 4($s5)                      \n"
    "swl         $s3, 7($s5)                      \n"
    "swr         $s4, 4($s6)                      \n"
    "swl         $s4, 7($s6)                      \n"

    "precr.qb.ph    $s3, $s2, $s1                 \n"  // |a7|a6|a5|a4|
    "precrq.qb.ph   $s4, $s2, $s1                 \n"  // |b7|b6|b5|b4|

    "addiu       %[src], 4                        \n"
    "addiu       $t1, -1                          \n"
    "sll         $t0, %[dst_stride_a], 1          \n"  // dst_stride_a x 2
    "sll         $t8, %[dst_stride_b], 1          \n"  // dst_stride_b x 2
    "swr         $s3, 4(%[dst_a])                 \n"
    "swl         $s3, 7(%[dst_a])                 \n"
    "swr         $s4, 4(%[dst_b])                 \n"
    "swl         $s4, 7(%[dst_b])                 \n"
    "addu        %[dst_a], %[dst_a], $t0          \n"
    "bnez        $t1, 11b                         \n"
    " addu       %[dst_b], %[dst_b], $t8          \n"  // delay slot

    "2:                                           \n"
    ".set pop                                     \n"
    : [src] "+r" (src),
      [dst_a] "+r" (dst_a),
      [dst_b] "+r" (dst_b),
      [width] "+r" (width)
    // src_stride is only read by the asm, so it is a plain input operand.
    : [src_stride] "r" (src_stride),
      [dst_stride_a] "r" (dst_stride_a),
      [dst_stride_b] "r" (dst_stride_b)
    // "memory": the asm stores through dst_a and dst_b, which the compiler
    // cannot see.
    : "t0", "t1", "t2", "t3", "t4", "t5",
      "t6", "t7", "t8", "t9",
      "s0", "s1", "s2", "s3",
      "s4", "s5", "s6",
      "memory"
  );
}
478
479 #endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
480
481 #ifdef __cplusplus
482 } // extern "C"
483 } // namespace libyuv
484 #endif
485