/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate_row.h"
#include "libyuv/row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
    (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)

void TransposeWx8_DSPR2(const uint8* src,
                        int src_stride,
                        uint8* dst,
                        int dst_stride,
                        int width) {
  __asm__ __volatile__(
      ".set push                                         \n"
      ".set noreorder                                    \n"
      "sll              $t2, %[src_stride], 0x1          \n"  // src_stride x 2
      "sll              $t4, %[src_stride], 0x2          \n"  // src_stride x 4
      "sll              $t9, %[src_stride], 0x3          \n"  // src_stride x 8
      "addu             $t3, $t2, %[src_stride]          \n"
      "addu             $t5, $t4, %[src_stride]          \n"
      "addu             $t6, $t2, $t4                    \n"
      "andi             $t0, %[dst], 0x3                 \n"
      "andi             $t1, %[dst_stride], 0x3          \n"
      "or               $t0, $t0, $t1                    \n"
      "bnez             $t0, 11f                         \n"
      " subu            $t7, $t9, %[src_stride]          \n"
      // dst + dst_stride word aligned
      "1:                                                \n"
      "lbu              $t0, 0(%[src])                   \n"
      "lbux             $t1, %[src_stride](%[src])       \n"
      "lbux             $t8, $t2(%[src])                 \n"
      "lbux             $t9, $t3(%[src])                 \n"
      "sll              $t1, $t1, 16                     \n"
      "sll              $t9, $t9, 16                     \n"
      "or               $t0, $t0, $t1                    \n"
      "or               $t8, $t8, $t9                    \n"
      "precr.qb.ph      $s0, $t8, $t0                    \n"
      "lbux             $t0, $t4(%[src])                 \n"
      "lbux             $t1, $t5(%[src])                 \n"
      "lbux             $t8, $t6(%[src])                 \n"
      "lbux             $t9, $t7(%[src])                 \n"
      "sll              $t1, $t1, 16                     \n"
      "sll              $t9, $t9, 16                     \n"
      "or               $t0, $t0, $t1                    \n"
      "or               $t8, $t8, $t9                    \n"
      "precr.qb.ph      $s1, $t8, $t0                    \n"
      "sw               $s0, 0(%[dst])                   \n"
      "addiu            %[width], -1                     \n"
      "addiu            %[src], 1                        \n"
      "sw               $s1, 4(%[dst])                   \n"
      "bnez             %[width], 1b                     \n"
      " addu            %[dst], %[dst], %[dst_stride]    \n"
      "b                2f                               \n"
      // dst + dst_stride unaligned
      "11:                                               \n"
      "lbu              $t0, 0(%[src])                   \n"
      "lbux             $t1, %[src_stride](%[src])       \n"
      "lbux             $t8, $t2(%[src])                 \n"
      "lbux             $t9, $t3(%[src])                 \n"
      "sll              $t1, $t1, 16                     \n"
      "sll              $t9, $t9, 16                     \n"
      "or               $t0, $t0, $t1                    \n"
      "or               $t8, $t8, $t9                    \n"
      "precr.qb.ph      $s0, $t8, $t0                    \n"
      "lbux             $t0, $t4(%[src])                 \n"
      "lbux             $t1, $t5(%[src])                 \n"
      "lbux             $t8, $t6(%[src])                 \n"
      "lbux             $t9, $t7(%[src])                 \n"
      "sll              $t1, $t1, 16                     \n"
      "sll              $t9, $t9, 16                     \n"
      "or               $t0, $t0, $t1                    \n"
      "or               $t8, $t8, $t9                    \n"
      "precr.qb.ph      $s1, $t8, $t0                    \n"
      "swr              $s0, 0(%[dst])                   \n"
      "swl              $s0, 3(%[dst])                   \n"
      "addiu            %[width], -1                     \n"
      "addiu            %[src], 1                        \n"
      "swr              $s1, 4(%[dst])                   \n"
      "swl              $s1, 7(%[dst])                   \n"
      "bnez             %[width], 11b                    \n"
      " addu            %[dst], %[dst], %[dst_stride]    \n"
      "2:                                                \n"
      ".set pop                                          \n"
      : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
      : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1");
}
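
// A minimal scalar sketch of what the DSPR2 kernel above computes, kept as
// documentation only: column i of the 8-row source block becomes output row
// i, so each iteration emits one 8-byte row of dst. TransposeWx8_Ref is a
// hypothetical name used for illustration; the portable routine that plays
// this role in libyuv is TransposeWx8_C.
static __attribute__((unused)) void TransposeWx8_Ref(const uint8* src,
                                                     int src_stride,
                                                     uint8* dst,
                                                     int dst_stride,
                                                     int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 8; ++j) {
      dst[j] = src[j * src_stride + i];  // byte j of output row i
    }
    dst += dst_stride;
  }
}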
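// A faster variant of the transpose above: instead of gathering one byte at
// a time with lbu/lbux, each iteration loads a whole word from each of the 8
// rows (a 4x8-byte tile) and transposes it in registers. The paired
// precr.qb.ph/precrq.qb.ph instructions split the even and odd bytes of two
// rows, and a second pack stage completes the byte transpose, so four 8-byte
// output rows are stored per iteration. $at carries the width/4 loop count,
// hence the ".set noat" below.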
void TransposeWx8_Fast_DSPR2(const uint8* src,
                             int src_stride,
                             uint8* dst,
                             int dst_stride,
                             int width) {
  __asm__ __volatile__(
      ".set noat                                         \n"
      ".set push                                         \n"
      ".set noreorder                                    \n"
      "beqz             %[width], 2f                     \n"
      " sll             $t2, %[src_stride], 0x1          \n"  // src_stride x 2
      "sll              $t4, %[src_stride], 0x2          \n"  // src_stride x 4
      "sll              $t9, %[src_stride], 0x3          \n"  // src_stride x 8
      "addu             $t3, $t2, %[src_stride]          \n"
      "addu             $t5, $t4, %[src_stride]          \n"
      "addu             $t6, $t2, $t4                    \n"

      "srl              $AT, %[width], 0x2               \n"
      "andi             $t0, %[dst], 0x3                 \n"
      "andi             $t1, %[dst_stride], 0x3          \n"
      "or               $t0, $t0, $t1                    \n"
      "bnez             $t0, 11f                         \n"
      " subu            $t7, $t9, %[src_stride]          \n"
      // dst + dst_stride word aligned
      "1:                                                \n"
      "lw               $t0, 0(%[src])                   \n"
      "lwx              $t1, %[src_stride](%[src])       \n"
      "lwx              $t8, $t2(%[src])                 \n"
      "lwx              $t9, $t3(%[src])                 \n"

      // t0 = | 30 | 20 | 10 | 00 |
      // t1 = | 31 | 21 | 11 | 01 |
      // t8 = | 32 | 22 | 12 | 02 |
      // t9 = | 33 | 23 | 13 | 03 |

      "precr.qb.ph     $s0, $t1, $t0                     \n"
      "precr.qb.ph     $s1, $t9, $t8                     \n"
      "precrq.qb.ph    $s2, $t1, $t0                     \n"
      "precrq.qb.ph    $s3, $t9, $t8                     \n"

      // s0 = | 21 | 01 | 20 | 00 |
      // s1 = | 23 | 03 | 22 | 02 |
      // s2 = | 31 | 11 | 30 | 10 |
      // s3 = | 33 | 13 | 32 | 12 |

      "precr.qb.ph     $s4, $s1, $s0                     \n"
      "precrq.qb.ph    $s5, $s1, $s0                     \n"
      "precr.qb.ph     $s6, $s3, $s2                     \n"
      "precrq.qb.ph    $s7, $s3, $s2                     \n"

      // s4 = | 03 | 02 | 01 | 00 |
      // s5 = | 23 | 22 | 21 | 20 |
      // s6 = | 13 | 12 | 11 | 10 |
      // s7 = | 33 | 32 | 31 | 30 |

      "lwx              $t0, $t4(%[src])                 \n"
      "lwx              $t1, $t5(%[src])                 \n"
      "lwx              $t8, $t6(%[src])                 \n"
      "lwx              $t9, $t7(%[src])                 \n"

      // t0 = | 34 | 24 | 14 | 04 |
      // t1 = | 35 | 25 | 15 | 05 |
      // t8 = | 36 | 26 | 16 | 06 |
      // t9 = | 37 | 27 | 17 | 07 |

      "precr.qb.ph     $s0, $t1, $t0                     \n"
      "precr.qb.ph     $s1, $t9, $t8                     \n"
      "precrq.qb.ph    $s2, $t1, $t0                     \n"
      "precrq.qb.ph    $s3, $t9, $t8                     \n"

      // s0 = | 25 | 05 | 24 | 04 |
      // s1 = | 27 | 07 | 26 | 06 |
      // s2 = | 35 | 15 | 34 | 14 |
      // s3 = | 37 | 17 | 36 | 16 |

      "precr.qb.ph     $t0, $s1, $s0                     \n"
      "precrq.qb.ph    $t1, $s1, $s0                     \n"
      "precr.qb.ph     $t8, $s3, $s2                     \n"
      "precrq.qb.ph    $t9, $s3, $s2                     \n"

      // t0 = | 07 | 06 | 05 | 04 |
      // t1 = | 27 | 26 | 25 | 24 |
      // t8 = | 17 | 16 | 15 | 14 |
      // t9 = | 37 | 36 | 35 | 34 |

      "addu            $s0, %[dst], %[dst_stride]        \n"
      "addu            $s1, $s0, %[dst_stride]           \n"
      "addu            $s2, $s1, %[dst_stride]           \n"

      "sw              $s4, 0(%[dst])                    \n"
      "sw              $t0, 4(%[dst])                    \n"
      "sw              $s6, 0($s0)                       \n"
      "sw              $t8, 4($s0)                       \n"
      "sw              $s5, 0($s1)                       \n"
      "sw              $t1, 4($s1)                       \n"
      "sw              $s7, 0($s2)                       \n"
      "sw              $t9, 4($s2)                       \n"

      "addiu            $AT, -1                          \n"
      "addiu            %[src], 4                        \n"

      "bnez             $AT, 1b                          \n"
      " addu            %[dst], $s2, %[dst_stride]       \n"
      "b                2f                               \n"
      // dst + dst_stride unaligned
      "11:                                               \n"
      "lw               $t0, 0(%[src])                   \n"
      "lwx              $t1, %[src_stride](%[src])       \n"
      "lwx              $t8, $t2(%[src])                 \n"
      "lwx              $t9, $t3(%[src])                 \n"

      // t0 = | 30 | 20 | 10 | 00 |
      // t1 = | 31 | 21 | 11 | 01 |
      // t8 = | 32 | 22 | 12 | 02 |
      // t9 = | 33 | 23 | 13 | 03 |

      "precr.qb.ph     $s0, $t1, $t0                     \n"
      "precr.qb.ph     $s1, $t9, $t8                     \n"
      "precrq.qb.ph    $s2, $t1, $t0                     \n"
      "precrq.qb.ph    $s3, $t9, $t8                     \n"

      // s0 = | 21 | 01 | 20 | 00 |
      // s1 = | 23 | 03 | 22 | 02 |
      // s2 = | 31 | 11 | 30 | 10 |
      // s3 = | 33 | 13 | 32 | 12 |

      "precr.qb.ph     $s4, $s1, $s0                     \n"
      "precrq.qb.ph    $s5, $s1, $s0                     \n"
      "precr.qb.ph     $s6, $s3, $s2                     \n"
      "precrq.qb.ph    $s7, $s3, $s2                     \n"

      // s4 = | 03 | 02 | 01 | 00 |
      // s5 = | 23 | 22 | 21 | 20 |
      // s6 = | 13 | 12 | 11 | 10 |
      // s7 = | 33 | 32 | 31 | 30 |

      "lwx              $t0, $t4(%[src])                 \n"
      "lwx              $t1, $t5(%[src])                 \n"
      "lwx              $t8, $t6(%[src])                 \n"
      "lwx              $t9, $t7(%[src])                 \n"

      // t0 = | 34 | 24 | 14 | 04 |
      // t1 = | 35 | 25 | 15 | 05 |
      // t8 = | 36 | 26 | 16 | 06 |
      // t9 = | 37 | 27 | 17 | 07 |

      "precr.qb.ph     $s0, $t1, $t0                     \n"
      "precr.qb.ph     $s1, $t9, $t8                     \n"
      "precrq.qb.ph    $s2, $t1, $t0                     \n"
      "precrq.qb.ph    $s3, $t9, $t8                     \n"

      // s0 = | 25 | 05 | 24 | 04 |
      // s1 = | 27 | 07 | 26 | 06 |
      // s2 = | 35 | 15 | 34 | 14 |
      // s3 = | 37 | 17 | 36 | 16 |

      "precr.qb.ph     $t0, $s1, $s0                     \n"
      "precrq.qb.ph    $t1, $s1, $s0                     \n"
      "precr.qb.ph     $t8, $s3, $s2                     \n"
      "precrq.qb.ph    $t9, $s3, $s2                     \n"

      // t0 = | 07 | 06 | 05 | 04 |
      // t1 = | 27 | 26 | 25 | 24 |
      // t8 = | 17 | 16 | 15 | 14 |
      // t9 = | 37 | 36 | 35 | 34 |

      "addu            $s0, %[dst], %[dst_stride]        \n"
      "addu            $s1, $s0, %[dst_stride]           \n"
      "addu            $s2, $s1, %[dst_stride]           \n"

      "swr              $s4, 0(%[dst])                   \n"
      "swl              $s4, 3(%[dst])                   \n"
      "swr              $t0, 4(%[dst])                   \n"
      "swl              $t0, 7(%[dst])                   \n"
      "swr              $s6, 0($s0)                      \n"
      "swl              $s6, 3($s0)                      \n"
      "swr              $t8, 4($s0)                      \n"
      "swl              $t8, 7($s0)                      \n"
      "swr              $s5, 0($s1)                      \n"
      "swl              $s5, 3($s1)                      \n"
      "swr              $t1, 4($s1)                      \n"
      "swl              $t1, 7($s1)                      \n"
      "swr              $s7, 0($s2)                      \n"
      "swl              $s7, 3($s2)                      \n"
      "swr              $t9, 4($s2)                      \n"
      "swl              $t9, 7($s2)                      \n"

      "addiu            $AT, -1                          \n"
      "addiu            %[src], 4                        \n"

      "bnez             $AT, 11b                         \n"
      " addu            %[dst], $s2, %[dst_stride]       \n"
      "2:                                                \n"
      ".set pop                                          \n"
      ".set at                                           \n"
      : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
      : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
        "s2", "s3", "s4", "s5", "s6", "s7");
}

void TransposeUVWx8_DSPR2(const uint8* src,
                          int src_stride,
                          uint8* dst_a,
                          int dst_stride_a,
                          uint8* dst_b,
                          int dst_stride_b,
                          int width) {
  __asm__ __volatile__(
      ".set push                                         \n"
      ".set noreorder                                    \n"
      "beqz            %[width], 2f                      \n"
      " sll            $t2, %[src_stride], 0x1           \n"  // src_stride x 2
      "sll             $t4, %[src_stride], 0x2           \n"  // src_stride x 4
      "sll             $t9, %[src_stride], 0x3           \n"  // src_stride x 8
      "addu            $t3, $t2, %[src_stride]           \n"
      "addu            $t5, $t4, %[src_stride]           \n"
      "addu            $t6, $t2, $t4                     \n"
      "subu            $t7, $t9, %[src_stride]           \n"
      "srl             $t1, %[width], 1                  \n"

      // check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
      "andi            $t0, %[dst_a], 0x3                \n"
      "andi            $t8, %[dst_b], 0x3                \n"
      "or              $t0, $t0, $t8                     \n"
      "andi            $t8, %[dst_stride_a], 0x3         \n"
      "andi            $s5, %[dst_stride_b], 0x3         \n"
      "or              $t8, $t8, $s5                     \n"
      "or              $t0, $t0, $t8                     \n"
      "bnez            $t0, 11f                          \n"
      " nop                                              \n"
      // dst + dst_stride word aligned (both a and b dst addresses)
      "1:                                                \n"
      "lw              $t0, 0(%[src])                    \n"  // |B0|A0|b0|a0|
      "lwx             $t8, %[src_stride](%[src])        \n"  // |B1|A1|b1|a1|
      "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
      "lwx             $t9, $t2(%[src])                  \n"  // |B2|A2|b2|a2|
      "lwx             $s0, $t3(%[src])                  \n"  // |B3|A3|b3|a3|
      "addu            $s6, %[dst_b], %[dst_stride_b]    \n"

      "precrq.ph.w     $s1, $t8, $t0                     \n"  // |B1|A1|B0|A0|
      "precrq.ph.w     $s2, $s0, $t9                     \n"  // |B3|A3|B2|A2|
      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |A3|A2|A1|A0|
      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |B3|B2|B1|B0|

      "sll             $t0, $t0, 16                      \n"
      "packrl.ph       $s1, $t8, $t0                     \n"  // |b1|a1|b0|a0|
      "sll             $t9, $t9, 16                      \n"
      "packrl.ph       $s2, $s0, $t9                     \n"  // |b3|a3|b2|a2|

      "sw              $s3, 0($s5)                       \n"
      "sw              $s4, 0($s6)                       \n"

      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |a3|a2|a1|a0|
      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |b3|b2|b1|b0|

      "lwx             $t0, $t4(%[src])                  \n"  // |B4|A4|b4|a4|
      "lwx             $t8, $t5(%[src])                  \n"  // |B5|A5|b5|a5|
      "lwx             $t9, $t6(%[src])                  \n"  // |B6|A6|b6|a6|
      "lwx             $s0, $t7(%[src])                  \n"  // |B7|A7|b7|a7|
      "sw              $s3, 0(%[dst_a])                  \n"
      "sw              $s4, 0(%[dst_b])                  \n"

      "precrq.ph.w     $s1, $t8, $t0                     \n"  // |B5|A5|B4|A4|
      "precrq.ph.w     $s2, $s0, $t9                     \n"  // |B7|A7|B6|A6|
      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |A7|A6|A5|A4|
      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |B7|B6|B5|B4|

      "sll             $t0, $t0, 16                      \n"
      "packrl.ph       $s1, $t8, $t0                     \n"  // |b5|a5|b4|a4|
      "sll             $t9, $t9, 16                      \n"
      "packrl.ph       $s2, $s0, $t9                     \n"  // |b7|a7|b6|a6|
      "sw              $s3, 4($s5)                       \n"
      "sw              $s4, 4($s6)                       \n"

      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |a7|a6|a5|a4|
      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |b7|b6|b5|b4|

      "addiu           %[src], 4                         \n"
      "addiu           $t1, -1                           \n"
      "sll             $t0, %[dst_stride_a], 1           \n"
      "sll             $t8, %[dst_stride_b], 1           \n"
      "sw              $s3, 4(%[dst_a])                  \n"
      "sw              $s4, 4(%[dst_b])                  \n"
      "addu            %[dst_a], %[dst_a], $t0           \n"
      "bnez            $t1, 1b                           \n"
      " addu           %[dst_b], %[dst_b], $t8           \n"
      "b               2f                                \n"
      " nop                                              \n"

      // dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
      "11:                                               \n"
      "lw              $t0, 0(%[src])                    \n"  // |B0|A0|b0|a0|
      "lwx             $t8, %[src_stride](%[src])        \n"  // |B1|A1|b1|a1|
      "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
      "lwx             $t9, $t2(%[src])                  \n"  // |B2|A2|b2|a2|
      "lwx             $s0, $t3(%[src])                  \n"  // |B3|A3|b3|a3|
      "addu            $s6, %[dst_b], %[dst_stride_b]    \n"

      "precrq.ph.w     $s1, $t8, $t0                     \n"  // |B1|A1|B0|A0|
      "precrq.ph.w     $s2, $s0, $t9                     \n"  // |B3|A3|B2|A2|
      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |A3|A2|A1|A0|
      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |B3|B2|B1|B0|

      "sll             $t0, $t0, 16                      \n"
      "packrl.ph       $s1, $t8, $t0                     \n"  // |b1|a1|b0|a0|
      "sll             $t9, $t9, 16                      \n"
      "packrl.ph       $s2, $s0, $t9                     \n"  // |b3|a3|b2|a2|

      "swr             $s3, 0($s5)                       \n"
      "swl             $s3, 3($s5)                       \n"
      "swr             $s4, 0($s6)                       \n"
      "swl             $s4, 3($s6)                       \n"

      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |a3|a2|a1|a0|
      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |b3|b2|b1|b0|

      "lwx             $t0, $t4(%[src])                  \n"  // |B4|A4|b4|a4|
      "lwx             $t8, $t5(%[src])                  \n"  // |B5|A5|b5|a5|
      "lwx             $t9, $t6(%[src])                  \n"  // |B6|A6|b6|a6|
      "lwx             $s0, $t7(%[src])                  \n"  // |B7|A7|b7|a7|
      "swr             $s3, 0(%[dst_a])                  \n"
      "swl             $s3, 3(%[dst_a])                  \n"
      "swr             $s4, 0(%[dst_b])                  \n"
      "swl             $s4, 3(%[dst_b])                  \n"

      "precrq.ph.w     $s1, $t8, $t0                     \n"  // |B5|A5|B4|A4|
      "precrq.ph.w     $s2, $s0, $t9                     \n"  // |B7|A7|B6|A6|
      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |A7|A6|A5|A4|
      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |B7|B6|B5|B4|

      "sll             $t0, $t0, 16                      \n"
      "packrl.ph       $s1, $t8, $t0                     \n"  // |b5|a5|b4|a4|
      "sll             $t9, $t9, 16                      \n"
      "packrl.ph       $s2, $s0, $t9                     \n"  // |b7|a7|b6|a6|

      "swr             $s3, 4($s5)                       \n"
      "swl             $s3, 7($s5)                       \n"
      "swr             $s4, 4($s6)                       \n"
      "swl             $s4, 7($s6)                       \n"

      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |a7|a6|a5|a4|
      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |b7|b6|b5|b4|

      "addiu           %[src], 4                         \n"
      "addiu           $t1, -1                           \n"
      "sll             $t0, %[dst_stride_a], 1           \n"
      "sll             $t8, %[dst_stride_b], 1           \n"
      "swr             $s3, 4(%[dst_a])                  \n"
      "swl             $s3, 7(%[dst_a])                  \n"
      "swr             $s4, 4(%[dst_b])                  \n"
      "swl             $s4, 7(%[dst_b])                  \n"
      "addu            %[dst_a], %[dst_a], $t0           \n"
      "bnez            $t1, 11b                          \n"
      " addu           %[dst_b], %[dst_b], $t8           \n"

      "2:                                                \n"
      ".set pop                                          \n"
      : [src] "+r"(src), [dst_a] "+r"(dst_a), [dst_b] "+r"(dst_b),
        [width] "+r"(width), [src_stride] "+r"(src_stride)
      : [dst_stride_a] "r"(dst_stride_a), [dst_stride_b] "r"(dst_stride_b)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
        "s2", "s3", "s4", "s5", "s6");
}
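
// A minimal scalar sketch of the interleaved transpose above, kept as
// documentation only: source rows hold byte pairs |b|a| (e.g. interleaved
// U/V), and column pair i is split so the 'a' bytes become row i of dst_a
// and the 'b' bytes become row i of dst_b; width counts pairs.
// TransposeUVWx8_Ref is a hypothetical name used for illustration; the
// portable routine that plays this role in libyuv is TransposeUVWx8_C.
static __attribute__((unused)) void TransposeUVWx8_Ref(const uint8* src,
                                                       int src_stride,
                                                       uint8* dst_a,
                                                       int dst_stride_a,
                                                       uint8* dst_b,
                                                       int dst_stride_b,
                                                       int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 8; ++j) {
      dst_a[j] = src[j * src_stride + i * 2 + 0];  // first byte of pair i
      dst_b[j] = src[j * src_stride + i * 2 + 1];  // second byte of pair i
    }
    dst_a += dst_stride_a;
    dst_b += dst_stride_b;
  }
}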

#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif