• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/basic_types.h"
12 #include "libyuv/row.h"
13 
14 #ifdef __cplusplus
15 namespace libyuv {
16 extern "C" {
17 #endif
18 
19 // This module is for GCC MIPS DSPR2
20 #if !defined(LIBYUV_DISABLE_MIPS) && \
21     defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
22     (_MIPS_SIM == _MIPS_SIM_ABI32)
23 
// Point-sampled 2:1 horizontal downscale of a single row.
//
// Writes dst_width bytes to dst, each copied from every second byte of the
// row at src_ptr (the lane comments below show lanes 0, 2, 4, ... kept).
// src_stride is not referenced; only one source row is read.
//
// The main loop produces 16 output bytes per pass from 32 source bytes using
// word loads and stores; the remaining (dst_width & 15) bytes are copied one
// at a time with a source step of 2.
// NOTE(review): lw/sw presumably require 4-byte-aligned src_ptr and dst --
// confirm that the caller guarantees this.
ScaleRowDown2_DSPR2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst,int dst_width)24 void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
25                          uint8* dst, int dst_width) {
26   __asm__ __volatile__(
27     ".set push                                     \n"
28     ".set noreorder                                \n"
29 
    // $t9 = number of full 16-byte output groups; skip the main loop if none.
30     "srl            $t9, %[dst_width], 4           \n"  // iterations -> by 16
31     "beqz           $t9, 2f                        \n"
32     " nop                                          \n"
33 
    // Main loop: 32 source bytes in, 16 destination bytes out.
34   "1:                                              \n"
35     "lw             $t0, 0(%[src_ptr])             \n"  // |3|2|1|0|
36     "lw             $t1, 4(%[src_ptr])             \n"  // |7|6|5|4|
37     "lw             $t2, 8(%[src_ptr])             \n"  // |11|10|9|8|
38     "lw             $t3, 12(%[src_ptr])            \n"  // |15|14|13|12|
39     "lw             $t4, 16(%[src_ptr])            \n"  // |19|18|17|16|
40     "lw             $t5, 20(%[src_ptr])            \n"  // |23|22|21|20|
41     "lw             $t6, 24(%[src_ptr])            \n"  // |27|26|25|24|
42     "lw             $t7, 28(%[src_ptr])            \n"  // |31|30|29|28|
43     // TODO(fbarchard): Use odd pixels instead of even.
44     "precr.qb.ph    $t8, $t1, $t0                  \n"  // |6|4|2|0|
45     "precr.qb.ph    $t0, $t3, $t2                  \n"  // |14|12|10|8|
46     "precr.qb.ph    $t1, $t5, $t4                  \n"  // |22|20|18|16|
47     "precr.qb.ph    $t2, $t7, $t6                  \n"  // |30|28|26|24|
48     "addiu          %[src_ptr], %[src_ptr], 32     \n"
49     "addiu          $t9, $t9, -1                   \n"
50     "sw             $t8, 0(%[dst])                 \n"
51     "sw             $t0, 4(%[dst])                 \n"
52     "sw             $t1, 8(%[dst])                 \n"
53     "sw             $t2, 12(%[dst])                \n"
    // The dst advance sits in the branch delay slot.
54     "bgtz           $t9, 1b                        \n"
55     " addiu         %[dst], %[dst], 16             \n"
56 
    // Tail: handle the dst_width & 15 bytes the main loop did not cover.
57   "2:                                              \n"
58     "andi           $t9, %[dst_width], 0xf         \n"  // residue
59     "beqz           $t9, 3f                        \n"
60     " nop                                          \n"
61 
    // One output byte per pass, taken from every second source byte.
62   "21:                                             \n"
63     "lbu            $t0, 0(%[src_ptr])             \n"
64     "addiu          %[src_ptr], %[src_ptr], 2      \n"
65     "addiu          $t9, $t9, -1                   \n"
66     "sb             $t0, 0(%[dst])                 \n"
67     "bgtz           $t9, 21b                       \n"
68     " addiu         %[dst], %[dst], 1              \n"
69 
70   "3:                                              \n"
71     ".set pop                                      \n"
  // src_ptr and dst are read-write operands because the asm advances them.
72   : [src_ptr] "+r" (src_ptr),
73     [dst] "+r" (dst)
74   : [dst_width] "r" (dst_width)
75   : "t0", "t1", "t2", "t3", "t4", "t5",
76     "t6", "t7", "t8", "t9"
77   );
78 }
79 
ScaleRowDown2Box_DSPR2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst,int dst_width)80 void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
81                             uint8* dst, int dst_width) {
82   const uint8* t = src_ptr + src_stride;
83 
84   __asm__ __volatile__ (
85     ".set push                                    \n"
86     ".set noreorder                               \n"
87 
88     "srl            $t9, %[dst_width], 3          \n"  // iterations -> step 8
89     "bltz           $t9, 2f                       \n"
90     " nop                                         \n"
91 
92   "1:                                             \n"
93     "lw             $t0, 0(%[src_ptr])            \n"  // |3|2|1|0|
94     "lw             $t1, 4(%[src_ptr])            \n"  // |7|6|5|4|
95     "lw             $t2, 8(%[src_ptr])            \n"  // |11|10|9|8|
96     "lw             $t3, 12(%[src_ptr])           \n"  // |15|14|13|12|
97     "lw             $t4, 0(%[t])                  \n"  // |19|18|17|16|
98     "lw             $t5, 4(%[t])                  \n"  // |23|22|21|20|
99     "lw             $t6, 8(%[t])                  \n"  // |27|26|25|24|
100     "lw             $t7, 12(%[t])                 \n"  // |31|30|29|28|
101     "addiu          $t9, $t9, -1                  \n"
102     "srl            $t8, $t0, 16                  \n"  // |X|X|3|2|
103     "ins            $t0, $t4, 16, 16              \n"  // |17|16|1|0|
104     "ins            $t4, $t8, 0, 16               \n"  // |19|18|3|2|
105     "raddu.w.qb     $t0, $t0                      \n"  // |17+16+1+0|
106     "raddu.w.qb     $t4, $t4                      \n"  // |19+18+3+2|
107     "shra_r.w       $t0, $t0, 2                   \n"  // |t0+2|>>2
108     "shra_r.w       $t4, $t4, 2                   \n"  // |t4+2|>>2
109     "srl            $t8, $t1, 16                  \n"  // |X|X|7|6|
110     "ins            $t1, $t5, 16, 16              \n"  // |21|20|5|4|
111     "ins            $t5, $t8, 0, 16               \n"  // |22|23|7|6|
112     "raddu.w.qb     $t1, $t1                      \n"  // |21+20+5+4|
113     "raddu.w.qb     $t5, $t5                      \n"  // |23+22+7+6|
114     "shra_r.w       $t1, $t1, 2                   \n"  // |t1+2|>>2
115     "shra_r.w       $t5, $t5, 2                   \n"  // |t5+2|>>2
116     "srl            $t8, $t2, 16                  \n"  // |X|X|11|10|
117     "ins            $t2, $t6, 16, 16              \n"  // |25|24|9|8|
118     "ins            $t6, $t8, 0, 16               \n"  // |27|26|11|10|
119     "raddu.w.qb     $t2, $t2                      \n"  // |25+24+9+8|
120     "raddu.w.qb     $t6, $t6                      \n"  // |27+26+11+10|
121     "shra_r.w       $t2, $t2, 2                   \n"  // |t2+2|>>2
122     "shra_r.w       $t6, $t6, 2                   \n"  // |t5+2|>>2
123     "srl            $t8, $t3, 16                  \n"  // |X|X|15|14|
124     "ins            $t3, $t7, 16, 16              \n"  // |29|28|13|12|
125     "ins            $t7, $t8, 0, 16               \n"  // |31|30|15|14|
126     "raddu.w.qb     $t3, $t3                      \n"  // |29+28+13+12|
127     "raddu.w.qb     $t7, $t7                      \n"  // |31+30+15+14|
128     "shra_r.w       $t3, $t3, 2                   \n"  // |t3+2|>>2
129     "shra_r.w       $t7, $t7, 2                   \n"  // |t7+2|>>2
130     "addiu          %[src_ptr], %[src_ptr], 16    \n"
131     "addiu          %[t], %[t], 16                \n"
132     "sb             $t0, 0(%[dst])                \n"
133     "sb             $t4, 1(%[dst])                \n"
134     "sb             $t1, 2(%[dst])                \n"
135     "sb             $t5, 3(%[dst])                \n"
136     "sb             $t2, 4(%[dst])                \n"
137     "sb             $t6, 5(%[dst])                \n"
138     "sb             $t3, 6(%[dst])                \n"
139     "sb             $t7, 7(%[dst])                \n"
140     "bgtz           $t9, 1b                       \n"
141     " addiu         %[dst], %[dst], 8             \n"
142 
143   "2:                                             \n"
144     "andi           $t9, %[dst_width], 0x7        \n"  // x = residue
145     "beqz           $t9, 3f                       \n"
146     " nop                                         \n"
147 
148     "21:                                          \n"
149     "lwr            $t1, 0(%[src_ptr])            \n"
150     "lwl            $t1, 3(%[src_ptr])            \n"
151     "lwr            $t2, 0(%[t])                  \n"
152     "lwl            $t2, 3(%[t])                  \n"
153     "srl            $t8, $t1, 16                  \n"
154     "ins            $t1, $t2, 16, 16              \n"
155     "ins            $t2, $t8, 0, 16               \n"
156     "raddu.w.qb     $t1, $t1                      \n"
157     "raddu.w.qb     $t2, $t2                      \n"
158     "shra_r.w       $t1, $t1, 2                   \n"
159     "shra_r.w       $t2, $t2, 2                   \n"
160     "sb             $t1, 0(%[dst])                \n"
161     "sb             $t2, 1(%[dst])                \n"
162     "addiu          %[src_ptr], %[src_ptr], 4     \n"
163     "addiu          $t9, $t9, -2                  \n"
164     "addiu          %[t], %[t], 4                 \n"
165     "bgtz           $t9, 21b                      \n"
166     " addiu         %[dst], %[dst], 2             \n"
167 
168   "3:                                             \n"
169     ".set pop                                     \n"
170 
171   : [src_ptr] "+r" (src_ptr),
172     [dst] "+r" (dst), [t] "+r" (t)
173   : [dst_width] "r" (dst_width)
174   : "t0", "t1", "t2", "t3", "t4", "t5",
175     "t6", "t7", "t8", "t9"
176   );
177 }
178 
// Point-sampled 4:1 horizontal downscale of a single row.
//
// Writes dst_width bytes to dst, each copied from every fourth byte of the
// row at src_ptr (the lane comments below show lanes 0, 4, 8, ... kept).
// src_stride is not referenced; only one source row is read.
//
// The main loop produces 8 output bytes per pass from 32 source bytes; the
// remaining (dst_width & 7) bytes are copied one at a time with a source
// step of 4.
// NOTE(review): lw/sw presumably require 4-byte-aligned src_ptr and dst --
// confirm that the caller guarantees this.
ScaleRowDown4_DSPR2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst,int dst_width)179 void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
180                          uint8* dst, int dst_width) {
181   __asm__ __volatile__ (
182       ".set push                                    \n"
183       ".set noreorder                               \n"
184 
      // $t9 = number of full 8-byte output groups; skip the main loop if none.
185       "srl            $t9, %[dst_width], 3          \n"
186       "beqz           $t9, 2f                       \n"
187       " nop                                         \n"
188 
      // Main loop: 32 source bytes in, 8 destination bytes out.
189      "1:                                            \n"
190       "lw             $t1, 0(%[src_ptr])            \n"  // |3|2|1|0|
191       "lw             $t2, 4(%[src_ptr])            \n"  // |7|6|5|4|
192       "lw             $t3, 8(%[src_ptr])            \n"  // |11|10|9|8|
193       "lw             $t4, 12(%[src_ptr])           \n"  // |15|14|13|12|
194       "lw             $t5, 16(%[src_ptr])           \n"  // |19|18|17|16|
195       "lw             $t6, 20(%[src_ptr])           \n"  // |23|22|21|20|
196       "lw             $t7, 24(%[src_ptr])           \n"  // |27|26|25|24|
197       "lw             $t8, 28(%[src_ptr])           \n"  // |31|30|29|28|
      // Two rounds of precr.qb.ph: the first keeps every second lane, the
      // second keeps every second lane of that result.
198       "precr.qb.ph    $t1, $t2, $t1                 \n"  // |6|4|2|0|
199       "precr.qb.ph    $t2, $t4, $t3                 \n"  // |14|12|10|8|
200       "precr.qb.ph    $t5, $t6, $t5                 \n"  // |22|20|18|16|
201       "precr.qb.ph    $t6, $t8, $t7                 \n"  // |30|28|26|24|
202       "precr.qb.ph    $t1, $t2, $t1                 \n"  // |12|8|4|0|
203       "precr.qb.ph    $t5, $t6, $t5                 \n"  // |28|24|20|16|
204       "addiu          %[src_ptr], %[src_ptr], 32    \n"
205       "addiu          $t9, $t9, -1                  \n"
206       "sw             $t1, 0(%[dst])                \n"
207       "sw             $t5, 4(%[dst])                \n"
      // The dst advance sits in the branch delay slot.
208       "bgtz           $t9, 1b                       \n"
209       " addiu         %[dst], %[dst], 8             \n"
210 
      // Tail: handle the dst_width & 7 bytes the main loop did not cover.
211     "2:                                             \n"
212       "andi           $t9, %[dst_width], 7          \n"  // residue
213       "beqz           $t9, 3f                       \n"
214       " nop                                         \n"
215 
      // One output byte per pass, taken from every fourth source byte.
216     "21:                                            \n"
217       "lbu            $t1, 0(%[src_ptr])            \n"
218       "addiu          %[src_ptr], %[src_ptr], 4     \n"
219       "addiu          $t9, $t9, -1                  \n"
220       "sb             $t1, 0(%[dst])                \n"
221       "bgtz           $t9, 21b                      \n"
222       " addiu         %[dst], %[dst], 1             \n"
223 
224     "3:                                             \n"
225       ".set pop                                     \n"
      // src_ptr and dst are read-write operands because the asm advances them.
226       : [src_ptr] "+r" (src_ptr),
227         [dst] "+r" (dst)
228       : [dst_width] "r" (dst_width)
229       : "t1", "t2", "t3", "t4", "t5",
230         "t6", "t7", "t8", "t9"
231   );
232 }
233 
ScaleRowDown4Box_DSPR2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst,int dst_width)234 void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
235                             uint8* dst, int dst_width) {
236   intptr_t stride = src_stride;
237   const uint8* s1 = src_ptr + stride;
238   const uint8* s2 = s1 + stride;
239   const uint8* s3 = s2 + stride;
240 
241   __asm__ __volatile__ (
242       ".set push                                  \n"
243       ".set noreorder                             \n"
244 
245       "srl           $t9, %[dst_width], 1         \n"
246       "andi          $t8, %[dst_width], 1         \n"
247 
248      "1:                                          \n"
249       "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
250       "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
251       "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
252       "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
253       "lw            $t4, 4(%[src_ptr])           \n"  // |19|18|17|16|
254       "lw            $t5, 4(%[s1])                \n"  // |23|22|21|20|
255       "lw            $t6, 4(%[s2])                \n"  // |27|26|25|24|
256       "lw            $t7, 4(%[s3])                \n"  // |31|30|29|28|
257       "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
258       "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
259       "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
260       "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
261       "raddu.w.qb    $t4, $t4                     \n"  // |19 + 18 + 17 + 16|
262       "raddu.w.qb    $t5, $t5                     \n"  // |23 + 22 + 21 + 20|
263       "raddu.w.qb    $t6, $t6                     \n"  // |27 + 26 + 25 + 24|
264       "raddu.w.qb    $t7, $t7                     \n"  // |31 + 30 + 29 + 28|
265       "add           $t0, $t0, $t1                \n"
266       "add           $t1, $t2, $t3                \n"
267       "add           $t0, $t0, $t1                \n"
268       "add           $t4, $t4, $t5                \n"
269       "add           $t6, $t6, $t7                \n"
270       "add           $t4, $t4, $t6                \n"
271       "shra_r.w      $t0, $t0, 4                  \n"
272       "shra_r.w      $t4, $t4, 4                  \n"
273       "sb            $t0, 0(%[dst])               \n"
274       "sb            $t4, 1(%[dst])               \n"
275       "addiu         %[src_ptr], %[src_ptr], 8    \n"
276       "addiu         %[s1], %[s1], 8              \n"
277       "addiu         %[s2], %[s2], 8              \n"
278       "addiu         %[s3], %[s3], 8              \n"
279       "addiu         $t9, $t9, -1                 \n"
280       "bgtz          $t9, 1b                      \n"
281       " addiu        %[dst], %[dst], 2            \n"
282       "beqz          $t8, 2f                      \n"
283       " nop                                       \n"
284 
285       "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
286       "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
287       "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
288       "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
289       "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
290       "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
291       "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
292       "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
293       "add           $t0, $t0, $t1                \n"
294       "add           $t1, $t2, $t3                \n"
295       "add           $t0, $t0, $t1                \n"
296       "shra_r.w      $t0, $t0, 4                  \n"
297       "sb            $t0, 0(%[dst])               \n"
298 
299       "2:                                         \n"
300       ".set pop                                   \n"
301 
302       : [src_ptr] "+r" (src_ptr),
303         [dst] "+r" (dst),
304         [s1] "+r" (s1),
305         [s2] "+r" (s2),
306         [s3] "+r" (s3)
307       : [dst_width] "r" (dst_width)
308       : "t0", "t1", "t2", "t3", "t4", "t5",
309         "t6","t7", "t8", "t9"
310   );
311 }
312 
// Point-sampled 3/4 horizontal downscale of a single row.
//
// Each pass reads 32 source bytes and stores 24 destination bytes; per the
// lane comments below, lanes 0, 1 and 3 of every group of four source bytes
// are kept. src_stride is not referenced; only one source row is read.
//
// The loop body runs before any test and repeats until dst_width, reduced by
// 24 per pass, is exactly zero (bnez). dst_width therefore has to be a
// positive multiple of 24 for the loop to stop where intended.
// NOTE(review): lw/sw presumably require 4-byte-aligned src_ptr and dst --
// confirm that the caller guarantees this.
ScaleRowDown34_DSPR2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst,int dst_width)313 void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
314                           uint8* dst, int dst_width) {
315   __asm__ __volatile__ (
316       ".set push                                          \n"
317       ".set noreorder                                     \n"
318     "1:                                                   \n"
319       "lw              $t1, 0(%[src_ptr])                 \n"  // |3|2|1|0|
320       "lw              $t2, 4(%[src_ptr])                 \n"  // |7|6|5|4|
321       "lw              $t3, 8(%[src_ptr])                 \n"  // |11|10|9|8|
322       "lw              $t4, 12(%[src_ptr])                \n"  // |15|14|13|12|
323       "lw              $t5, 16(%[src_ptr])                \n"  // |19|18|17|16|
324       "lw              $t6, 20(%[src_ptr])                \n"  // |23|22|21|20|
325       "lw              $t7, 24(%[src_ptr])                \n"  // |27|26|25|24|
326       "lw              $t8, 28(%[src_ptr])                \n"  // |31|30|29|28|
      // Shuffle the kept lanes into six packed output words
      // ($t1, $t0, $t3, $t5, $t9, $t7 in store order).
327       "precrq.qb.ph    $t0, $t2, $t4                      \n"  // |7|5|15|13|
328       "precrq.qb.ph    $t9, $t6, $t8                      \n"  // |23|21|31|29|
329       "addiu           %[dst_width], %[dst_width], -24    \n"
330       "ins             $t1, $t1, 8, 16                    \n"  // |3|1|0|X|
331       "ins             $t4, $t0, 8, 16                    \n"  // |X|15|13|12|
332       "ins             $t5, $t5, 8, 16                    \n"  // |19|17|16|X|
333       "ins             $t8, $t9, 8, 16                    \n"  // |X|31|29|28|
334       "addiu           %[src_ptr], %[src_ptr], 32         \n"
335       "packrl.ph       $t0, $t3, $t0                      \n"  // |9|8|7|5|
336       "packrl.ph       $t9, $t7, $t9                      \n"  // |25|24|23|21|
337       "prepend         $t1, $t2, 8                        \n"  // |4|3|1|0|
338       "prepend         $t3, $t4, 24                       \n"  // |15|13|12|11|
339       "prepend         $t5, $t6, 8                        \n"  // |20|19|17|16|
340       "prepend         $t7, $t8, 24                       \n"  // |31|29|28|27|
341       "sw              $t1, 0(%[dst])                     \n"
342       "sw              $t0, 4(%[dst])                     \n"
343       "sw              $t3, 8(%[dst])                     \n"
344       "sw              $t5, 12(%[dst])                    \n"
345       "sw              $t9, 16(%[dst])                    \n"
346       "sw              $t7, 20(%[dst])                    \n"
      // Loop until the remaining count is exactly zero; the dst advance sits
      // in the branch delay slot.
347       "bnez            %[dst_width], 1b                   \n"
348       " addiu          %[dst], %[dst], 24                 \n"
349       ".set pop                                           \n"
      // dst_width is a read-write operand because the asm counts it down.
350       : [src_ptr] "+r" (src_ptr),
351         [dst] "+r" (dst),
352         [dst_width] "+r" (dst_width)
353       :
354       : "t0", "t1", "t2", "t3", "t4", "t5",
355         "t6","t7", "t8", "t9"
356   );
357 }
358 
// Filtered 3/4 horizontal downscale that blends two rows, weighting the row
// at src_ptr by 3 and the row at src_ptr + src_stride by 1.
//
// Each pass reads four bytes S0..S3 from the first row and T0..T3 from the
// second row and stores three bytes to d. Per row the horizontal filter is
//   a = (3 * X0 + X1 + 2) >> 2
//   b = (X1 + X2 + 1) >> 1
//   c = (X2 + 3 * X3 + 2) >> 2
// and each stored byte is (3 * s + t + 2) >> 2, where s and t are the
// filtered values of the first and second row respectively.
//
// The loop body runs before any test and repeats while dst_width, reduced by
// 3 per pass, is still positive, so at least three bytes are always stored.
// NOTE(review): lw presumably requires 4-byte-aligned src_ptr (and
// src_stride a multiple of 4 for lwx) -- confirm against the caller.
ScaleRowDown34_0_Box_DSPR2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * d,int dst_width)359 void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
360                                 uint8* d, int dst_width) {
361   __asm__ __volatile__ (
362       ".set push                                         \n"
363       ".set noreorder                                    \n"
      // $t3 holds the multiplier 3 in both halfwords for the whole loop.
364       "repl.ph           $t3, 3                          \n"  // 0x00030003
365 
366     "1:                                                  \n"
367       "lw                $t0, 0(%[src_ptr])              \n"  // |S3|S2|S1|S0|
368       "lwx               $t1, %[src_stride](%[src_ptr])  \n"  // |T3|T2|T1|T0|
369       "rotr              $t2, $t0, 8                     \n"  // |S0|S3|S2|S1|
370       "rotr              $t6, $t1, 8                     \n"  // |T0|T3|T2|T1|
371       "muleu_s.ph.qbl    $t4, $t2, $t3                   \n"  // |S0*3|S3*3|
372       "muleu_s.ph.qbl    $t5, $t6, $t3                   \n"  // |T0*3|T3*3|
373       "andi              $t0, $t2, 0xFFFF                \n"  // |0|0|S2|S1|
374       "andi              $t1, $t6, 0xFFFF                \n"  // |0|0|T2|T1|
      // Middle output, per row: $t0 = (S1+S2+1)>>1, $t1 = (T1+T2+1)>>1.
375       "raddu.w.qb        $t0, $t0                        \n"
376       "raddu.w.qb        $t1, $t1                        \n"
377       "shra_r.w          $t0, $t0, 1                     \n"
378       "shra_r.w          $t1, $t1, 1                     \n"
379       "preceu.ph.qbr     $t2, $t2                        \n"  // |0|S2|0|S1|
380       "preceu.ph.qbr     $t6, $t6                        \n"  // |0|T2|0|T1|
381       "rotr              $t2, $t2, 16                    \n"  // |0|S1|0|S2|
382       "rotr              $t6, $t6, 16                    \n"  // |0|T1|0|T2|
      // Outer outputs, per row: halfwords become X1+3*X0 and X2+3*X3.
383       "addu.ph           $t2, $t2, $t4                   \n"
384       "addu.ph           $t6, $t6, $t5                   \n"
      // $t0 = 3 * (first-row middle value).
385       "sll               $t5, $t0, 1                     \n"
386       "add               $t0, $t5, $t0                   \n"
387       "shra_r.ph         $t2, $t2, 2                     \n"
388       "shra_r.ph         $t6, $t6, 2                     \n"
      // $t4 = 3 * (first-row outer values), per halfword.
389       "shll.ph           $t4, $t2, 1                     \n"
390       "addq.ph           $t4, $t4, $t2                   \n"
      // Vertical blend: 3 * first row + second row, then rounding >> 2.
391       "addu              $t0, $t0, $t1                   \n"
392       "addiu             %[src_ptr], %[src_ptr], 4       \n"
393       "shra_r.w          $t0, $t0, 2                     \n"
394       "addu.ph           $t6, $t6, $t4                   \n"
395       "shra_r.ph         $t6, $t6, 2                     \n"
      // $t1 = upper halfword of $t6 (first output); the low byte of $t6 is
      // the third output.
396       "srl               $t1, $t6, 16                    \n"
397       "addiu             %[dst_width], %[dst_width], -3  \n"
398       "sb                $t1, 0(%[d])                    \n"
399       "sb                $t0, 1(%[d])                    \n"
400       "sb                $t6, 2(%[d])                    \n"
      // The d advance sits in the branch delay slot.
401       "bgtz              %[dst_width], 1b                \n"
402       " addiu            %[d], %[d], 3                   \n"
      // Label 3 is not referenced by any branch in this block.
403     "3:                                                  \n"
404       ".set pop                                          \n"
      // src_stride is listed as read-write although the asm only reads it.
405       : [src_ptr] "+r" (src_ptr),
406         [src_stride] "+r" (src_stride),
407         [d] "+r" (d),
408         [dst_width] "+r" (dst_width)
409       :
410       : "t0", "t1", "t2", "t3",
411         "t4", "t5", "t6"
412   );
413 }
414 
// Filtered 3/4 horizontal downscale that blends two rows with equal weight:
// the row at src_ptr and the row at src_ptr + src_stride.
//
// Each pass reads four bytes S0..S3 from the first row and T0..T3 from the
// second row and stores three bytes to d. Per row the horizontal filter is
//   a = (3 * X0 + X1 + 2) >> 2
//   b = (X1 + X2 + 1) >> 1
//   c = (X2 + 3 * X3 + 2) >> 2
// and each stored byte is (s + t + 1) >> 1, where s and t are the filtered
// values of the first and second row respectively.
//
// The loop body runs before any test and repeats while dst_width, reduced by
// 3 per pass, is still positive, so at least three bytes are always stored.
// NOTE(review): lw presumably requires 4-byte-aligned src_ptr (and
// src_stride a multiple of 4 for lwx) -- confirm against the caller.
ScaleRowDown34_1_Box_DSPR2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * d,int dst_width)415 void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
416                                 uint8* d, int dst_width) {
417   __asm__ __volatile__ (
418       ".set push                                           \n"
419       ".set noreorder                                      \n"
      // $t2 holds the multiplier 3 in both halfwords for the whole loop.
420       "repl.ph           $t2, 3                            \n"  // 0x00030003
421 
422     "1:                                                    \n"
423       "lw                $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
424       "lwx               $t1, %[src_stride](%[src_ptr])    \n"  // |T3|T2|T1|T0|
425       "rotr              $t4, $t0, 8                       \n"  // |S0|S3|S2|S1|
426       "rotr              $t6, $t1, 8                       \n"  // |T0|T3|T2|T1|
427       "muleu_s.ph.qbl    $t3, $t4, $t2                     \n"  // |S0*3|S3*3|
428       "muleu_s.ph.qbl    $t5, $t6, $t2                     \n"  // |T0*3|T3*3|
429       "andi              $t0, $t4, 0xFFFF                  \n"  // |0|0|S2|S1|
430       "andi              $t1, $t6, 0xFFFF                  \n"  // |0|0|T2|T1|
      // Middle output, per row: $t0 = (S1+S2+1)>>1, $t1 = (T1+T2+1)>>1.
431       "raddu.w.qb        $t0, $t0                          \n"
432       "raddu.w.qb        $t1, $t1                          \n"
433       "shra_r.w          $t0, $t0, 1                       \n"
434       "shra_r.w          $t1, $t1, 1                       \n"
435       "preceu.ph.qbr     $t4, $t4                          \n"  // |0|S2|0|S1|
436       "preceu.ph.qbr     $t6, $t6                          \n"  // |0|T2|0|T1|
437       "rotr              $t4, $t4, 16                      \n"  // |0|S1|0|S2|
438       "rotr              $t6, $t6, 16                      \n"  // |0|T1|0|T2|
      // Outer outputs, per row: halfwords become X1+3*X0 and X2+3*X3.
439       "addu.ph           $t4, $t4, $t3                     \n"
440       "addu.ph           $t6, $t6, $t5                     \n"
441       "shra_r.ph         $t6, $t6, 2                       \n"
442       "shra_r.ph         $t4, $t4, 2                       \n"
      // Vertical blend: first row + second row, then rounding >> 1.
443       "addu.ph           $t6, $t6, $t4                     \n"
444       "addiu             %[src_ptr], %[src_ptr], 4         \n"
445       "shra_r.ph         $t6, $t6, 1                       \n"
446       "addu              $t0, $t0, $t1                     \n"
447       "addiu             %[dst_width], %[dst_width], -3    \n"
448       "shra_r.w          $t0, $t0, 1                       \n"
      // $t1 = upper halfword of $t6 (first output); the low byte of $t6 is
      // the third output.
449       "srl               $t1, $t6, 16                      \n"
450       "sb                $t1, 0(%[d])                      \n"
451       "sb                $t0, 1(%[d])                      \n"
452       "sb                $t6, 2(%[d])                      \n"
      // The d advance sits in the branch delay slot.
453       "bgtz              %[dst_width], 1b                  \n"
454       " addiu            %[d], %[d], 3                     \n"
      // Label 3 is not referenced by any branch in this block.
455     "3:                                                    \n"
456       ".set pop                                            \n"
      // src_stride is listed as read-write although the asm only reads it.
457       : [src_ptr] "+r" (src_ptr),
458         [src_stride] "+r" (src_stride),
459         [d] "+r" (d),
460         [dst_width] "+r" (dst_width)
461       :
462       : "t0", "t1", "t2", "t3",
463         "t4", "t5", "t6"
464   );
465 }
466 
// Point-sampled 3/8 horizontal downscale of a single row.
//
// Each pass reads 32 source bytes and stores 12 destination bytes; per the
// lane comments below the kept lanes are 0, 3, 6, 8, 11, 14, 16, 19, 22, 24,
// 27 and 30. src_stride is not referenced; only one source row is read.
//
// The loop body runs before any test. After each pass dst_width is reduced
// by 12 and the loop repeats only while at least 12 more outputs remain, so
// a trailing dst_width % 12 outputs are not written by this function.
// NOTE(review): lw/sw presumably require 4-byte-aligned src_ptr and dst --
// confirm that the caller guarantees this.
ScaleRowDown38_DSPR2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst,int dst_width)467 void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
468                           uint8* dst, int dst_width) {
469   __asm__ __volatile__ (
470       ".set push                                     \n"
471       ".set noreorder                                \n"
472 
473     "1:                                              \n"
474       "lw         $t0, 0(%[src_ptr])                 \n"  // |3|2|1|0|
475       "lw         $t1, 4(%[src_ptr])                 \n"  // |7|6|5|4|
476       "lw         $t2, 8(%[src_ptr])                 \n"  // |11|10|9|8|
477       "lw         $t3, 12(%[src_ptr])                \n"  // |15|14|13|12|
478       "lw         $t4, 16(%[src_ptr])                \n"  // |19|18|17|16|
479       "lw         $t5, 20(%[src_ptr])                \n"  // |23|22|21|20|
480       "lw         $t6, 24(%[src_ptr])                \n"  // |27|26|25|24|
481       "lw         $t7, 28(%[src_ptr])                \n"  // |31|30|29|28|
      // Shuffle the kept lanes into three packed output words
      // ($t1, $t4, $t6 in store order).
482       "wsbh       $t0, $t0                           \n"  // |2|3|0|1|
483       "wsbh       $t6, $t6                           \n"  // |26|27|24|25|
484       "srl        $t0, $t0, 8                        \n"  // |X|2|3|0|
485       "srl        $t3, $t3, 16                       \n"  // |X|X|15|14|
486       "srl        $t5, $t5, 16                       \n"  // |X|X|23|22|
487       "srl        $t7, $t7, 16                       \n"  // |X|X|31|30|
488       "ins        $t1, $t2, 24, 8                    \n"  // |8|6|5|4|
489       "ins        $t6, $t5, 0, 8                     \n"  // |26|27|24|22|
490       "ins        $t1, $t0, 0, 16                    \n"  // |8|6|3|0|
491       "ins        $t6, $t7, 24, 8                    \n"  // |30|27|24|22|
492       "prepend    $t2, $t3, 24                       \n"  // |X|15|14|11|
493       "ins        $t4, $t4, 16, 8                    \n"  // |19|16|17|X|
494       "ins        $t4, $t2, 0, 16                    \n"  // |19|16|14|11|
495       "addiu      %[src_ptr], %[src_ptr], 32         \n"
      // $t8 = outputs remaining after the *next* pass; negative means fewer
      // than 12 are left, which ends the loop.
496       "addiu      %[dst_width], %[dst_width], -12    \n"
497       "addiu      $t8,%[dst_width], -12              \n"
498       "sw         $t1, 0(%[dst])                     \n"
499       "sw         $t4, 4(%[dst])                     \n"
500       "sw         $t6, 8(%[dst])                     \n"
      // The dst advance sits in the branch delay slot.
501       "bgez       $t8, 1b                            \n"
502       " addiu     %[dst], %[dst], 12                 \n"
503       ".set pop                                      \n"
      // dst_width is a read-write operand because the asm counts it down.
504       : [src_ptr] "+r" (src_ptr),
505         [dst] "+r" (dst),
506         [dst_width] "+r" (dst_width)
507       :
508       : "t0", "t1", "t2", "t3", "t4",
509         "t5", "t6", "t7", "t8"
510   );
511 }
512 
// Box-filtered 3/8 horizontal downscale that combines two rows: the row at
// src_ptr (S) and the row at src_ptr + src_stride (T).
//
// Each pass reads 8 bytes from each row and stores 3 bytes to dst_ptr:
//   dst[0] = ((S0+S1+S2 + T0+T1+T2) * 0x2AAA) >> 16
//   dst[1] = ((S3+S4+S5 + T3+T4+T5) * 0x2AAA) >> 16
//   dst[2] = (S6+S7 + T6+T7) >> 2
// 0x2AAA is approximately 65536 / 6, so the multiply followed by >> 16
// approximates a division of the six-byte sums by 6.
//
// The loop body runs before any test and repeats while dst_width, reduced by
// 3 per pass, is still positive, so at least three bytes are always stored.
// NOTE(review): lw presumably requires both rows to be 4-byte aligned --
// confirm against the caller.
ScaleRowDown38_2_Box_DSPR2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)513 void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
514                                 uint8* dst_ptr, int dst_width) {
  // t walks the second source row; c is the fixed-point reciprocal used for
  // the six-byte sums.
515   intptr_t stride = src_stride;
516   const uint8* t = src_ptr + stride;
517   const int c = 0x2AAA;
518 
519   __asm__ __volatile__ (
520       ".set push                                         \n"
521       ".set noreorder                                    \n"
522 
523     "1:                                                  \n"
524       "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
525       "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
526       "lw              $t2, 0(%[t])                      \n"  // |T3|T2|T1|T0|
527       "lw              $t3, 4(%[t])                      \n"  // |T7|T6|T5|T4|
528       "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
529       "packrl.ph       $t4, $t1, $t3                     \n"  // |S7|S6|T7|T6|
530       "packrl.ph       $t5, $t3, $t1                     \n"  // |T5|T4|S5|S4|
531       "raddu.w.qb      $t4, $t4                          \n"  // S7+S6+T7+T6
532       "raddu.w.qb      $t5, $t5                          \n"  // T5+T4+S5+S4
533       "precrq.qb.ph    $t6, $t0, $t2                     \n"  // |S3|S1|T3|T1|
534       "precrq.qb.ph    $t6, $t6, $t6                     \n"  // |S3|T3|S3|T3|
      // Third output: four-byte sum divided by 4 (no rounding term).
535       "srl             $t4, $t4, 2                       \n"  // t4 / 4
536       "srl             $t6, $t6, 16                      \n"  // |0|0|S3|T3|
537       "raddu.w.qb      $t6, $t6                          \n"  // 0+0+S3+T3
      // Second output: six-byte sum S3+T3+S4+S5+T4+T5 scaled by c.
538       "addu            $t6, $t5, $t6                     \n"
539       "mul             $t6, $t6, %[c]                    \n"  // t6 * 0x2AAA
540       "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
541       "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
542       "raddu.w.qb      $t0, $t0                          \n"  // S2+S1+S0+0
543       "raddu.w.qb      $t2, $t2                          \n"  // T2+T1+T0+0
      // First output: six-byte sum S0+S1+S2+T0+T1+T2 scaled by c.
544       "addu            $t0, $t0, $t2                     \n"
545       "mul             $t0, $t0, %[c]                    \n"  // t0 * 0x2AAA
546       "addiu           %[src_ptr], %[src_ptr], 8         \n"
547       "addiu           %[t], %[t], 8                     \n"
548       "addiu           %[dst_width], %[dst_width], -3    \n"
      // dst_ptr is advanced before the stores, hence the negative offsets.
549       "addiu           %[dst_ptr], %[dst_ptr], 3         \n"
550       "srl             $t6, $t6, 16                      \n"
551       "srl             $t0, $t0, 16                      \n"
552       "sb              $t4, -1(%[dst_ptr])               \n"
553       "sb              $t6, -2(%[dst_ptr])               \n"
      // The final store sits in the branch delay slot.
554       "bgtz            %[dst_width], 1b                  \n"
555       " sb             $t0, -3(%[dst_ptr])               \n"
556       ".set pop                                          \n"
      // Pointers and dst_width are read-write operands because the asm
      // updates them; c is input only.
557       : [src_ptr] "+r" (src_ptr),
558         [dst_ptr] "+r" (dst_ptr),
559         [t] "+r" (t),
560         [dst_width] "+r" (dst_width)
561       : [c] "r" (c)
562       : "t0", "t1", "t2", "t3", "t4", "t5", "t6"
563   );
564 }
565 
ScaleRowDown38_3_Box_DSPR2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)566 void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
567                                 ptrdiff_t src_stride,
568                                 uint8* dst_ptr, int dst_width) {
569   intptr_t stride = src_stride;
570   const uint8* s1 = src_ptr + stride;
571   stride += stride;
572   const uint8* s2 = src_ptr + stride;
573   const int c1 = 0x1C71;
574   const int c2 = 0x2AAA;
575 
576   __asm__ __volatile__ (
577       ".set push                                         \n"
578       ".set noreorder                                    \n"
579 
580     "1:                                                  \n"
581       "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
582       "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
583       "lw              $t2, 0(%[s1])                     \n"  // |T3|T2|T1|T0|
584       "lw              $t3, 4(%[s1])                     \n"  // |T7|T6|T5|T4|
585       "lw              $t4, 0(%[s2])                     \n"  // |R3|R2|R1|R0|
586       "lw              $t5, 4(%[s2])                     \n"  // |R7|R6|R5|R4|
587       "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
588       "packrl.ph       $t6, $t1, $t3                     \n"  // |S7|S6|T7|T6|
589       "raddu.w.qb      $t6, $t6                          \n"  // S7+S6+T7+T6
590       "packrl.ph       $t7, $t3, $t1                     \n"  // |T5|T4|S5|S4|
591       "raddu.w.qb      $t7, $t7                          \n"  // T5+T4+S5+S4
592       "sll             $t8, $t5, 16                      \n"  // |R5|R4|0|0|
593       "raddu.w.qb      $t8, $t8                          \n"  // R5+R4
594       "addu            $t7, $t7, $t8                     \n"
595       "srl             $t8, $t5, 16                      \n"  // |0|0|R7|R6|
596       "raddu.w.qb      $t8, $t8                          \n"  // R7 + R6
597       "addu            $t6, $t6, $t8                     \n"
598       "mul             $t6, $t6, %[c2]                   \n"  // t6 * 0x2AAA
599       "precrq.qb.ph    $t8, $t0, $t2                     \n"  // |S3|S1|T3|T1|
600       "precrq.qb.ph    $t8, $t8, $t4                     \n"  // |S3|T3|R3|R1|
601       "srl             $t8, $t8, 8                       \n"  // |0|S3|T3|R3|
602       "raddu.w.qb      $t8, $t8                          \n"  // S3 + T3 + R3
603       "addu            $t7, $t7, $t8                     \n"
604       "mul             $t7, $t7, %[c1]                   \n"  // t7 * 0x1C71
605       "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
606       "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
607       "sll             $t4, $t4, 8                       \n"  // |R2|R1|R0|0|
608       "raddu.w.qb      $t0, $t0                          \n"
609       "raddu.w.qb      $t2, $t2                          \n"
610       "raddu.w.qb      $t4, $t4                          \n"
611       "addu            $t0, $t0, $t2                     \n"
612       "addu            $t0, $t0, $t4                     \n"
613       "mul             $t0, $t0, %[c1]                   \n"  // t0 * 0x1C71
614       "addiu           %[src_ptr], %[src_ptr], 8         \n"
615       "addiu           %[s1], %[s1], 8                   \n"
616       "addiu           %[s2], %[s2], 8                   \n"
617       "addiu           %[dst_width], %[dst_width], -3    \n"
618       "addiu           %[dst_ptr], %[dst_ptr], 3         \n"
619       "srl             $t6, $t6, 16                      \n"
620       "srl             $t7, $t7, 16                      \n"
621       "srl             $t0, $t0, 16                      \n"
622       "sb              $t6, -1(%[dst_ptr])               \n"
623       "sb              $t7, -2(%[dst_ptr])               \n"
624       "bgtz            %[dst_width], 1b                  \n"
625       " sb             $t0, -3(%[dst_ptr])               \n"
626       ".set pop                                          \n"
627       : [src_ptr] "+r" (src_ptr),
628         [dst_ptr] "+r" (dst_ptr),
629         [s1] "+r" (s1),
630         [s2] "+r" (s2),
631         [dst_width] "+r" (dst_width)
632       : [c1] "r" (c1), [c2] "r" (c2)
633       : "t0", "t1", "t2", "t3", "t4",
634         "t5", "t6", "t7", "t8"
635   );
636 }
637 
638 #endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
639 
640 #ifdef __cplusplus
641 }  // extern "C"
642 }  // namespace libyuv
643 #endif
644 
645