/*
 * MIPS DSPr2 optimizations for libjpeg-turbo
 *
 * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
 *                          All Rights Reserved.
 * Authors:  Teodora Novkovic <teodora.novkovic@imgtec.com>
 *           Darko Laus       <darko.laus@imgtec.com>
 * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#include "jsimd_dspr2_asm.h"


/*****************************************************************************/
LEAF_DSPR2(jsimd_c_null_convert_dspr2)
/*
 * a0     = cinfo->image_width
 * a1     = input_buf
 * a2     = output_buf
 * a3     = output_row
 * 16(sp) = num_rows
 * 20(sp) = cinfo->num_components
 *
 * Null conversion for compression
 */
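/*
 * A rough C sketch of what this routine computes (illustrative only, not
 * code from libjpeg-turbo; names follow the argument list above):
 *
 *   void c_null_convert(int image_width, unsigned char **input_buf,
 *                       unsigned char ***output_buf, int output_row,
 *                       int num_rows, int num_components)
 *   {
 *     while (num_rows-- > 0) {
 *       for (int ci = 0; ci < num_components; ci++) {
 *         const unsigned char *inptr = *input_buf + ci;
 *         unsigned char *outptr = output_buf[ci][output_row];
 *         for (int col = 0; col < image_width; col++, inptr += num_components)
 *           outptr[col] = *inptr;
 *       }
 *       input_buf++;
 *       output_row++;
 *     }
 *   }
 *
 * The code below splits the column loop into a 4x-unrolled body plus a
 * residual loop for image_width % 4 pixels.
 */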
    SAVE_REGS_ON_STACK 8, s0, s1

    lw          t9, 24(sp)      /* t9 = num_rows */
    lw          s0, 28(sp)      /* s0 = cinfo->num_components */
    andi        t0, a0, 3       /* t0 = cinfo->image_width & 3 */
    beqz        t0, 4f          /* no residual */
     nop
0:
    addiu       t9, t9, -1
    bltz        t9, 7f
     li         t1, 0
1:
    sll         t3, t1, 2
    lwx         t5, t3(a2)      /* t5 = outptr = output_buf[ci] */
    lw          t2, 0(a1)       /* t2 = inptr = *input_buf */
    sll         t4, a3, 2
    lwx         t5, t4(t5)      /* t5 = outptr = output_buf[ci][output_row] */
    addu        t2, t2, t1
    addu        s1, t5, a0
    addu        t6, t5, t0
2:
    lbu         t3, 0(t2)
    addiu       t5, t5, 1
    sb          t3, -1(t5)
    bne         t6, t5, 2b
     addu       t2, t2, s0
3:
    lbu         t3, 0(t2)
    addu        t4, t2, s0
    addu        t7, t4, s0
    addu        t8, t7, s0
    addu        t2, t8, s0
    lbu         t4, 0(t4)
    lbu         t7, 0(t7)
    lbu         t8, 0(t8)
    addiu       t5, t5, 4
    sb          t3, -4(t5)
    sb          t4, -3(t5)
    sb          t7, -2(t5)
    bne         s1, t5, 3b
     sb         t8, -1(t5)
    addiu       t1, t1, 1
    bne         t1, s0, 1b
     nop
    addiu       a1, a1, 4
    bgez        t9, 0b
     addiu      a3, a3, 1
    b           7f
     nop
4:
    addiu       t9, t9, -1
    bltz        t9, 7f
     li         t1, 0
5:
    sll         t3, t1, 2
    lwx         t5, t3(a2)      /* t5 = outptr = output_buf[ci] */
    lw          t2, 0(a1)       /* t2 = inptr = *input_buf */
    sll         t4, a3, 2
    lwx         t5, t4(t5)      /* t5 = outptr = output_buf[ci][output_row] */
    addu        t2, t2, t1
    addu        s1, t5, a0
    addu        t6, t5, t0
6:
    lbu         t3, 0(t2)
    addu        t4, t2, s0
    addu        t7, t4, s0
    addu        t8, t7, s0
    addu        t2, t8, s0
    lbu         t4, 0(t4)
    lbu         t7, 0(t7)
    lbu         t8, 0(t8)
    addiu       t5, t5, 4
    sb          t3, -4(t5)
    sb          t4, -3(t5)
    sb          t7, -2(t5)
    bne         s1, t5, 6b
     sb         t8, -1(t5)
    addiu       t1, t1, 1
    bne         t1, s0, 5b
     nop
    addiu       a1, a1, 4
    bgez        t9, 4b
     addiu      a3, a3, 1
7:
    RESTORE_REGS_FROM_STACK 8, s0, s1

    j           ra
     nop

END(jsimd_c_null_convert_dspr2)


/*****************************************************************************/
/*
 * jsimd_extrgb_ycc_convert_dspr2
 * jsimd_extbgr_ycc_convert_dspr2
 * jsimd_extrgbx_ycc_convert_dspr2
 * jsimd_extbgrx_ycc_convert_dspr2
 * jsimd_extxbgr_ycc_convert_dspr2
 * jsimd_extxrgb_ycc_convert_dspr2
 *
 * Colorspace conversion RGB -> YCbCr
 */
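/*
 * The generated routines evaluate the usual JPEG fixed-point equations
 * (SCALEBITS = 16, CBCR_OFFSET = 128 << 16).  A hedged C sketch for one
 * pixel (illustrative, not the library's code):
 *
 *   #define FIX(x)  ((int)((x) * 65536 + 0.5))
 *
 *   y  = (FIX(0.29900) * r + FIX(0.58700) * g + FIX(0.11400) * b +
 *         32768) >> 16;
 *   cb = (-FIX(0.16874) * r - FIX(0.33126) * g + FIX(0.50000) * b +
 *         (128 << 16) + 32767) >> 16;
 *   cr = (FIX(0.50000) * r - FIX(0.41869) * g - FIX(0.08131) * b +
 *         (128 << 16) + 32767) >> 16;
 *
 * Each maddu chain below accumulates one of these dot products in its own
 * DSP accumulator ($ac0..$ac2), and extr.w performs the final >> 16.
 */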

.macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2  colorid, pixel_size, \
                                             r_offs, g_offs, b_offs

.macro DO_RGB_TO_YCC  r, g, b, inptr
    lbu         \r, \r_offs(\inptr)
    lbu         \g, \g_offs(\inptr)
    lbu         \b, \b_offs(\inptr)
    addiu       \inptr, \pixel_size
.endm

LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2)
/*
 * a0     = cinfo->image_width
 * a1     = input_buf
 * a2     = output_buf
 * a3     = output_row
 * 16(sp) = num_rows
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw          t7, 48(sp)      /* t7 = num_rows */
    li          s0, 0x4c8b      /* FIX(0.29900) */
    li          s1, 0x9646      /* FIX(0.58700) */
    li          s2, 0x1d2f      /* FIX(0.11400) */
    li          s3, 0xffffd4cd  /* -FIX(0.16874) */
    li          s4, 0xffffab33  /* -FIX(0.33126) */
    li          s5, 0x8000      /* FIX(0.50000) */
    li          s6, 0xffff94d1  /* -FIX(0.41869) */
    li          s7, 0xffffeb2f  /* -FIX(0.08131) */
    li          t8, 0x807fff    /* CBCR_OFFSET + ONE_HALF-1 */

0:
    addiu       t7, -1          /* --num_rows */
    lw          t6, 0(a1)       /* t6 = input_buf[0] */
    lw          t0, 0(a2)
    lw          t1, 4(a2)
    lw          t2, 8(a2)
    sll         t3, a3, 2
    lwx         t0, t3(t0)      /* t0 = output_buf[0][output_row] */
    lwx         t1, t3(t1)      /* t1 = output_buf[1][output_row] */
    lwx         t2, t3(t2)      /* t2 = output_buf[2][output_row] */

    addu        t9, t2, a0      /* t9 = end address */
    addiu       a3, 1

1:
    DO_RGB_TO_YCC t3, t4, t5, t6

    mtlo        s5, $ac0
    mtlo        t8, $ac1
    mtlo        t8, $ac2
    maddu       $ac0, s2, t5
    maddu       $ac1, s5, t5
    maddu       $ac2, s5, t3
    maddu       $ac0, s0, t3
    maddu       $ac1, s3, t3
    maddu       $ac2, s6, t4
    maddu       $ac0, s1, t4
    maddu       $ac1, s4, t4
    maddu       $ac2, s7, t5
    extr.w      t3, $ac0, 16
    extr.w      t4, $ac1, 16
    extr.w      t5, $ac2, 16
    sb          t3, 0(t0)
    sb          t4, 0(t1)
    sb          t5, 0(t2)
    addiu       t0, 1
    addiu       t2, 1
    bne         t2, t9, 1b
     addiu      t1, 1
    bgtz        t7, 0b
     addiu      a1, 4

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_\colorid\()_ycc_convert_dspr2)

.purgem DO_RGB_TO_YCC

.endm

/*-------------------------------------id -- pix R  G  B */
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb,  3, 0, 1, 2
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr,  3, 2, 1, 0
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3


/*****************************************************************************/
/*
 * jsimd_ycc_extrgb_convert_dspr2
 * jsimd_ycc_extbgr_convert_dspr2
 * jsimd_ycc_extrgbx_convert_dspr2
 * jsimd_ycc_extbgrx_convert_dspr2
 * jsimd_ycc_extxbgr_convert_dspr2
 * jsimd_ycc_extxrgb_convert_dspr2
 *
 * Colorspace conversion YCbCr -> RGB
 */
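/*
 * A hedged C sketch of the per-pixel math (illustrative, not the
 * library's code; SCALEBITS = 16):
 *
 *   #define FIX(x)  ((int)((x) * 65536 + 0.5))
 *
 *   r = y + ((FIX(1.40200) * (cr - 128) + 32768) >> 16);
 *   g = y + ((-FIX(0.34414) * (cb - 128) -
 *             FIX(0.71414) * (cr - 128) + 32768) >> 16);
 *   b = y + ((FIX(1.77200) * (cb - 128) + 32768) >> 16);
 *
 * each clamped to [0, 255].  The code below obtains the rounded products
 * with mulq_rs.w on operands pre-shifted left by 15, and clamps the red
 * and green results as a pair using the saturating shll_s.ph/shra.ph
 * sequence.
 */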

.macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2  colorid, pixel_size, \
                                             r_offs, g_offs, b_offs, a_offs

.macro STORE_YCC_TO_RGB  scratch0 scratch1 scratch2 outptr
    sb          \scratch0, \r_offs(\outptr)
    sb          \scratch1, \g_offs(\outptr)
    sb          \scratch2, \b_offs(\outptr)
.if (\pixel_size == 4)
    li          t0, 0xFF
    sb          t0, \a_offs(\outptr)
.endif
    addiu       \outptr, \pixel_size
.endm

LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2)
/*
 * a0     = cinfo->image_width
 * a1     = input_buf
 * a2     = input_row
 * a3     = output_buf
 * 16(sp) = num_rows
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw          s1, 48(sp)      /* s1 = num_rows */
    li          t3, 0x8000
    li          t4, 0x166e9     /* FIX(1.40200) */
    li          t5, 0x1c5a2     /* FIX(1.77200) */
    li          t6, 0xffff492e  /* -FIX(0.71414) */
    li          t7, 0xffffa7e6  /* -FIX(0.34414) */
    repl.ph     t8, 128

0:
    lw          s0, 0(a3)
    lw          t0, 0(a1)
    lw          t1, 4(a1)
    lw          t2, 8(a1)
    sll         s5, a2, 2
    addiu       s1, -1
    lwx         s2, s5(t0)
    lwx         s3, s5(t1)
    lwx         s4, s5(t2)
    addu        t9, s2, a0
    addiu       a2, 1

1:
    lbu         s7, 0(s4)       /* cr */
    lbu         s6, 0(s3)       /* cb */
    lbu         s5, 0(s2)       /* y */
    addiu       s2, 1
    addiu       s4, 1
    addiu       s7, -128
    addiu       s6, -128
    mul         t2, t7, s6
    mul         t0, t6, s7      /* Crgtab[cr] */
    sll         s7, 15
    mulq_rs.w   t1, t4, s7      /* Crrtab[cr] */
    sll         s6, 15
    addu        t2, t3          /* Cbgtab[cb] */
    addu        t2, t0

    mulq_rs.w   t0, t5, s6      /* Cbbtab[cb] */
    sra         t2, 16
    addu        t1, s5
    addu        t2, s5          /* add y */
    ins         t2, t1, 16, 16
    subu.ph     t2, t2, t8
    addu        t0, s5
    shll_s.ph   t2, t2, 8
    subu        t0, 128
    shra.ph     t2, t2, 8
    shll_s.w    t0, t0, 24
    addu.ph     t2, t2, t8      /* clip & store */
    sra         t0, t0, 24
    sra         t1, t2, 16
    addiu       t0, 128

    STORE_YCC_TO_RGB t1, t2, t0, s0

    bne         s2, t9, 1b
     addiu      s3, 1
    bgtz        s1, 0b
     addiu      a3, 4

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_ycc_\colorid\()_convert_dspr2)

.purgem STORE_YCC_TO_RGB

.endm

/*-------------------------------------id -- pix R  G  B  A */
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb,  3, 0, 1, 2, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr,  3, 2, 1, 0, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0


/*****************************************************************************/
/*
 * jsimd_extrgb_gray_convert_dspr2
 * jsimd_extbgr_gray_convert_dspr2
 * jsimd_extrgbx_gray_convert_dspr2
 * jsimd_extbgrx_gray_convert_dspr2
 * jsimd_extxbgr_gray_convert_dspr2
 * jsimd_extxrgb_gray_convert_dspr2
 *
 * Colorspace conversion RGB -> GRAY
 */
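/*
 * A hedged C sketch of the luma computation (illustrative, not the
 * library's code):
 *
 *   #define FIX(x)  ((int)((x) * 65536 + 0.5))
 *
 *   gray = (FIX(0.29900) * r + FIX(0.58700) * g + FIX(0.11400) * b +
 *           32768) >> 16;
 *
 * The main loop below emits four gray pixels per iteration, alternating
 * between the $ac0 and $ac1 accumulators to overlap multiply latency with
 * loads and stores; the residual loop handles image_width % 4 pixels.
 */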

.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2  colorid, pixel_size, \
                                              r_offs, g_offs, b_offs

.macro DO_RGB_TO_GRAY  r, g, b, inptr
    lbu         \r, \r_offs(\inptr)
    lbu         \g, \g_offs(\inptr)
    lbu         \b, \b_offs(\inptr)
    addiu       \inptr, \pixel_size
.endm

LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2)
/*
 * a0     = cinfo->image_width
 * a1     = input_buf
 * a2     = output_buf
 * a3     = output_row
 * 16(sp) = num_rows
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    li          s0, 0x4c8b      /* s0 = FIX(0.29900) */
    li          s1, 0x9646      /* s1 = FIX(0.58700) */
    li          s2, 0x1d2f      /* s2 = FIX(0.11400) */
    li          s7, 0x8000      /* s7 = FIX(0.50000) */
    lw          s6, 48(sp)      /* s6 = num_rows */
    andi        t7, a0, 3

0:
    addiu       s6, -1          /* --num_rows */
    lw          t0, 0(a1)
    lw          t1, 0(a2)
    sll         t3, a3, 2
    lwx         t1, t3(t1)
    addiu       a3, 1
    addu        t9, t1, a0
    subu        t8, t9, t7
    beq         t1, t8, 2f
     nop

1:
    DO_RGB_TO_GRAY t3, t4, t5, t0
    DO_RGB_TO_GRAY s3, s4, s5, t0

    mtlo        s7, $ac0
    maddu       $ac0, s2, t5
    maddu       $ac0, s1, t4
    maddu       $ac0, s0, t3
    mtlo        s7, $ac1
    maddu       $ac1, s2, s5
    maddu       $ac1, s1, s4
    maddu       $ac1, s0, s3
    extr.w      t6, $ac0, 16

    DO_RGB_TO_GRAY t3, t4, t5, t0
    DO_RGB_TO_GRAY s3, s4, s5, t0

    mtlo        s7, $ac0
    maddu       $ac0, s2, t5
    maddu       $ac0, s1, t4
    extr.w      t2, $ac1, 16
    maddu       $ac0, s0, t3
    mtlo        s7, $ac1
    maddu       $ac1, s2, s5
    maddu       $ac1, s1, s4
    maddu       $ac1, s0, s3
    extr.w      t5, $ac0, 16
    sb          t6, 0(t1)
    sb          t2, 1(t1)
    extr.w      t3, $ac1, 16
    addiu       t1, 4
    sb          t5, -2(t1)
    sb          t3, -1(t1)
    bne         t1, t8, 1b
     nop

2:
    beqz        t7, 4f
     nop

3:
    DO_RGB_TO_GRAY t3, t4, t5, t0

    mtlo        s7, $ac0
    maddu       $ac0, s2, t5
    maddu       $ac0, s1, t4
    maddu       $ac0, s0, t3
    extr.w      t6, $ac0, 16
    sb          t6, 0(t1)
    addiu       t1, 1
    bne         t1, t9, 3b
     nop

4:
    bgtz        s6, 0b
     addiu      a1, 4

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_\colorid\()_gray_convert_dspr2)

.purgem DO_RGB_TO_GRAY

.endm

/*-------------------------------------id --  pix R  G  B */
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb,  3, 0, 1, 2
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr,  3, 2, 1, 0
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3


/*****************************************************************************/
/*
 * jsimd_h2v2_merged_upsample_dspr2
 * jsimd_h2v2_extrgb_merged_upsample_dspr2
 * jsimd_h2v2_extrgbx_merged_upsample_dspr2
 * jsimd_h2v2_extbgr_merged_upsample_dspr2
 * jsimd_h2v2_extbgrx_merged_upsample_dspr2
 * jsimd_h2v2_extxbgr_merged_upsample_dspr2
 * jsimd_h2v2_extxrgb_merged_upsample_dspr2
 *
 * Merged h2v2 upsample routines
 */
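/*
 * A hedged C sketch of one loop iteration (illustrative, not the
 * library's code; range_limit is cinfo->sample_range_limit, a clamping
 * lookup table):
 *
 *   cred   = (FIX(1.40200) * (cr - 128) + 32768) >> 16;
 *   cgreen = (-FIX(0.34414) * (cb - 128) -
 *             FIX(0.71414) * (cr - 128) + 32768) >> 16;
 *   cblue  = (FIX(1.77200) * (cb - 128) + 32768) >> 16;
 *
 *   for each of the four luma samples y that share this (cb, cr):
 *     outptr[R] = range_limit[y + cred];
 *     outptr[G] = range_limit[y + cgreen];
 *     outptr[B] = range_limit[y + cblue];
 *
 * Two of the luma samples come from the even input row and two from the
 * odd row, so each iteration stores two pixels into each of two output
 * rows.
 */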
.macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2  colorid, pixel_size, \
                                            r1_offs, g1_offs, \
                                            b1_offs, a1_offs, \
                                            r2_offs, g2_offs, \
                                            b2_offs, a2_offs

.macro STORE_H2V2_2_PIXELS  scratch0 scratch1 scratch2 scratch3 scratch4 \
                            scratch5 outptr
    sb          \scratch0, \r1_offs(\outptr)
    sb          \scratch1, \g1_offs(\outptr)
    sb          \scratch2, \b1_offs(\outptr)
    sb          \scratch3, \r2_offs(\outptr)
    sb          \scratch4, \g2_offs(\outptr)
    sb          \scratch5, \b2_offs(\outptr)
.if (\pixel_size == 8)
    li          \scratch0, 0xFF
    sb          \scratch0, \a1_offs(\outptr)
    sb          \scratch0, \a2_offs(\outptr)
.endif
    addiu       \outptr, \pixel_size
.endm

.macro STORE_H2V2_1_PIXEL  scratch0 scratch1 scratch2 outptr
    sb          \scratch0, \r1_offs(\outptr)
    sb          \scratch1, \g1_offs(\outptr)
    sb          \scratch2, \b1_offs(\outptr)

.if (\pixel_size == 8)
    li          t0, 0xFF
    sb          t0, \a1_offs(\outptr)
.endif
.endm

LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
/*
 * a0     = cinfo->output_width
 * a1     = input_buf
 * a2     = in_row_group_ctr
 * a3     = output_buf
 * 16(sp) = cinfo->sample_range_limit
 */
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    lw          t9, 56(sp)      /* cinfo->sample_range_limit */
    lw          v0, 0(a1)
    lw          v1, 4(a1)
    lw          t0, 8(a1)
    sll         t1, a2, 3
    addiu       t2, t1, 4
    sll         t3, a2, 2
    lw          t4, 0(a3)       /* t4 = output_buf[0] */
    lwx         t1, t1(v0)      /* t1 = input_buf[0][in_row_group_ctr*2] */
    lwx         t2, t2(v0)      /* t2 = input_buf[0][in_row_group_ctr*2 + 1] */
    lwx         t5, t3(v1)      /* t5 = input_buf[1][in_row_group_ctr] */
    lwx         t6, t3(t0)      /* t6 = input_buf[2][in_row_group_ctr] */
    lw          t7, 4(a3)       /* t7 = output_buf[1] */
    li          s1, 0xe6ea
    addiu       t8, s1, 0x7fff    /* t8 = 0x166e9 [FIX(1.40200)] */
    addiu       s0, t8, 0x5eb9    /* s0 = 0x1c5a2 [FIX(1.77200)] */
    addiu       s1, zero, 0xa7e6  /* s1 = 0xffffa7e6 [-FIX(0.34414)] */
    xori        s2, s1, 0xeec8    /* s2 = 0xffff492e [-FIX(0.71414)] */
    srl         t3, a0, 1
    blez        t3, 2f
     addu       t0, t5, t3      /* t0 = end address */
1:
    lbu         t3, 0(t5)
    lbu         s3, 0(t6)
    addiu       t5, t5, 1
    addiu       t3, t3, -128    /* (cb - 128) */
    addiu       s3, s3, -128    /* (cr - 128) */
    mult        $ac1, s1, t3
    madd        $ac1, s2, s3
    sll         s3, s3, 15
    sll         t3, t3, 15
    mulq_rs.w   s4, t8, s3      /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */
    extr_r.w    s5, $ac1, 16
    mulq_rs.w   s6, s0, t3      /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */
    lbu         v0, 0(t1)
    addiu       t6, t6, 1
    addiu       t1, t1, 2
    addu        t3, v0, s4      /* y+cred */
    addu        s3, v0, s5      /* y+cgreen */
    addu        v1, v0, s6      /* y+cblue */
    addu        t3, t9, t3      /* y+cred */
    addu        s3, t9, s3      /* y+cgreen */
    addu        v1, t9, v1      /* y+cblue */
    lbu         AT, 0(t3)
    lbu         s7, 0(s3)
    lbu         ra, 0(v1)
    lbu         v0, -1(t1)
    addu        t3, v0, s4      /* y+cred */
    addu        s3, v0, s5      /* y+cgreen */
    addu        v1, v0, s6      /* y+cblue */
    addu        t3, t9, t3      /* y+cred */
    addu        s3, t9, s3      /* y+cgreen */
    addu        v1, t9, v1      /* y+cblue */
    lbu         t3, 0(t3)
    lbu         s3, 0(s3)
    lbu         v1, 0(v1)
    lbu         v0, 0(t2)

    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4

    addu        t3, v0, s4      /* y+cred */
    addu        s3, v0, s5      /* y+cgreen */
    addu        v1, v0, s6      /* y+cblue */
    addu        t3, t9, t3      /* y+cred */
    addu        s3, t9, s3      /* y+cgreen */
    addu        v1, t9, v1      /* y+cblue */
    lbu         AT, 0(t3)
    lbu         s7, 0(s3)
    lbu         ra, 0(v1)
    lbu         v0, 1(t2)
    addiu       t2, t2, 2
    addu        t3, v0, s4      /* y+cred */
    addu        s3, v0, s5      /* y+cgreen */
    addu        v1, v0, s6      /* y+cblue */
    addu        t3, t9, t3      /* y+cred */
    addu        s3, t9, s3      /* y+cgreen */
    addu        v1, t9, v1      /* y+cblue */
    lbu         t3, 0(t3)
    lbu         s3, 0(s3)
    lbu         v1, 0(v1)

    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7

    bne         t0, t5, 1b
     nop
2:
    andi        t0, a0, 1
    beqz        t0, 4f
     lbu        t3, 0(t5)
    lbu         s3, 0(t6)
    addiu       t3, t3, -128    /* (cb - 128) */
    addiu       s3, s3, -128    /* (cr - 128) */
    mult        $ac1, s1, t3
    madd        $ac1, s2, s3
    sll         s3, s3, 15
    sll         t3, t3, 15
    lbu         v0, 0(t1)
    extr_r.w    s5, $ac1, 16
    mulq_rs.w   s4, t8, s3      /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */
    mulq_rs.w   s6, s0, t3      /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */
    addu        t3, v0, s4      /* y+cred */
    addu        s3, v0, s5      /* y+cgreen */
    addu        v1, v0, s6      /* y+cblue */
    addu        t3, t9, t3      /* y+cred */
    addu        s3, t9, s3      /* y+cgreen */
    addu        v1, t9, v1      /* y+cblue */
    lbu         t3, 0(t3)
    lbu         s3, 0(s3)
    lbu         v1, 0(v1)
    lbu         v0, 0(t2)

    STORE_H2V2_1_PIXEL t3, s3, v1, t4

    addu        t3, v0, s4      /* y+cred */
    addu        s3, v0, s5      /* y+cgreen */
    addu        v1, v0, s6      /* y+cblue */
    addu        t3, t9, t3      /* y+cred */
    addu        s3, t9, s3      /* y+cgreen */
    addu        v1, t9, v1      /* y+cblue */
    lbu         t3, 0(t3)
    lbu         s3, 0(s3)
    lbu         v1, 0(v1)

    STORE_H2V2_1_PIXEL t3, s3, v1, t7
4:
    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    j           ra
     nop

END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)

.purgem STORE_H2V2_1_PIXEL
.purgem STORE_H2V2_2_PIXELS
.endm

/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4


/*****************************************************************************/
/*
 * jsimd_h2v1_merged_upsample_dspr2
 * jsimd_h2v1_extrgb_merged_upsample_dspr2
 * jsimd_h2v1_extrgbx_merged_upsample_dspr2
 * jsimd_h2v1_extbgr_merged_upsample_dspr2
 * jsimd_h2v1_extbgrx_merged_upsample_dspr2
 * jsimd_h2v1_extxbgr_merged_upsample_dspr2
 * jsimd_h2v1_extxrgb_merged_upsample_dspr2
 *
 * Merged h2v1 upsample routines
 */
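/*
 * Same chroma math as the h2v2 variant above, but each (cb, cr) pair is
 * shared by just two horizontally adjacent luma samples in one row.  A
 * hedged sketch of the main loop (illustrative):
 *
 *   for (col = 0; col < output_width / 2; col++) {
 *     derive cred, cgreen, cblue from (*inptr1++, *inptr2++);
 *     store range_limit-clamped RGB for y = *inptr0++;
 *     store range_limit-clamped RGB for y = *inptr0++;
 *   }
 *   if (output_width & 1)
 *     store one final pixel using the last chroma pair;
 */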

.macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2  colorid, pixel_size, \
                                            r1_offs, g1_offs, \
                                            b1_offs, a1_offs, \
                                            r2_offs, g2_offs, \
                                            b2_offs, a2_offs

.macro STORE_H2V1_2_PIXELS  scratch0 scratch1 scratch2 scratch3 scratch4 \
                            scratch5 outptr
    sb          \scratch0, \r1_offs(\outptr)
    sb          \scratch1, \g1_offs(\outptr)
    sb          \scratch2, \b1_offs(\outptr)
    sb          \scratch3, \r2_offs(\outptr)
    sb          \scratch4, \g2_offs(\outptr)
    sb          \scratch5, \b2_offs(\outptr)
.if (\pixel_size == 8)
    li          t0, 0xFF
    sb          t0, \a1_offs(\outptr)
    sb          t0, \a2_offs(\outptr)
.endif
    addiu       \outptr, \pixel_size
.endm

.macro STORE_H2V1_1_PIXEL  scratch0 scratch1 scratch2 outptr
    sb          \scratch0, \r1_offs(\outptr)
    sb          \scratch1, \g1_offs(\outptr)
    sb          \scratch2, \b1_offs(\outptr)
.if (\pixel_size == 8)
    li          t0, 0xFF
    sb          t0, \a1_offs(\outptr)
.endif
.endm

LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
/*
 * a0     = cinfo->output_width
 * a1     = input_buf
 * a2     = in_row_group_ctr
 * a3     = output_buf
 * 16(sp) = range_limit
 */
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    li          t0, 0xe6ea
    lw          t1, 0(a1)         /* t1 = input_buf[0] */
    lw          t2, 4(a1)         /* t2 = input_buf[1] */
    lw          t3, 8(a1)         /* t3 = input_buf[2] */
    lw          t8, 56(sp)        /* t8 = range_limit */
    addiu       s1, t0, 0x7fff    /* s1 = 0x166e9 [FIX(1.40200)] */
    addiu       s2, s1, 0x5eb9    /* s2 = 0x1c5a2 [FIX(1.77200)] */
    addiu       s0, t0, 0x9916    /* s0 = 0x8000 */
    addiu       s4, zero, 0xa7e6  /* s4 = 0xffffa7e6 [-FIX(0.34414)] */
    xori        s3, s4, 0xeec8    /* s3 = 0xffff492e [-FIX(0.71414)] */
    srl         t0, a0, 1
    sll         t4, a2, 2
    lwx         s5, t4(t1)      /* s5 = inptr0 */
    lwx         s6, t4(t2)      /* s6 = inptr1 */
    lwx         s7, t4(t3)      /* s7 = inptr2 */
    lw          t7, 0(a3)       /* t7 = outptr */
    blez        t0, 2f
     addu       t9, s6, t0      /* t9 = end address */
1:
    lbu         t2, 0(s6)       /* t2 = cb */
    lbu         t0, 0(s7)       /* t0 = cr */
    lbu         t1, 0(s5)       /* t1 = y */
    addiu       t2, t2, -128    /* t2 = cb - 128 */
    addiu       t0, t0, -128    /* t0 = cr - 128 */
    mult        $ac1, s4, t2
    madd        $ac1, s3, t0
    sll         t0, t0, 15
    sll         t2, t2, 15
    mulq_rs.w   t0, s1, t0      /* t0 = (C1*cr + ONE_HALF)>> SCALEBITS */
    extr_r.w    t5, $ac1, 16
    mulq_rs.w   t6, s2, t2      /* t6 = (C2*cb + ONE_HALF)>> SCALEBITS */
    addiu       s7, s7, 1
    addiu       s6, s6, 1
    addu        t2, t1, t0      /* t2 = y + cred */
    addu        t3, t1, t5      /* t3 = y + cgreen */
    addu        t4, t1, t6      /* t4 = y + cblue */
    addu        t2, t8, t2
    addu        t3, t8, t3
    addu        t4, t8, t4
    lbu         t1, 1(s5)
    lbu         v0, 0(t2)
    lbu         v1, 0(t3)
    lbu         ra, 0(t4)
    addu        t2, t1, t0
    addu        t3, t1, t5
    addu        t4, t1, t6
    addu        t2, t8, t2
    addu        t3, t8, t3
    addu        t4, t8, t4
    lbu         t2, 0(t2)
    lbu         t3, 0(t3)
    lbu         t4, 0(t4)

    STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7

    bne         t9, s6, 1b
     addiu      s5, s5, 2
2:
    andi        t0, a0, 1
    beqz        t0, 4f
     nop
3:
    lbu         t2, 0(s6)
    lbu         t0, 0(s7)
    lbu         t1, 0(s5)
    addiu       t2, t2, -128    /* (cb - 128) */
    addiu       t0, t0, -128    /* (cr - 128) */
    mul         t3, s4, t2
    mul         t4, s3, t0
    sll         t0, t0, 15
    sll         t2, t2, 15
    mulq_rs.w   t0, s1, t0      /* (C1*cr + ONE_HALF)>> SCALEBITS */
    mulq_rs.w   t6, s2, t2      /* (C2*cb + ONE_HALF)>> SCALEBITS */
    addu        t3, t3, s0
    addu        t3, t4, t3
    sra         t5, t3, 16      /* (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS */
    addu        t2, t1, t0      /* y + cred */
    addu        t3, t1, t5      /* y + cgreen */
    addu        t4, t1, t6      /* y + cblue */
    addu        t2, t8, t2
    addu        t3, t8, t3
    addu        t4, t8, t4
    lbu         t2, 0(t2)
    lbu         t3, 0(t3)
    lbu         t4, 0(t4)

    STORE_H2V1_1_PIXEL t2, t3, t4, t7
4:
    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    j           ra
     nop

END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)

.purgem STORE_H2V1_1_PIXEL
.purgem STORE_H2V1_2_PIXELS
.endm

/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4


/*****************************************************************************/
/*
 * jsimd_h2v2_fancy_upsample_dspr2
 *
 * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
 */
LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2)
/*
 * a0 = cinfo->max_v_samp_factor
 * a1 = downsampled_width
 * a2 = input_data
 * a3 = output_data_ptr
 */
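/*
 * A hedged C sketch of the triangle filter computed below (illustrative,
 * not the library's code):
 *
 *   this = 3 * inptr0[col] + inptr1[col];      (column sums)
 *   outptr[2*col]     = (3 * this + last + 8) >> 4;
 *   outptr[2*col + 1] = (3 * this + next + 7) >> 4;
 *
 * i.e. 3:1 vertical weighting of the nearer input row against the row
 * above or below, then 3:1 horizontal weighting of adjacent column sums,
 * with rounding constants 8 and 7.  Each input row is processed twice
 * (t9 = 2): once against its upper neighbor and once against its lower
 * neighbor.  Edge columns reuse their own column sum for the missing
 * neighbor.
 */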
    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5

    li            s4, 0
    lw            s2, 0(a3)       /* s2 = *output_data_ptr */
0:
    li            t9, 2
    lw            s1, -4(a2)      /* s1 = inptr1 */

1:
    lw            s0, 0(a2)       /* s0 = inptr0 */
    lwx           s3, s4(s2)
    addiu         s5, a1, -2      /* s5 = downsampled_width - 2 */
    srl           t4, s5, 1
    sll           t4, t4, 1
    lbu           t0, 0(s0)
    lbu           t1, 1(s0)
    lbu           t2, 0(s1)
    lbu           t3, 1(s1)
    addiu         s0, 2
    addiu         s1, 2
    addu          t8, s0, t4      /* t8 = end address */
    andi          s5, s5, 1       /* s5 = residual */
    sll           t4, t0, 1
    sll           t6, t1, 1
    addu          t0, t0, t4      /* t0 = (*inptr0++) * 3 */
    addu          t1, t1, t6      /* t1 = (*inptr0++) * 3 */
    addu          t7, t0, t2      /* t7 = thiscolsum */
    addu          t6, t1, t3      /* t6 = nextcolsum */
    sll           t0, t7, 2       /* t0 = thiscolsum * 4 */
    subu          t1, t0, t7      /* t1 = thiscolsum * 3 */
    shra_r.w      t0, t0, 4
    addiu         t1, 7
    addu          t1, t1, t6
    srl           t1, t1, 4
    sb            t0, 0(s3)
    sb            t1, 1(s3)
    beq           t8, s0, 22f     /* skip to final iteration if width == 3 */
     addiu        s3, 2
2:
    lh            t0, 0(s0)       /* t0 = A3|A2 */
    lh            t2, 0(s1)       /* t2 = B3|B2 */
    addiu         s0, 2
    addiu         s1, 2
    preceu.ph.qbr t0, t0          /* t0 = 0|A3|0|A2 */
    preceu.ph.qbr t2, t2          /* t2 = 0|B3|0|B2 */
    shll.ph       t1, t0, 1
    sll           t3, t6, 1
    addu.ph       t0, t1, t0      /* t0 = A3*3|A2*3 */
    addu          t3, t3, t6      /* t3 = this * 3 */
    addu.ph       t0, t0, t2      /* t0 = next2|next1 */
    addu          t1, t3, t7
    andi          t7, t0, 0xFFFF  /* t7 = next1 */
    sll           t2, t7, 1
    addu          t2, t7, t2      /* t2 = next1*3 */
    addu          t4, t2, t6
    srl           t6, t0, 16      /* t6 = next2 */
    shra_r.w      t1, t1, 4       /* t1 = (this*3 + last + 8) >> 4 */
    addu          t0, t3, t7
    addiu         t0, 7
    srl           t0, t0, 4       /* t0 = (this*3 + next1 + 7) >> 4 */
    shra_r.w      t4, t4, 4       /* t4 = (next1*3 + this + 8) >> 4 */
    addu          t2, t2, t6
    addiu         t2, 7
    srl           t2, t2, 4       /* t2 = (next1*3 + next2 + 7) >> 4 */
    sb            t1, 0(s3)
    sb            t0, 1(s3)
    sb            t4, 2(s3)
    sb            t2, 3(s3)
    bne           t8, s0, 2b
     addiu        s3, 4
22:
    beqz          s5, 4f
     addu         t8, s0, s5
3:
    lbu           t0, 0(s0)
    lbu           t2, 0(s1)
    addiu         s0, 1
    addiu         s1, 1
    sll           t3, t6, 1
    sll           t1, t0, 1
    addu          t1, t0, t1      /* t1 = inptr0 * 3 */
    addu          t3, t3, t6      /* t3 = thiscolsum * 3 */
    addu          t5, t1, t2
    addu          t1, t3, t7
    shra_r.w      t1, t1, 4
    addu          t0, t3, t5
    addiu         t0, 7
    srl           t0, t0, 4
    sb            t1, 0(s3)
    sb            t0, 1(s3)
    addiu         s3, 2
    move          t7, t6
    bne           t8, s0, 3b
     move         t6, t5
4:
    sll           t0, t6, 2       /* t0 = thiscolsum * 4 */
    subu          t1, t0, t6      /* t1 = thiscolsum * 3 */
    addu          t1, t1, t7
    addiu         s4, 4
    shra_r.w      t1, t1, 4
    addiu         t0, 7
    srl           t0, t0, 4
    sb            t1, 0(s3)
    sb            t0, 1(s3)
    addiu         t9, -1
    addiu         s3, 2
    bnez          t9, 1b
     lw           s1, 4(a2)
    srl           t0, s4, 2
    subu          t0, a0, t0
    bgtz          t0, 0b
     addiu        a2, 4

    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5

    j             ra
     nop
END(jsimd_h2v2_fancy_upsample_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2)
/*
 * a0 = cinfo->max_v_samp_factor
 * a1 = downsampled_width
 * a2 = input_data
 * a3 = output_data_ptr
 */
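/*
 * A hedged C sketch (illustrative, not the library's code):
 *
 *   outptr[2*col]     = (3 * inptr[col] + inptr[col - 1] + 1) >> 2;
 *   outptr[2*col + 1] = (3 * inptr[col] + inptr[col + 1] + 2) >> 2;
 *
 * with the first and last output pairs special-cased to replicate the
 * edge sample.  The SIMD loop below turns four input bytes into eight
 * output bytes per iteration using paired-halfword arithmetic (the
 * constant 0x10001 adds the +1 rounding term to both halves at once).
 */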
    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    .set at

    beqz          a0, 3f
     sll          t0, a0, 2
    lw            s1, 0(a3)
    li            s3, 0x10001
    addu          s0, s1, t0
0:
    addiu         t8, a1, -2
    srl           t9, t8, 2
    lw            t7, 0(a2)
    lw            s2, 0(s1)
    lbu           t0, 0(t7)
    lbu           t1, 1(t7)       /* t1 = inptr[1] */
    sll           t2, t0, 1
    addu          t2, t2, t0      /* t2 = invalue*3 */
    addu          t2, t2, t1
    shra_r.w      t2, t2, 2
    sb            t0, 0(s2)
    sb            t2, 1(s2)
    beqz          t9, 11f
     addiu        s2, 2
1:
    ulw           t0, 0(t7)       /* t0 = |P3|P2|P1|P0| */
    ulw           t1, 1(t7)
    ulh           t2, 4(t7)       /* t2 = |0|0|P5|P4| */
    preceu.ph.qbl t3, t0          /* t3 = |0|P3|0|P2| */
    preceu.ph.qbr t0, t0          /* t0 = |0|P1|0|P0| */
    preceu.ph.qbr t2, t2          /* t2 = |0|P5|0|P4| */
    preceu.ph.qbl t4, t1          /* t4 = |0|P4|0|P3| */
    preceu.ph.qbr t1, t1          /* t1 = |0|P2|0|P1| */
    shll.ph       t5, t4, 1
    shll.ph       t6, t1, 1
    addu.ph       t5, t5, t4      /* t5 = |P4*3|P3*3| */
    addu.ph       t6, t6, t1      /* t6 = |P2*3|P1*3| */
    addu.ph       t4, t3, s3
    addu.ph       t0, t0, s3
    addu.ph       t4, t4, t5
    addu.ph       t0, t0, t6
    shrl.ph       t4, t4, 2       /* t4 = |0|P3|0|P2| */
    shrl.ph       t0, t0, 2       /* t0 = |0|P1|0|P0| */
    addu.ph       t2, t2, t5
    addu.ph       t3, t3, t6
    shra_r.ph     t2, t2, 2       /* t2 = |0|P5|0|P4| */
    shra_r.ph     t3, t3, 2       /* t3 = |0|P3|0|P2| */
    shll.ph       t2, t2, 8
    shll.ph       t3, t3, 8
    or            t2, t4, t2
    or            t3, t3, t0
    addiu         t9, -1
    usw           t3, 0(s2)
    usw           t2, 4(s2)
    addiu         s2, 8
    bgtz          t9, 1b
     addiu        t7, 4
11:
    andi          t8, 3
    beqz          t8, 22f
     addiu        t7, 1

2:
    lbu           t0, 0(t7)
    addiu         t7, 1
    sll           t1, t0, 1
    addu          t2, t0, t1      /* t2 = invalue*3 */
    lbu           t3, -2(t7)
    lbu           t4, 0(t7)
    addiu         t3, 1
    addiu         t4, 2
    addu          t3, t3, t2
    addu          t4, t4, t2
    srl           t3, 2
    srl           t4, 2
    sb            t3, 0(s2)
    sb            t4, 1(s2)
    addiu         t8, -1
    bgtz          t8, 2b
     addiu        s2, 2

22:
    lbu           t0, 0(t7)
    lbu           t2, -1(t7)
    sll           t1, t0, 1
    addu          t1, t1, t0      /* t1 = invalue * 3 */
    addu          t1, t1, t2
    addiu         t1, 1
    srl           t1, t1, 2
    sb            t1, 0(s2)
    sb            t0, 1(s2)
    addiu         s1, 4
    bne           s1, s0, 0b
     addiu        a2, 4
3:
    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j             ra
     nop
END(jsimd_h2v1_fancy_upsample_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_h2v1_downsample_dspr2)
/*
 * a0     = cinfo->image_width
 * a1     = cinfo->max_v_samp_factor
 * a2     = compptr->v_samp_factor
 * a3     = compptr->width_in_blocks
 * 16(sp) = input_data
 * 20(sp) = output_data
 */
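/*
 * A hedged C sketch (illustrative, not the library's code): each output
 * byte averages two adjacent input bytes with an alternating rounding
 * bias,
 *
 *   bias = 0;                              bias alternates 0, 1, 0, 1, ...
 *   outptr[col] = (inptr[2*col] + inptr[2*col + 1] + bias) >> 1;
 *   bias ^= 1;
 *
 * and the row is padded out to width_in_blocks * DCTSIZE columns by
 * replicating the value of the last pixel.
 */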
    .set at

    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4

    beqz        a2, 7f
     lw         s1, 44(sp)      /* s1 = output_data */
    lw          s0, 40(sp)      /* s0 = input_data */
    srl         s2, a0, 2
    andi        t9, a0, 2
    srl         t7, t9, 1
    addu        s2, t7, s2
    sll         t0, a3, 3       /* t0 = width_in_blocks*DCT */
    srl         t7, t0, 1
    subu        s2, t7, s2
0:
    andi        t6, a0, 1       /* t6 = temp_index */
    addiu       t6, -1
    lw          t4, 0(s1)       /* t4 = outptr */
    lw          t5, 0(s0)       /* t5 = inptr0 */
    li          s3, 0           /* s3 = bias */
    srl         t7, a0, 1       /* t7 = image_width1 */
    srl         s4, t7, 2
    andi        t8, t7, 3
1:
    ulhu        t0, 0(t5)
    ulhu        t1, 2(t5)
    ulhu        t2, 4(t5)
    ulhu        t3, 6(t5)
    raddu.w.qb  t0, t0
    raddu.w.qb  t1, t1
    raddu.w.qb  t2, t2
    raddu.w.qb  t3, t3
    shra.ph     t0, t0, 1
    shra_r.ph   t1, t1, 1
    shra.ph     t2, t2, 1
    shra_r.ph   t3, t3, 1
    sb          t0, 0(t4)
    sb          t1, 1(t4)
    sb          t2, 2(t4)
    sb          t3, 3(t4)
    addiu       s4, -1
    addiu       t4, 4
    bgtz        s4, 1b
     addiu      t5, 8
    beqz        t8, 3f
     addu       s4, t4, t8
2:
    ulhu        t0, 0(t5)
    raddu.w.qb  t0, t0
    addqh.w     t0, t0, s3
    xori        s3, s3, 1
    sb          t0, 0(t4)
    addiu       t4, 1
    bne         t4, s4, 2b
     addiu      t5, 2
3:
    lbux        t1, t6(t5)
    sll         t1, 1
    addqh.w     t2, t1, s3      /* t2 = pixval1 */
    xori        s3, s3, 1
    addqh.w     t3, t1, s3      /* t3 = pixval2 */
    blez        s2, 5f
     append     t3, t2,  8
    addu        t5, t4, s2      /* t5 = loop_end2 */
4:
    ush         t3, 0(t4)
    addiu       s2, -1
    bgtz        s2, 4b
     addiu      t4,  2
5:
    beqz        t9, 6f
     nop
    sb          t2, 0(t4)
6:
    addiu       s1, 4
    addiu       a2, -1
    bnez        a2, 0b
     addiu      s0, 4
7:
    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4

    j           ra
     nop
END(jsimd_h2v1_downsample_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_h2v2_downsample_dspr2)
/*
 * a0     = cinfo->image_width
 * a1     = cinfo->max_v_samp_factor
 * a2     = compptr->v_samp_factor
 * a3     = compptr->width_in_blocks
 * 16(sp) = input_data
 * 20(sp) = output_data
 */
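/*
 * A hedged C sketch (illustrative, not the library's code): a 2x2 box
 * filter with an alternating rounding bias,
 *
 *   bias = 1;                              bias alternates 1, 2, 1, 2, ...
 *   outptr[col] = (inptr0[2*col] + inptr0[2*col + 1] +
 *                  inptr1[2*col] + inptr1[2*col + 1] + bias) >> 2;
 *   bias ^= 3;
 *
 * again padded out to width_in_blocks * DCTSIZE columns by replicating
 * the last output value.
 */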
    .set at

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    beqz        a2, 8f
     lw         s1, 52(sp)      /* s1 = output_data */
    lw          s0, 48(sp)      /* s0 = input_data */

    andi        t6, a0, 1       /* t6 = temp_index */
    addiu       t6, -1
    srl         t7, a0, 1       /* t7 = image_width1 */
    srl         s4, t7, 2
    andi        t8, t7, 3
    andi        t9, a0, 2
    srl         s2, a0, 2
    srl         t7, t9, 1
    addu        s2, t7, s2
    sll         t0, a3, 3       /* t0 = width_in_blocks*DCT */
    srl         t7, t0, 1
    subu        s2, t7, s2
0:
    lw          t4, 0(s1)       /* t4 = outptr */
    lw          t5, 0(s0)       /* t5 = inptr0 */
    lw          s7, 4(s0)       /* s7 = inptr1 */
    li          s6, 1           /* s6 = bias */
2:
    ulw         t0, 0(t5)       /* t0 = |P3|P2|P1|P0| */
    ulw         t1, 0(s7)       /* t1 = |Q3|Q2|Q1|Q0| */
    ulw         t2, 4(t5)
    ulw         t3, 4(s7)
    precrq.ph.w t7, t0, t1      /* t7 = |P3|P2|Q3|Q2| */
    ins         t0, t1, 16, 16  /* t0 = |Q1|Q0|P1|P0| */
    raddu.w.qb  t1, t7
    raddu.w.qb  t0, t0
    shra_r.w    t1, t1, 2
    addiu       t0, 1
    srl         t0, 2
    precrq.ph.w t7, t2, t3
    ins         t2, t3, 16, 16
    raddu.w.qb  t7, t7
    raddu.w.qb  t2, t2
    shra_r.w    t7, t7, 2
    addiu       t2, 1
    srl         t2, 2
    sb          t0, 0(t4)
    sb          t1, 1(t4)
    sb          t2, 2(t4)
    sb          t7, 3(t4)
    addiu       t4, 4
    addiu       t5, 8
    addiu       s4, s4, -1
    bgtz        s4, 2b
     addiu      s7, 8
    beqz        t8, 4f
     addu       t8, t4, t8
3:
    ulhu        t0, 0(t5)
    ulhu        t1, 0(s7)
    ins         t0, t1, 16, 16
    raddu.w.qb  t0, t0
    addu        t0, t0, s6
    srl         t0, 2
    xori        s6, s6, 3
    sb          t0, 0(t4)
    addiu       t5, 2
    addiu       t4, 1
    bne         t8, t4, 3b
     addiu      s7, 2
4:
    lbux        t1, t6(t5)
    sll         t1, 1
    lbux        t0, t6(s7)
    sll         t0, 1
    addu        t1, t1, t0
    addu        t3, t1, s6
    srl         t0, t3, 2       /* t0 = pixval1 */
    xori        s6, s6, 3
    addu        t2, t1, s6
    srl         t1, t2, 2       /* t1 = pixval2 */
    blez        s2, 6f
     append     t1, t0, 8
5:
    ush         t1, 0(t4)
    addiu       s2, -1
    bgtz        s2, 5b
     addiu      t4, 2
6:
    beqz        t9, 7f
     nop
    sb          t0, 0(t4)
7:
    addiu       s1, 4
    addiu       a2, -1
    bnez        a2, 0b
     addiu      s0, 8
8:
    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_h2v2_downsample_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2)
/*
 * a0     = input_data
 * a1     = output_data
 * a2     = compptr->v_samp_factor
 * a3     = cinfo->max_v_samp_factor
 * 16(sp) = cinfo->smoothing_factor
 * 20(sp) = compptr->width_in_blocks
 * 24(sp) = cinfo->image_width
 */
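/*
 * A hedged C sketch of the per-pixel blend (illustrative, not the
 * library's code):
 *
 *   memberscale = 16384 - cinfo->smoothing_factor * 80;
 *   neighscale  = cinfo->smoothing_factor * 16;
 *   membersum   = sum of the 2x2 input block;
 *   neighsum    = weighted sum of the 12 samples surrounding the block
 *                 (nearer neighbors counted twice);
 *   outptr[col] = (membersum * memberscale + neighsum * neighscale +
 *                  32768) >> 16;
 *
 * The first loop below extends each input row by duplicating its last
 * column so the rightmost block has valid right-hand neighbors.
 */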
    .set at

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw          s7, 52(sp)      /* compptr->width_in_blocks */
    lw          s0, 56(sp)      /* cinfo->image_width */
    lw          s6, 48(sp)      /* cinfo->smoothing_factor */
    sll         s7, 3           /* output_cols = width_in_blocks * DCTSIZE */
    sll         v0, s7, 1
    subu        v0, v0, s0
    blez        v0, 2f
    move        v1, zero
    addiu       t0, a3, 2       /* t0 = cinfo->max_v_samp_factor + 2 */
0:
    addiu       t1, a0, -4
    sll         t2, v1, 2
    lwx         t1, t2(t1)
    move        t3, v0
    addu        t1, t1, s0
    lbu         t2, -1(t1)
1:
    addiu       t3, t3, -1
    sb          t2, 0(t1)
    bgtz        t3, 1b
    addiu       t1, t1, 1
    addiu       v1, v1, 1
    bne         v1, t0, 0b
    nop
2:
    li          v0, 80
    mul         v0, s6, v0
    li          v1, 16384
    move        t4, zero
    move        t5, zero
    subu        t6, v1, v0      /* t6 = 16384 - smoothing_factor * 80 */
    sll         t7, s6, 4       /* t7 = smoothing_factor * 16 */
3:
/* Special case for first column: pretend column -1 is same as column 0 */
    sll         v0, t4, 2
    lwx         t8, v0(a1)      /* outptr = output_data[outrow] */
    sll         v1, t5, 2
    addiu       t9, v1, 4
    addiu       s0, v1, -4
    addiu       s1, v1, 8
    lwx         s2, v1(a0)      /* inptr0 = input_data[inrow] */
    lwx         t9, t9(a0)      /* inptr1 = input_data[inrow+1] */
    lwx         s0, s0(a0)      /* above_ptr = input_data[inrow-1] */
    lwx         s1, s1(a0)      /* below_ptr = input_data[inrow+2] */
    lh          v0, 0(s2)
    lh          v1, 0(t9)
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0
    raddu.w.qb  s3, t0
    lbu         v0, 0(s2)
    lbu         v1, 2(s2)
    lbu         t0, 0(t9)
    lbu         t1, 2(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6
    addu        t0, t0, t1
    lbu         t2, 2(s0)
    addu        t0, t0, v0
    lbu         t3, 2(s1)
    addu        s3, t0, s3
    lbu         v0, 0(s0)
    lbu         t0, 0(s1)
    sll         s3, s3, 1
    addu        v0, v0, t2
    addu        t0, t0, t3
    addu        t0, t0, v0
    addu        s3, t0, s3
    madd        $ac1, s3, t7
    extr_r.w    v0, $ac1, 16
    addiu       t8, t8, 1
    addiu       s2, s2, 2
    addiu       t9, t9, 2
    addiu       s0, s0, 2
    addiu       s1, s1, 2
    sb          v0, -1(t8)
    addiu       s4, s7, -2
    and         s4, s4, 3
    addu        s5, s4, t8      /* end address */
4:
    lh          v0, 0(s2)
    lh          v1, 0(t9)
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0
    raddu.w.qb  s3, t0
    lbu         v0, -1(s2)
    lbu         v1, 2(s2)
    lbu         t0, -1(t9)
    lbu         t1, 2(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6
    addu        t0, t0, t1
    lbu         t2, 2(s0)
    addu        t0, t0, v0
    lbu         t3, 2(s1)
    addu        s3, t0, s3
    lbu         v0, -1(s0)
    lbu         t0, -1(s1)
    sll         s3, s3, 1
    addu        v0, v0, t2
    addu        t0, t0, t3
    addu        t0, t0, v0
    addu        s3, t0, s3
    madd        $ac1, s3, t7
    extr_r.w    t2, $ac1, 16
    addiu       t8, t8, 1
    addiu       s2, s2, 2
    addiu       t9, t9, 2
    addiu       s0, s0, 2
    sb          t2, -1(t8)
    bne         s5, t8, 4b
    addiu       s1, s1, 2
    addiu       s5, s7, -2
    subu        s5, s5, s4
    addu        s5, s5, t8      /* end address */
5:
    lh          v0, 0(s2)
    lh          v1, 0(t9)
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0
    raddu.w.qb  s3, t0
    lbu         v0, -1(s2)
    lbu         v1, 2(s2)
    lbu         t0, -1(t9)
    lbu         t1, 2(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6
    addu        t0, t0, t1
    lbu         t2, 2(s0)
    addu        t0, t0, v0
    lbu         t3, 2(s1)
    addu        s3, t0, s3
    lbu         v0, -1(s0)
    lbu         t0, -1(s1)
    sll         s3, s3, 1
    addu        v0, v0, t2
    addu        t0, t0, t3
    lh          v1, 2(t9)
    addu        t0, t0, v0
    lh          v0, 2(s2)
    addu        s3, t0, s3
    lh          t0, 2(s0)
    lh          t1, 2(s1)
    madd        $ac1, s3, t7
    extr_r.w    t2, $ac1, 16
    ins         t0, t1, 16, 16
    ins         v0, v1, 16, 16
    raddu.w.qb  s3, t0
    lbu         v1, 4(s2)
    lbu         t0, 1(t9)
    lbu         t1, 4(t9)
    sb          t2, 0(t8)
    raddu.w.qb  t3, v0
    lbu         v0, 1(s2)
    addu        t0, t0, t1
    mult        $ac1, t3, t6
    addu        v0, v0, v1
    lbu         t2, 4(s0)
    addu        t0, t0, v0
    lbu         v0, 1(s0)
    addu        s3, t0, s3
    lbu         t0, 1(s1)
    lbu         t3, 4(s1)
    addu        v0, v0, t2
    sll         s3, s3, 1
    addu        t0, t0, t3
    lh          v1, 4(t9)
    addu        t0, t0, v0
    lh          v0, 4(s2)
    addu        s3, t0, s3
    lh          t0, 4(s0)
    lh          t1, 4(s1)
    madd        $ac1, s3, t7
    extr_r.w    t2, $ac1, 16
    ins         t0, t1, 16, 16
    ins         v0, v1, 16, 16
    raddu.w.qb  s3, t0
    lbu         v1, 6(s2)
    lbu         t0, 3(t9)
    lbu         t1, 6(t9)
    sb          t2, 1(t8)
    raddu.w.qb  t3, v0
    lbu         v0, 3(s2)
    addu        t0, t0, t1
    mult        $ac1, t3, t6
    addu        v0, v0, v1
    lbu         t2, 6(s0)
    addu        t0, t0, v0
    lbu         v0, 3(s0)
    addu        s3, t0, s3
    lbu         t0, 3(s1)
    lbu         t3, 6(s1)
    addu        v0, v0, t2
    sll         s3, s3, 1
    addu        t0, t0, t3
    lh          v1, 6(t9)
    addu        t0, t0, v0
    lh          v0, 6(s2)
    addu        s3, t0, s3
    lh          t0, 6(s0)
    lh          t1, 6(s1)
    madd        $ac1, s3, t7
    extr_r.w    t3, $ac1, 16
    ins         t0, t1, 16, 16
    ins         v0, v1, 16, 16
    raddu.w.qb  s3, t0
    lbu         v1, 8(s2)
    lbu         t0, 5(t9)
    lbu         t1, 8(t9)
    sb          t3, 2(t8)
    raddu.w.qb  t2, v0
    lbu         v0, 5(s2)
    addu        t0, t0, t1
    mult        $ac1, t2, t6
    addu        v0, v0, v1
    lbu         t2, 8(s0)
    addu        t0, t0, v0
    lbu         v0, 5(s0)
    addu        s3, t0, s3
    lbu         t0, 5(s1)
    lbu         t3, 8(s1)
    addu        v0, v0, t2
    sll         s3, s3, 1
    addu        t0, t0, t3
    addiu       t8, t8, 4
    addu        t0, t0, v0
    addiu       s2, s2, 8
    addu        s3, t0, s3
    addiu       t9, t9, 8
    madd        $ac1, s3, t7
    extr_r.w    t1, $ac1, 16
    addiu       s0, s0, 8
    addiu       s1, s1, 8
    bne         s5, t8, 5b
    sb          t1, -1(t8)
/* Special case for last column */
    lh          v0, 0(s2)
    lh          v1, 0(t9)
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0
    raddu.w.qb  s3, t0
    lbu         v0, -1(s2)
    lbu         v1, 1(s2)
    lbu         t0, -1(t9)
    lbu         t1, 1(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6
    addu        t0, t0, t1
    lbu         t2, 1(s0)
    addu        t0, t0, v0
    lbu         t3, 1(s1)
    addu        s3, t0, s3
    lbu         v0, -1(s0)
    lbu         t0, -1(s1)
    sll         s3, s3, 1
    addu        v0, v0, t2
    addu        t0, t0, t3
    addu        t0, t0, v0
    addu        s3, t0, s3
    madd        $ac1, s3, t7
    extr_r.w    t0, $ac1, 16
    addiu       t5, t5, 2
    sb          t0, 0(t8)
    addiu       t4, t4, 1
    bne         t4, a2, 3b
    addiu       t5, t5, 2

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop

END(jsimd_h2v2_smooth_downsample_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_int_upsample_dspr2)
/*
 * a0     = upsample->h_expand[compptr->component_index]
 * a1     = upsample->v_expand[compptr->component_index]
 * a2     = input_data
 * a3     = output_data_ptr
 * 16(sp) = cinfo->output_width
 * 20(sp) = cinfo->max_v_samp_factor
 */
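/*
 * A hedged C sketch (illustrative, not the library's code): plain
 * integral upsampling replicates each input pixel h_expand times
 * horizontally, then duplicates each completed row v_expand - 1 times:
 *
 *   for (h = h_expand; h > 0; h--)
 *     *outptr++ = invalue;
 *   ...
 *   memcpy(output_data[outrow + v], output_data[outrow], output_width);
 *
 * The row duplication below copies 16 bytes per iteration with unaligned
 * word accesses (ulw/usw) and finishes with a byte loop for the
 * residual.
 */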
    .set at

    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    lw          s0, 0(a3)       /* s0 = output_data */
    lw          s1, 32(sp)      /* s1 = cinfo->output_width */
    lw          s2, 36(sp)      /* s2 = cinfo->max_v_samp_factor */
    li          t6, 0           /* t6 = inrow */
    beqz        s2, 10f
     li         s3, 0           /* s3 = outrow */
0:
    addu        t0, a2, t6
    addu        t7, s0, s3
    lw          t3, 0(t0)       /* t3 = inptr */
    lw          t8, 0(t7)       /* t8 = outptr */
    beqz        s1, 4f
     addu       t5, t8, s1      /* t5 = outend */
1:
    lb          t2, 0(t3)       /* t2 = invalue = *inptr++ */
    addiu       t3, 1
    beqz        a0, 3f
     move       t0, a0          /* t0 = h_expand */
2:
    sb          t2, 0(t8)
    addiu       t0, -1
    bgtz        t0, 2b
     addiu      t8, 1
3:
    bgt         t5, t8, 1b
     nop
4:
    addiu       t9, a1, -1      /* t9 = v_expand - 1 */
    blez        t9, 9f
     nop
5:
    lw          t3, 0(s0)
    lw          t4, 4(s0)
    subu        t0, s1, 0xF
    blez        t0, 7f
     addu       t5, t3, s1      /* t5 = end address */
    andi        t7, s1, 0xF     /* t7 = residual */
    subu        t8, t5, t7
6:
    ulw         t0, 0(t3)
    ulw         t1, 4(t3)
    ulw         t2, 8(t3)
    usw         t0, 0(t4)
    ulw         t0, 12(t3)
    usw         t1, 4(t4)
    usw         t2, 8(t4)
    usw         t0, 12(t4)
    addiu       t3, 16
    bne         t3, t8, 6b
     addiu      t4, 16
    beqz        t7, 8f
     nop
7:
    lbu         t0, 0(t3)
    sb          t0, 0(t4)
    addiu       t3, 1
    bne         t3, t5, 7b
     addiu      t4, 1
8:
    addiu       t9, -1
    bgtz        t9, 5b
     addiu      s0, 8
9:
    addu        s3, s3, a1
    bne         s3, s2, 0b
     addiu      t6, 1
10:
    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j           ra
     nop
END(jsimd_int_upsample_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_h2v1_upsample_dspr2)
/*
 * a0 = cinfo->max_v_samp_factor
 * a1 = cinfo->output_width
 * a2 = input_data
 * a3 = output_data_ptr
 */
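/*
 * A hedged C sketch (illustrative, not the library's code): each input
 * byte is doubled horizontally,
 *
 *   outptr[2*col] = outptr[2*col + 1] = inptr[col];
 *
 * The SIMD loop below expands eight input bytes into sixteen output
 * bytes per iteration, using ins to replicate bytes within words; a
 * residual loop covers the remaining output_width % 16 bytes.
 */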
1692    lw          t7, 0(a3)       /* t7 = output_data */
1693    andi        t8, a1, 0xf     /* t8 = residual */
1694    sll         t0, a0, 2
1695    blez        a0, 4f
1696     addu       t9, t7, t0      /* t9 = output_data end address */
16970:
1698    lw          t5, 0(t7)       /* t5 = outptr */
1699    lw          t6, 0(a2)       /* t6 = inptr */
1700    addu        t3, t5, a1      /* t3 = outptr + output_width (end address) */
1701    subu        t3, t8          /* t3 = end address - residual */
1702    beq         t5, t3, 2f
1703     move       t4, t8
17041:
1705    ulw         t0, 0(t6)       /* t0 = |P3|P2|P1|P0| */
1706    ulw         t2, 4(t6)       /* t2 = |P7|P6|P5|P4| */
1707    srl         t1, t0, 16      /* t1 = |X|X|P3|P2| */
1708    ins         t0, t0, 16, 16  /* t0 = |P1|P0|P1|P0| */
1709    ins         t1, t1, 16, 16  /* t1 = |P3|P2|P3|P2| */
1710    ins         t0, t0, 8, 16   /* t0 = |P1|P1|P0|P0| */
1711    ins         t1, t1, 8, 16   /* t1 = |P3|P3|P2|P2| */
1712    usw         t0, 0(t5)
1713    usw         t1, 4(t5)
1714    srl         t0, t2, 16      /* t0 = |X|X|P7|P6| */
1715    ins         t2, t2, 16, 16  /* t2 = |P5|P4|P5|P4| */
1716    ins         t0, t0, 16, 16  /* t0 = |P7|P6|P7|P6| */
1717    ins         t2, t2, 8, 16   /* t2 = |P5|P5|P4|P4| */
1718    ins         t0, t0, 8, 16   /* t0 = |P7|P7|P6|P6| */
1719    usw         t2, 8(t5)
1720    usw         t0, 12(t5)
1721    addiu       t5, 16
1722    bne         t5, t3, 1b
1723     addiu      t6, 8
1724    beqz        t8, 3f
1725     move       t4, t8
17262:
1727    lbu         t1, 0(t6)
1728    sb          t1, 0(t5)
1729    sb          t1, 1(t5)
1730    addiu       t4, -2
1731    addiu       t6, 1
1732    bgtz        t4, 2b
1733     addiu      t5, 2
17343:
1735    addiu       t7, 4
1736    bne         t9, t7, 0b
1737     addiu      a2, 4
17384:
1739    j           ra
1740     nop
1741END(jsimd_h2v1_upsample_dspr2)
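
/*
 * Reference sketch in C (illustrative, not the upstream h2v1_upsample()
 * source): each input pixel is simply doubled horizontally. The SIMD
 * body above produces 16 output pixels per iteration: two unaligned
 * word loads pull in 8 input pixels and the srl/ins pairs duplicate
 * each byte in place before the four usw stores.
 *
 *   void h2v1_upsample_ref(int max_v_samp_factor,
 *                          unsigned int output_width,
 *                          unsigned char **input_data,
 *                          unsigned char **output_data)
 *   {
 *     for (int row = 0; row < max_v_samp_factor; row++) {
 *       const unsigned char *inptr = input_data[row];
 *       unsigned char *outptr = output_data[row];
 *       for (unsigned int col = 0; col < output_width; col += 2)
 *         outptr[col] = outptr[col + 1] = *inptr++;  // double each pixel
 *     }
 *   }
 */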


/*****************************************************************************/
LEAF_DSPR2(jsimd_h2v2_upsample_dspr2)
/*
 * a0 = cinfo->max_v_samp_factor
 * a1 = cinfo->output_width
 * a2 = input_data
 * a3 = output_data_ptr
 */
    lw          t7, 0(a3)       /* t7 = output_data */
    blez        a0, 7f
     andi       t9, a1, 0xf     /* t9 = residual */
0:
    lw          t6, 0(a2)       /* t6 = inptr */
    lw          t5, 0(t7)       /* t5 = outptr */
    addu        t8, t5, a1      /* t8 = outptr end address */
    subu        t8, t9          /* t8 = end address - residual */
    beq         t5, t8, 2f
     move       t4, t9
1:
    ulw         t0, 0(t6)
    srl         t1, t0, 16
    ins         t0, t0, 16, 16
    ins         t0, t0, 8, 16
    ins         t1, t1, 16, 16
    ins         t1, t1, 8, 16
    ulw         t2, 4(t6)
    usw         t0, 0(t5)
    usw         t1, 4(t5)
    srl         t3, t2, 16
    ins         t2, t2, 16, 16
    ins         t2, t2, 8, 16
    ins         t3, t3, 16, 16
    ins         t3, t3, 8, 16
    usw         t2, 8(t5)
    usw         t3, 12(t5)
    addiu       t5, 16
    bne         t5, t8, 1b
     addiu      t6, 8
    beqz        t9, 3f
     move       t4, t9
2:
    lbu         t0, 0(t6)
    sb          t0, 0(t5)
    sb          t0, 1(t5)
    addiu       t4, -2
    addiu       t6, 1
    bgtz        t4, 2b
     addiu      t5, 2
3:
    lw          t6, 0(t7)       /* t6 = outptr[0] */
    lw          t5, 4(t7)       /* t5 = outptr[1] */
    addu        t4, t6, a1      /* t4 = new end address */
    beq         a1, t9, 5f
     subu       t8, t4, t9
4:
    ulw         t0, 0(t6)
    ulw         t1, 4(t6)
    ulw         t2, 8(t6)
    usw         t0, 0(t5)
    ulw         t0, 12(t6)
    usw         t1, 4(t5)
    usw         t2, 8(t5)
    usw         t0, 12(t5)
    addiu       t6, 16
    bne         t6, t8, 4b
     addiu      t5, 16
    beqz        t9, 6f
     nop
5:
    lbu         t0, 0(t6)
    sb          t0, 0(t5)
    addiu       t6, 1
    bne         t6, t4, 5b
     addiu      t5, 1
6:
    addiu       t7, 8
    addiu       a0, -2
    bgtz        a0, 0b
     addiu      a2, 4
7:
    j           ra
     nop
END(jsimd_h2v2_upsample_dspr2)
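
/*
 * Reference sketch in C (illustrative, not the upstream h2v2_upsample()
 * source): pixels are doubled horizontally, and each expanded row is
 * then duplicated vertically. That split is visible above: loops 1/2
 * double into outptr[0], and loops 4/5 copy outptr[0] into outptr[1].
 *
 *   void h2v2_upsample_ref(int max_v_samp_factor,
 *                          unsigned int output_width,
 *                          unsigned char **input_data,
 *                          unsigned char **output_data)
 *   {
 *     for (int outrow = 0, inrow = 0; outrow < max_v_samp_factor;
 *          outrow += 2, inrow++) {
 *       const unsigned char *inptr = input_data[inrow];
 *       unsigned char *outptr = output_data[outrow];
 *       for (unsigned int col = 0; col < output_width; col += 2)
 *         outptr[col] = outptr[col + 1] = *inptr++;  // double horizontally
 *       for (unsigned int col = 0; col < output_width; col++)
 *         output_data[outrow + 1][col] = outptr[col];  // duplicate the row
 *     }
 *   }
 */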


/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_islow_dspr2)
/*
 * a0 = coef_block
 * a1 = compptr->dct_table
 * a2 = output
 * a3 = range_limit
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    addiu       sp, sp, -256
    move        v0, sp
    addiu       v1, zero, 8     /* v1 = DCTSIZE = 8 */
1:
    lh          s4, 32(a0)      /* s4 = inptr[16] */
    lh          s5, 64(a0)      /* s5 = inptr[32] */
    lh          s6, 96(a0)      /* s6 = inptr[48] */
    lh          t1, 112(a0)     /* t1 = inptr[56] */
    lh          t7, 16(a0)      /* t7 = inptr[8] */
    lh          t5, 80(a0)      /* t5 = inptr[40] */
    lh          t3, 48(a0)      /* t3 = inptr[24] */
    or          s4, s4, t1
    or          s4, s4, t3
    or          s4, s4, t5
    or          s4, s4, t7
    or          s4, s4, s5
    or          s4, s4, s6
    bnez        s4, 2f
     addiu      v1, v1, -1
    lh          s5, 0(a1)       /* quantptr[DCTSIZE*0] */
    lh          s6, 0(a0)       /* inptr[DCTSIZE*0] */
    mul         s5, s5, s6      /* DEQUANTIZE(inptr[0], quantptr[0]) */
    sll         s5, s5, 2
    sw          s5, 0(v0)
    sw          s5, 32(v0)
    sw          s5, 64(v0)
    sw          s5, 96(v0)
    sw          s5, 128(v0)
    sw          s5, 160(v0)
    sw          s5, 192(v0)
    b           3f
     sw         s5, 224(v0)
2:
    lh          t0, 112(a1)
    lh          t2, 48(a1)
    lh          t4, 80(a1)
    lh          t6, 16(a1)
    mul         t0, t0, t1      /* DEQUANTIZE(inptr[DCTSIZE*7],
                                              quantptr[DCTSIZE*7]) */
    mul         t1, t2, t3      /* DEQUANTIZE(inptr[DCTSIZE*3],
                                              quantptr[DCTSIZE*3]) */
    mul         t2, t4, t5      /* DEQUANTIZE(inptr[DCTSIZE*5],
                                              quantptr[DCTSIZE*5]) */
    mul         t3, t6, t7      /* DEQUANTIZE(inptr[DCTSIZE*1],
                                              quantptr[DCTSIZE*1]) */
    lh          t4, 32(a1)
    lh          t5, 32(a0)
    lh          t6, 96(a1)
    lh          t7, 96(a0)
    addu        s0, t0, t1       /* z3 = tmp0 + tmp2 */
    addu        s1, t1, t2       /* z2 = tmp1 + tmp2 */
    addu        s2, t2, t3       /* z4 = tmp1 + tmp3 */
    addu        s3, s0, s2       /* z3 + z4 */
    addiu       t9, zero, 9633   /* FIX_1_175875602 */
    mul         s3, s3, t9       /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    addu        t8, t0, t3       /* z1 = tmp0 + tmp3 */
    addiu       t9, zero, 2446   /* FIX_0_298631336 */
    mul         t0, t0, t9       /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    addiu       t9, zero, 16819  /* FIX_2_053119869 */
    mul         t2, t2, t9       /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    addiu       t9, zero, 25172  /* FIX_3_072711026 */
    mul         t1, t1, t9       /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    addiu       t9, zero, 12299  /* FIX_1_501321110 */
    mul         t3, t3, t9       /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    addiu       t9, zero, 16069  /* FIX_1_961570560 */
    mul         s0, s0, t9       /* -z3 = MULTIPLY(z3, FIX_1_961570560) */
    addiu       t9, zero, 3196   /* FIX_0_390180644 */
    mul         s2, s2, t9       /* -z4 = MULTIPLY(z4, FIX_0_390180644) */
    addiu       t9, zero, 7373   /* FIX_0_899976223 */
    mul         t8, t8, t9       /* -z1 = MULTIPLY(z1, FIX_0_899976223) */
    addiu       t9, zero, 20995  /* FIX_2_562915447 */
    mul         s1, s1, t9       /* -z2 = MULTIPLY(z2, FIX_2_562915447) */
    subu        s0, s3, s0       /* z3 += z5 */
    addu        t0, t0, s0       /* tmp0 += z3 */
    addu        t1, t1, s0       /* tmp2 += z3 */
    subu        s2, s3, s2       /* z4 += z5 */
    addu        t2, t2, s2       /* tmp1 += z4 */
    addu        t3, t3, s2       /* tmp3 += z4 */
    subu        t0, t0, t8       /* tmp0 += z1 */
    subu        t1, t1, s1       /* tmp2 += z2 */
    subu        t2, t2, s1       /* tmp1 += z2 */
    subu        t3, t3, t8       /* tmp3 += z1 */
    mul         s0, t4, t5       /* DEQUANTIZE(inptr[DCTSIZE*2],
                                               quantptr[DCTSIZE*2]) */
    addiu       t9, zero, 6270   /* FIX_0_765366865 */
    mul         s1, t6, t7       /* DEQUANTIZE(inptr[DCTSIZE*6],
                                               quantptr[DCTSIZE*6]) */
    lh          t4, 0(a1)
    lh          t5, 0(a0)
    lh          t6, 64(a1)
    lh          t7, 64(a0)
    mul         s2, t9, s0       /* MULTIPLY(z2, FIX_0_765366865) */
    mul         t5, t4, t5       /* DEQUANTIZE(inptr[DCTSIZE*0],
                                               quantptr[DCTSIZE*0]) */
    mul         t6, t6, t7       /* DEQUANTIZE(inptr[DCTSIZE*4],
                                               quantptr[DCTSIZE*4]) */
    addiu       t9, zero, 4433   /* FIX_0_541196100 */
    addu        s3, s0, s1       /* z2 + z3 */
    mul         s3, s3, t9       /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */
    addiu       t9, zero, 15137  /* FIX_1_847759065 */
    mul         t8, s1, t9       /* MULTIPLY(z3, FIX_1_847759065) */
    addu        t4, t5, t6
    subu        t5, t5, t6
    sll         t4, t4, 13      /* tmp0 = (z2 + z3) << CONST_BITS */
    sll         t5, t5, 13      /* tmp1 = (z2 - z3) << CONST_BITS */
    addu        t7, s3, s2      /* tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) */
    subu        t6, s3, t8      /* tmp2 =
                                     z1 + MULTIPLY(z3, -FIX_1_847759065) */
    addu        s0, t4, t7
    subu        s1, t4, t7
    addu        s2, t5, t6
    subu        s3, t5, t6
    addu        t4, s0, t3
    subu        s0, s0, t3
    addu        t3, s2, t1
    subu        s2, s2, t1
    addu        t1, s3, t2
    subu        s3, s3, t2
    addu        t2, s1, t0
    subu        s1, s1, t0
    shra_r.w    t4, t4, 11
    shra_r.w    t3, t3, 11
    shra_r.w    t1, t1, 11
    shra_r.w    t2, t2, 11
    shra_r.w    s1, s1, 11
    shra_r.w    s3, s3, 11
    shra_r.w    s2, s2, 11
    shra_r.w    s0, s0, 11
    sw          t4, 0(v0)
    sw          t3, 32(v0)
    sw          t1, 64(v0)
    sw          t2, 96(v0)
    sw          s1, 128(v0)
    sw          s3, 160(v0)
    sw          s2, 192(v0)
    sw          s0, 224(v0)
3:
    addiu       a1, a1, 2
    addiu       a0, a0, 2
    bgtz        v1, 1b
     addiu      v0, v0, 4
    move        v0, sp
    addiu       v1, zero, 8
4:
    lw          t0, 8(v0)       /* z2 = (JLONG)wsptr[2] */
    lw          t1, 24(v0)      /* z3 = (JLONG)wsptr[6] */
    lw          t2, 0(v0)       /* (JLONG)wsptr[0] */
    lw          t3, 16(v0)      /* (JLONG)wsptr[4] */
    lw          s4, 4(v0)       /* (JLONG)wsptr[1] */
    lw          s5, 12(v0)      /* (JLONG)wsptr[3] */
    lw          s6, 20(v0)      /* (JLONG)wsptr[5] */
    lw          s7, 28(v0)      /* (JLONG)wsptr[7] */
    or          s4, s4, t0
    or          s4, s4, t1
    or          s4, s4, t3
    or          s4, s4, s7
    or          s4, s4, s5
    or          s4, s4, s6
    bnez        s4, 5f
     addiu      v1, v1, -1
    shra_r.w    s5, t2, 5
    andi        s5, s5, 0x3ff
    lbux        s5, s5(a3)
    lw          s1, 0(a2)
    replv.qb    s5, s5
    usw         s5, 0(s1)
    usw         s5, 4(s1)
    b           6f
     nop
5:
    addu        t4, t0, t1       /* z2 + z3 */
    addiu       t8, zero, 4433   /* FIX_0_541196100 */
    mul         t5, t4, t8       /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */
    addiu       t8, zero, 15137  /* FIX_1_847759065 */
    mul         t1, t1, t8       /* MULTIPLY(z3, FIX_1_847759065) */
    addiu       t8, zero, 6270   /* FIX_0_765366865 */
    mul         t0, t0, t8       /* MULTIPLY(z2, FIX_0_765366865) */
    addu        t4, t2, t3       /* (JLONG)wsptr[0] + (JLONG)wsptr[4] */
    subu        t2, t2, t3       /* (JLONG)wsptr[0] - (JLONG)wsptr[4] */
    sll         t4, t4, 13       /* tmp0 =
                                      (wsptr[0] + wsptr[4]) << CONST_BITS */
    sll         t2, t2, 13       /* tmp1 =
                                      (wsptr[0] - wsptr[4]) << CONST_BITS */
    subu        t1, t5, t1       /* tmp2 =
                                      z1 + MULTIPLY(z3, -FIX_1_847759065) */
    subu        t3, t2, t1       /* tmp12 = tmp1 - tmp2 */
    addu        t2, t2, t1       /* tmp11 = tmp1 + tmp2 */
    addu        t5, t5, t0       /* tmp3 =
                                      z1 + MULTIPLY(z2, FIX_0_765366865) */
    subu        t1, t4, t5       /* tmp13 = tmp0 - tmp3 */
    addu        t0, t4, t5       /* tmp10 = tmp0 + tmp3 */
    lw          t4, 28(v0)       /* tmp0 = (JLONG)wsptr[7] */
    lw          t6, 12(v0)       /* tmp2 = (JLONG)wsptr[3] */
    lw          t5, 20(v0)       /* tmp1 = (JLONG)wsptr[5] */
    lw          t7, 4(v0)        /* tmp3 = (JLONG)wsptr[1] */
    addu        s0, t4, t6       /* z3 = tmp0 + tmp2 */
    addiu       t8, zero, 9633   /* FIX_1_175875602 */
    addu        s1, t5, t7       /* z4 = tmp1 + tmp3 */
    addu        s2, s0, s1       /* z3 + z4 */
    mul         s2, s2, t8       /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    addu        s3, t4, t7       /* z1 = tmp0 + tmp3 */
    addu        t9, t5, t6       /* z2 = tmp1 + tmp2 */
    addiu       t8, zero, 16069  /* FIX_1_961570560 */
    mul         s0, s0, t8       /* -z3 = MULTIPLY(z3, FIX_1_961570560) */
    addiu       t8, zero, 3196   /* FIX_0_390180644 */
    mul         s1, s1, t8       /* -z4 = MULTIPLY(z4, FIX_0_390180644) */
    addiu       t8, zero, 2446   /* FIX_0_298631336 */
    mul         t4, t4, t8       /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    addiu       t8, zero, 7373   /* FIX_0_899976223 */
    mul         s3, s3, t8       /* -z1 = MULTIPLY(z1, FIX_0_899976223) */
    addiu       t8, zero, 16819  /* FIX_2_053119869 */
    mul         t5, t5, t8       /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    addiu       t8, zero, 20995  /* FIX_2_562915447 */
    mul         t9, t9, t8       /* -z2 = MULTIPLY(z2, FIX_2_562915447) */
    addiu       t8, zero, 25172  /* FIX_3_072711026 */
    mul         t6, t6, t8       /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    addiu       t8, zero, 12299  /* FIX_1_501321110 */
    mul         t7, t7, t8       /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    subu        s0, s2, s0       /* z3 += z5 */
    subu        s1, s2, s1       /* z4 += z5 */
    addu        t4, t4, s0
    subu        t4, t4, s3      /* tmp0 */
    addu        t5, t5, s1
    subu        t5, t5, t9      /* tmp1 */
    addu        t6, t6, s0
    subu        t6, t6, t9      /* tmp2 */
    addu        t7, t7, s1
    subu        t7, t7, s3      /* tmp3 */
    addu        s0, t0, t7
    subu        t0, t0, t7
    addu        t7, t2, t6
    subu        t2, t2, t6
    addu        t6, t3, t5
    subu        t3, t3, t5
    addu        t5, t1, t4
    subu        t1, t1, t4
    shra_r.w    s0, s0, 18
    shra_r.w    t7, t7, 18
    shra_r.w    t6, t6, 18
    shra_r.w    t5, t5, 18
    shra_r.w    t1, t1, 18
    shra_r.w    t3, t3, 18
    shra_r.w    t2, t2, 18
    shra_r.w    t0, t0, 18
    andi        s0, s0, 0x3ff
    andi        t7, t7, 0x3ff
    andi        t6, t6, 0x3ff
    andi        t5, t5, 0x3ff
    andi        t1, t1, 0x3ff
    andi        t3, t3, 0x3ff
    andi        t2, t2, 0x3ff
    andi        t0, t0, 0x3ff
    lw          s1, 0(a2)
    lbux        s0, s0(a3)
    lbux        t7, t7(a3)
    lbux        t6, t6(a3)
    lbux        t5, t5(a3)
    lbux        t1, t1(a3)
    lbux        t3, t3(a3)
    lbux        t2, t2(a3)
    lbux        t0, t0(a3)
    sb          s0, 0(s1)
    sb          t7, 1(s1)
    sb          t6, 2(s1)
    sb          t5, 3(s1)
    sb          t1, 4(s1)
    sb          t3, 5(s1)
    sb          t2, 6(s1)
    sb          t0, 7(s1)
6:
    addiu       v0, v0, 32
    bgtz        v1, 4b
     addiu      a2, a2, 4
    addiu       sp, sp, 256

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop

END(jsimd_idct_islow_dspr2)
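
/*
 * Notes on the fixed-point scheme above (a sketch of the jidctint.c
 * arithmetic this routine follows; the macro spellings are ours):
 *
 *   #define CONST_BITS  13
 *   #define PASS1_BITS  2
 *   #define FIX(x)      ((int)((x) * (1 << CONST_BITS) + 0.5))
 *   // e.g. FIX(1.175875602) == 9633 and FIX(0.541196100) == 4433,
 *   // matching the addiu immediates loaded into t9/t8 above.
 *
 *   static int descale(long x, int n) {   // round to nearest, shift right
 *     return (int)((x + (1L << (n - 1))) >> n);
 *   }
 *
 * The first loop (labels 1-3) keeps results scaled up by PASS1_BITS and
 * so descales by CONST_BITS - PASS1_BITS = 11 (the shra_r.w ..., 11
 * instructions). The second loop (labels 4-6) descales by CONST_BITS +
 * PASS1_BITS + 3 = 18, masks with 0x3ff (RANGE_MASK), and looks each
 * value up in range_limit (lbux with a3), i.e. the equivalent of
 * range_limit[descale(v, 18) & 0x3ff] in C.
 */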


/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2)
/*
 * a0 = inptr
 * a1 = quantptr
 * a2 = wsptr
 * a3 = mips_idct_ifast_coefs
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    addiu         t9, a0, 16      /* end address */
    or            AT, a3, zero    /* AT = mips_idct_ifast_coefs */

0:
    lw            s0, 0(a1)       /* quantptr[DCTSIZE*0] */
    lw            t0, 0(a0)       /* inptr[DCTSIZE*0] */
    lw            t1, 16(a0)      /* inptr[DCTSIZE*1] */
    muleq_s.w.phl v0, t0, s0      /* tmp0 ... */
    lw            t2, 32(a0)      /* inptr[DCTSIZE*2] */
    lw            t3, 48(a0)      /* inptr[DCTSIZE*3] */
    lw            t4, 64(a0)      /* inptr[DCTSIZE*4] */
    lw            t5, 80(a0)      /* inptr[DCTSIZE*5] */
    muleq_s.w.phr t0, t0, s0      /* ... tmp0 ... */
    lw            t6, 96(a0)      /* inptr[DCTSIZE*6] */
    lw            t7, 112(a0)     /* inptr[DCTSIZE*7] */
    or            s4, t1, t2
    or            s5, t3, t4
    bnez          s4, 1f
     ins          t0, v0, 16, 16  /* ... tmp0 */
    bnez          s5, 1f
     or           s6, t5, t6
    or            s6, s6, t7
    bnez          s6, 1f
     sw           t0, 0(a2)       /* wsptr[DCTSIZE*0] */
    sw            t0, 16(a2)      /* wsptr[DCTSIZE*1] */
    sw            t0, 32(a2)      /* wsptr[DCTSIZE*2] */
    sw            t0, 48(a2)      /* wsptr[DCTSIZE*3] */
    sw            t0, 64(a2)      /* wsptr[DCTSIZE*4] */
    sw            t0, 80(a2)      /* wsptr[DCTSIZE*5] */
    sw            t0, 96(a2)      /* wsptr[DCTSIZE*6] */
    sw            t0, 112(a2)     /* wsptr[DCTSIZE*7] */
    addiu         a0, a0, 4
    b             2f
     addiu        a1, a1, 4

1:
    lw            s1, 32(a1)      /* quantptr[DCTSIZE*2] */
    lw            s2, 64(a1)      /* quantptr[DCTSIZE*4] */
    muleq_s.w.phl v0, t2, s1      /* tmp1 ... */
    muleq_s.w.phr t2, t2, s1      /* ... tmp1 ... */
    lw            s0, 16(a1)      /* quantptr[DCTSIZE*1] */
    lw            s1, 48(a1)      /* quantptr[DCTSIZE*3] */
    lw            s3, 96(a1)      /* quantptr[DCTSIZE*6] */
    muleq_s.w.phl v1, t4, s2      /* tmp2 ... */
    muleq_s.w.phr t4, t4, s2      /* ... tmp2 ... */
    lw            s2, 80(a1)      /* quantptr[DCTSIZE*5] */
    lw            t8, 4(AT)       /* FIX(1.414213562) */
    ins           t2, v0, 16, 16  /* ... tmp1 */
    muleq_s.w.phl v0, t6, s3      /* tmp3 ... */
    muleq_s.w.phr t6, t6, s3      /* ... tmp3 ... */
    ins           t4, v1, 16, 16  /* ... tmp2 */
    addq.ph       s4, t0, t4      /* tmp10 */
    subq.ph       s5, t0, t4      /* tmp11 */
    ins           t6, v0, 16, 16  /* ... tmp3 */
    subq.ph       s6, t2, t6      /* tmp12 ... */
    addq.ph       s7, t2, t6      /* tmp13 */
    mulq_s.ph     s6, s6, t8      /* ... tmp12 ... */
    addq.ph       t0, s4, s7      /* tmp0 */
    subq.ph       t6, s4, s7      /* tmp3 */
    muleq_s.w.phl v0, t1, s0      /* tmp4 ... */
    muleq_s.w.phr t1, t1, s0      /* ... tmp4 ... */
    shll_s.ph     s6, s6, 1       /* x2 */
    lw            s3, 112(a1)     /* quantptr[DCTSIZE*7] */
    subq.ph       s6, s6, s7      /* ... tmp12 */
    muleq_s.w.phl v1, t7, s3      /* tmp7 ... */
    muleq_s.w.phr t7, t7, s3      /* ... tmp7 ... */
    ins           t1, v0, 16, 16  /* ... tmp4 */
    addq.ph       t2, s5, s6      /* tmp1 */
    subq.ph       t4, s5, s6      /* tmp2 */
    muleq_s.w.phl v0, t5, s2      /* tmp6 ... */
    muleq_s.w.phr t5, t5, s2      /* ... tmp6 ... */
    ins           t7, v1, 16, 16  /* ... tmp7 */
    addq.ph       s5, t1, t7      /* z11 */
    subq.ph       s6, t1, t7      /* z12 */
    muleq_s.w.phl v1, t3, s1      /* tmp5 ... */
    muleq_s.w.phr t3, t3, s1      /* ... tmp5 ... */
    ins           t5, v0, 16, 16  /* ... tmp6 */
    ins           t3, v1, 16, 16  /* ... tmp5 */
    addq.ph       s7, t5, t3      /* z13 */
    subq.ph       v0, t5, t3      /* z10 */
    addq.ph       t7, s5, s7      /* tmp7 */
    subq.ph       s5, s5, s7      /* tmp11 ... */
    addq.ph       v1, v0, s6      /* z5 ... */
    mulq_s.ph     s5, s5, t8      /* ... tmp11 */
    lw            t8, 8(AT)       /* FIX(1.847759065) */
    lw            s4, 0(AT)       /* FIX(1.082392200) */
    addq.ph       s0, t0, t7
    subq.ph       s1, t0, t7
    mulq_s.ph     v1, v1, t8      /* ... z5 */
    shll_s.ph     s5, s5, 1       /* x2 */
    lw            t8, 12(AT)      /* FIX(-2.613125930) */
    sw            s0, 0(a2)       /* wsptr[DCTSIZE*0] */
    shll_s.ph     v0, v0, 1       /* x4 */
    mulq_s.ph     v0, v0, t8      /* tmp12 ... */
    mulq_s.ph     s4, s6, s4      /* tmp10 ... */
    shll_s.ph     v1, v1, 1       /* x2 */
    addiu         a0, a0, 4
    addiu         a1, a1, 4
    sw            s1, 112(a2)     /* wsptr[DCTSIZE*7] */
    shll_s.ph     s6, v0, 1       /* x4 */
    shll_s.ph     s4, s4, 1       /* x2 */
    addq.ph       s6, s6, v1      /* ... tmp12 */
    subq.ph       t5, s6, t7      /* tmp6 */
    subq.ph       s4, s4, v1      /* ... tmp10 */
    subq.ph       t3, s5, t5      /* tmp5 */
    addq.ph       s2, t2, t5
    addq.ph       t1, s4, t3      /* tmp4 */
    subq.ph       s3, t2, t5
    sw            s2, 16(a2)      /* wsptr[DCTSIZE*1] */
    sw            s3, 96(a2)      /* wsptr[DCTSIZE*6] */
    addq.ph       v0, t4, t3
    subq.ph       v1, t4, t3
    sw            v0, 32(a2)      /* wsptr[DCTSIZE*2] */
    sw            v1, 80(a2)      /* wsptr[DCTSIZE*5] */
    addq.ph       v0, t6, t1
    subq.ph       v1, t6, t1
    sw            v0, 64(a2)      /* wsptr[DCTSIZE*4] */
    sw            v1, 48(a2)      /* wsptr[DCTSIZE*3] */

2:
    bne           a0, t9, 0b
     addiu        a2, a2, 4

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j             ra
     nop

END(jsimd_idct_ifast_cols_dspr2)
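
/*
 * The loop above is pass 1 of the AAN "ifast" IDCT (jidctfst.c), run on
 * two columns at a time as packed 16-bit pairs. The coefficient table
 * (AT) holds the four AAN constants pre-scaled for the Q15 fractional
 * multiply mulq_s.ph, whose halved results are compensated by the
 * shll_s.ph "x2"/"x4" steps. A scalar C sketch of the even-part
 * butterfly being computed (names follow jidctfst.c, where MULTIPLY is
 * its fixed-point multiply by FIX(x)):
 *
 *   tmp10 = tmp0 + tmp2;
 *   tmp11 = tmp0 - tmp2;
 *   tmp13 = tmp1 + tmp3;
 *   tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13;
 */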


/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2)
/*
 * a0 = wsptr
 * a1 = output_buf
 * a2 = output_col
 * a3 = mips_idct_ifast_coefs
 */
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3

    addiu         t9, a0, 128     /* end address */
    lui           s8, 0x8080
    ori           s8, s8, 0x8080  /* s8 = 0x80808080 (+128 level shift) */

0:
    lw            AT, 36(sp)      /* restore $a3 (mips_idct_ifast_coefs) */
    lw            t0, 0(a0)       /* wsptr[DCTSIZE*0+0/1]  b a */
    lw            s0, 16(a0)      /* wsptr[DCTSIZE*1+0/1]  B A */
    lw            t2, 4(a0)       /* wsptr[DCTSIZE*0+2/3]  d c */
    lw            s2, 20(a0)      /* wsptr[DCTSIZE*1+2/3]  D C */
    lw            t4, 8(a0)       /* wsptr[DCTSIZE*0+4/5]  f e */
    lw            s4, 24(a0)      /* wsptr[DCTSIZE*1+4/5]  F E */
    lw            t6, 12(a0)      /* wsptr[DCTSIZE*0+6/7]  h g */
    lw            s6, 28(a0)      /* wsptr[DCTSIZE*1+6/7]  H G */
    precrq.ph.w   t1, s0, t0      /* B b */
    ins           t0, s0, 16, 16  /* A a */
    bnez          t1, 1f
     or           s0, t2, s2
    bnez          s0, 1f
     or           s0, t4, s4
    bnez          s0, 1f
     or           s0, t6, s6
    bnez          s0, 1f
     shll_s.ph    s0, t0, 2       /* A a */
    lw            a3, 0(a1)
    lw            AT, 4(a1)
    precrq.ph.w   t0, s0, s0      /* A A */
    ins           s0, s0, 16, 16  /* a a */
    addu          a3, a3, a2
    addu          AT, AT, a2
    precrq.qb.ph  t0, t0, t0      /* A A A A */
    precrq.qb.ph  s0, s0, s0      /* a a a a */
    addu.qb       s0, s0, s8
    addu.qb       t0, t0, s8
    sw            s0, 0(a3)
    sw            s0, 4(a3)
    sw            t0, 0(AT)
    sw            t0, 4(AT)
    addiu         a0, a0, 32
    bne           a0, t9, 0b
     addiu        a1, a1, 8
    b             2f
     nop

1:
    precrq.ph.w   t3, s2, t2
    ins           t2, s2, 16, 16
    precrq.ph.w   t5, s4, t4
    ins           t4, s4, 16, 16
    precrq.ph.w   t7, s6, t6
    ins           t6, s6, 16, 16
    lw            t8, 4(AT)       /* FIX(1.414213562) */
    addq.ph       s4, t0, t4      /* tmp10 */
    subq.ph       s5, t0, t4      /* tmp11 */
    subq.ph       s6, t2, t6      /* tmp12 ... */
    addq.ph       s7, t2, t6      /* tmp13 */
    mulq_s.ph     s6, s6, t8      /* ... tmp12 ... */
    addq.ph       t0, s4, s7      /* tmp0 */
    subq.ph       t6, s4, s7      /* tmp3 */
    shll_s.ph     s6, s6, 1       /* x2 */
    subq.ph       s6, s6, s7      /* ... tmp12 */
    addq.ph       t2, s5, s6      /* tmp1 */
    subq.ph       t4, s5, s6      /* tmp2 */
    addq.ph       s5, t1, t7      /* z11 */
    subq.ph       s6, t1, t7      /* z12 */
    addq.ph       s7, t5, t3      /* z13 */
    subq.ph       v0, t5, t3      /* z10 */
    addq.ph       t7, s5, s7      /* tmp7 */
    subq.ph       s5, s5, s7      /* tmp11 ... */
    addq.ph       v1, v0, s6      /* z5 ... */
    mulq_s.ph     s5, s5, t8      /* ... tmp11 */
    lw            t8, 8(AT)       /* FIX(1.847759065) */
    lw            s4, 0(AT)       /* FIX(1.082392200) */
    addq.ph       s0, t0, t7      /* tmp0 + tmp7 */
    subq.ph       s7, t0, t7      /* tmp0 - tmp7 */
    mulq_s.ph     v1, v1, t8      /* ... z5 */
    lw            a3, 0(a1)
    lw            t8, 12(AT)      /* FIX(-2.613125930) */
    shll_s.ph     s5, s5, 1       /* x2 */
    addu          a3, a3, a2
    shll_s.ph     v0, v0, 1       /* x4 */
    mulq_s.ph     v0, v0, t8      /* tmp12 ... */
    mulq_s.ph     s4, s6, s4      /* tmp10 ... */
    shll_s.ph     v1, v1, 1       /* x2 */
    addiu         a0, a0, 32
    addiu         a1, a1, 8
    shll_s.ph     s6, v0, 1       /* x4 */
    shll_s.ph     s4, s4, 1       /* x2 */
    addq.ph       s6, s6, v1      /* ... tmp12 */
    shll_s.ph     s0, s0, 2
    subq.ph       t5, s6, t7      /* tmp6 */
    subq.ph       s4, s4, v1      /* ... tmp10 */
    subq.ph       t3, s5, t5      /* tmp5 */
    shll_s.ph     s7, s7, 2
    addq.ph       t1, s4, t3      /* tmp4 */
    addq.ph       s1, t2, t5      /* tmp1 + tmp6 */
    subq.ph       s6, t2, t5      /* tmp1 - tmp6 */
    addq.ph       s2, t4, t3      /* tmp2 + tmp5 */
    subq.ph       s5, t4, t3      /* tmp2 - tmp5 */
    addq.ph       s4, t6, t1      /* tmp3 + tmp4 */
    subq.ph       s3, t6, t1      /* tmp3 - tmp4 */
    shll_s.ph     s1, s1, 2
    shll_s.ph     s2, s2, 2
    shll_s.ph     s3, s3, 2
    shll_s.ph     s4, s4, 2
    shll_s.ph     s5, s5, 2
    shll_s.ph     s6, s6, 2
    precrq.ph.w   t0, s1, s0      /* B A */
    ins           s0, s1, 16, 16  /* b a */
    precrq.ph.w   t2, s3, s2      /* D C */
    ins           s2, s3, 16, 16  /* d c */
    precrq.ph.w   t4, s5, s4      /* F E */
    ins           s4, s5, 16, 16  /* f e */
    precrq.ph.w   t6, s7, s6      /* H G */
    ins           s6, s7, 16, 16  /* h g */
    precrq.qb.ph  t0, t2, t0      /* D C B A */
    precrq.qb.ph  s0, s2, s0      /* d c b a */
    precrq.qb.ph  t4, t6, t4      /* H G F E */
    precrq.qb.ph  s4, s6, s4      /* h g f e */
    addu.qb       s0, s0, s8
    addu.qb       s4, s4, s8
    sw            s0, 0(a3)       /* outptr[0/1/2/3]       d c b a */
    sw            s4, 4(a3)       /* outptr[4/5/6/7]       h g f e */
    lw            a3, -4(a1)
    addu.qb       t0, t0, s8
    addu          a3, a3, a2
    addu.qb       t4, t4, s8
    sw            t0, 0(a3)       /* outptr[0/1/2/3]       D C B A */
    bne           a0, t9, 0b
     sw           t4, 4(a3)       /* outptr[4/5/6/7]       H G F E */

2:

    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3

    j             ra
     nop

END(jsimd_idct_ifast_rows_dspr2)
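
/*
 * The row pass finishes the AAN IDCT and emits two pixel rows per
 * iteration. At the output stage, shll_s.ph ..., 2 saturates each
 * 16-bit result, precrq.qb.ph keeps the high byte of each halfword (a
 * saturating right shift by 8), and addu.qb with s8 = 0x80808080 adds
 * the +128 level shift. Roughly, per sample (sketch; the exact scaling
 * carried through the Q15 multiplies is approximate here):
 *
 *   t = sat16(v << 2);                  // shll_s.ph
 *   b = (unsigned char)(t >> 8);        // precrq.qb.ph
 *   out = (unsigned char)(b + 128);     // addu.qb with 0x80
 *
 * which plays the role of range_limit[DESCALE(v, PASS1_BITS + 3) &
 * RANGE_MASK] in the scalar jidctfst.c code.
 */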


/*****************************************************************************/
LEAF_DSPR2(jsimd_fdct_islow_dspr2)
/*
 * a0 = data
 */
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8

    /* packed 16-bit coefficient pairs for the dpa.w.ph dot products below */
    lui         t0, 6437
    ori         t0, 2260
    lui         t1, 9633
    ori         t1, 11363
    lui         t2, 0xd39e
    ori         t2, 0xe6dc
    lui         t3, 0xf72d
    ori         t3, 9633
    lui         t4, 2261
    ori         t4, 9633
    lui         t5, 0xd39e
    ori         t5, 6437
    lui         t6, 9633
    ori         t6, 0xd39d
    lui         t7, 0xe6dc
    ori         t7, 2260
    lui         t8, 4433
    ori         t8, 10703
    lui         t9, 0xd630
    ori         t9, 4433
    li          s8, 8
    move        a1, a0
1:
    lw          s0, 0(a1)       /* tmp0 = 1|0 */
    lw          s1, 4(a1)       /* tmp1 = 3|2 */
    lw          s2, 8(a1)       /* tmp2 = 5|4 */
    lw          s3, 12(a1)      /* tmp3 = 7|6 */
    packrl.ph   s1, s1, s1      /* tmp1 = 2|3 */
    packrl.ph   s3, s3, s3      /* tmp3 = 6|7 */
    subq.ph     s7, s1, s2      /* tmp7 = 2-5|3-4 = t5|t4 */
    subq.ph     s5, s0, s3      /* tmp5 = 1-6|0-7 = t6|t7 */
    mult        $0, $0          /* ac0  = 0 */
    dpa.w.ph    $ac0, s7, t0    /* ac0 += t5*  6437 + t4*  2260 */
    dpa.w.ph    $ac0, s5, t1    /* ac0 += t6*  9633 + t7* 11363 */
    mult        $ac1, $0, $0    /* ac1  = 0 */
    dpa.w.ph    $ac1, s7, t2    /* ac1 += t5*-11362 + t4* -6436 */
    dpa.w.ph    $ac1, s5, t3    /* ac1 += t6* -2259 + t7*  9633 */
    mult        $ac2, $0, $0    /* ac2  = 0 */
    dpa.w.ph    $ac2, s7, t4    /* ac2 += t5*  2261 + t4*  9633 */
    dpa.w.ph    $ac2, s5, t5    /* ac2 += t6*-11362 + t7*  6437 */
    mult        $ac3, $0, $0    /* ac3  = 0 */
    dpa.w.ph    $ac3, s7, t6    /* ac3 += t5*  9633 + t4*-11363 */
    dpa.w.ph    $ac3, s5, t7    /* ac3 += t6* -6436 + t7*  2260 */
    addq.ph     s6, s1, s2      /* tmp6 = 2+5|3+4 = t2|t3 */
    addq.ph     s4, s0, s3      /* tmp4 = 1+6|0+7 = t1|t0 */
    extr_r.w    s0, $ac0, 11    /* tmp0 = (ac0 + 1024) >> 11 */
    extr_r.w    s1, $ac1, 11    /* tmp1 = (ac1 + 1024) >> 11 */
    extr_r.w    s2, $ac2, 11    /* tmp2 = (ac2 + 1024) >> 11 */
    extr_r.w    s3, $ac3, 11    /* tmp3 = (ac3 + 1024) >> 11 */
    addq.ph     s5, s4, s6      /* tmp5 = t1+t2|t0+t3 = t11|t10 */
    subq.ph     s7, s4, s6      /* tmp7 = t1-t2|t0-t3 = t12|t13 */
    sh          s0, 2(a1)
    sh          s1, 6(a1)
    sh          s2, 10(a1)
    sh          s3, 14(a1)
    mult        $0, $0          /* ac0  = 0 */
    dpa.w.ph    $ac0, s7, t8    /* ac0 += t12*  4433 + t13* 10703 */
    mult        $ac1, $0, $0    /* ac1  = 0 */
    dpa.w.ph    $ac1, s7, t9    /* ac1 += t12*-10704 + t13*  4433 */
    sra         s4, s5, 16      /* tmp4 = t11 */
    addiu       a1, a1, 16
    addiu       s8, s8, -1
    extr_r.w    s0, $ac0, 11    /* tmp0 = (ac0 + 1024) >> 11 */
    extr_r.w    s1, $ac1, 11    /* tmp1 = (ac1 + 1024) >> 11 */
    addu        s2, s5, s4      /* tmp2 = t10 + t11 */
    subu        s3, s5, s4      /* tmp3 = t10 - t11 */
    sll         s2, s2, 2       /* tmp2 = (t10 + t11) << 2 */
    sll         s3, s3, 2       /* tmp3 = (t10 - t11) << 2 */
    sh          s2, -16(a1)
    sh          s3, -8(a1)
    sh          s0, -12(a1)
    bgtz        s8, 1b
     sh         s1, -4(a1)
    li          t0, 2260        /* c0 */
    li          t1, 11363       /* c1 */
    li          t2, 9633        /* c2 */
    li          t3, 6436        /* c3 */
    li          t4, 6437        /* c4 */
    li          t5, 2261        /* c5 */
    li          t6, 11362       /* c6 */
    li          t7, 2259        /* c7 */
    li          t8, 4433        /* c8 */
    li          t9, 10703       /* c9 */
    li          a1, 10704       /* c10 */
    li          s8, 8

2:
    lh          a2, 0(a0)       /* 0 */
    lh          a3, 16(a0)      /* 8 */
    lh          v0, 32(a0)      /* 16 */
    lh          v1, 48(a0)      /* 24 */
    lh          s4, 64(a0)      /* 32 */
    lh          s5, 80(a0)      /* 40 */
    lh          s6, 96(a0)      /* 48 */
    lh          s7, 112(a0)     /* 56 */
    addu        s2, v0, s5      /* tmp2 = 16 + 40 */
    subu        s5, v0, s5      /* tmp5 = 16 - 40 */
    addu        s3, v1, s4      /* tmp3 = 24 + 32 */
    subu        s4, v1, s4      /* tmp4 = 24 - 32 */
    addu        s0, a2, s7      /* tmp0 =  0 + 56 */
    subu        s7, a2, s7      /* tmp7 =  0 - 56 */
    addu        s1, a3, s6      /* tmp1 =  8 + 48 */
    subu        s6, a3, s6      /* tmp6 =  8 - 48 */
    addu        a2, s0, s3      /* tmp10 = tmp0 + tmp3 */
    subu        v1, s0, s3      /* tmp13 = tmp0 - tmp3 */
    addu        a3, s1, s2      /* tmp11 = tmp1 + tmp2 */
    subu        v0, s1, s2      /* tmp12 = tmp1 - tmp2 */
    mult        s7, t1          /* ac0  = tmp7 * c1 */
    madd        s4, t0          /* ac0 += tmp4 * c0 */
    madd        s5, t4          /* ac0 += tmp5 * c4 */
    madd        s6, t2          /* ac0 += tmp6 * c2 */
    mult        $ac1, s7, t2    /* ac1  = tmp7 * c2 */
    msub        $ac1, s4, t3    /* ac1 -= tmp4 * c3 */
    msub        $ac1, s5, t6    /* ac1 -= tmp5 * c6 */
    msub        $ac1, s6, t7    /* ac1 -= tmp6 * c7 */
    mult        $ac2, s7, t4    /* ac2  = tmp7 * c4 */
    madd        $ac2, s4, t2    /* ac2 += tmp4 * c2 */
    madd        $ac2, s5, t5    /* ac2 += tmp5 * c5 */
    msub        $ac2, s6, t6    /* ac2 -= tmp6 * c6 */
    mult        $ac3, s7, t0    /* ac3  = tmp7 * c0 */
    msub        $ac3, s4, t1    /* ac3 -= tmp4 * c1 */
    madd        $ac3, s5, t2    /* ac3 += tmp5 * c2 */
    msub        $ac3, s6, t3    /* ac3 -= tmp6 * c3 */
    extr_r.w    s0, $ac0, 15    /* tmp0 = (ac0 + 16384) >> 15 */
    extr_r.w    s1, $ac1, 15    /* tmp1 = (ac1 + 16384) >> 15 */
    extr_r.w    s2, $ac2, 15    /* tmp2 = (ac2 + 16384) >> 15 */
    extr_r.w    s3, $ac3, 15    /* tmp3 = (ac3 + 16384) >> 15 */
    addiu       s8, s8, -1
    addu        s4, a2, a3      /* tmp4 = tmp10 + tmp11 */
    subu        s5, a2, a3      /* tmp5 = tmp10 - tmp11 */
    sh          s0, 16(a0)
    sh          s1, 48(a0)
    sh          s2, 80(a0)
    sh          s3, 112(a0)
    mult        v0, t8          /* ac0  = tmp12 * c8 */
    madd        v1, t9          /* ac0 += tmp13 * c9 */
    mult        $ac1, v1, t8    /* ac1  = tmp13 * c8 */
    msub        $ac1, v0, a1    /* ac1 -= tmp12 * c10 */
    addiu       a0, a0, 2
    extr_r.w    s6, $ac0, 15    /* tmp6 = (ac0 + 16384) >> 15 */
    extr_r.w    s7, $ac1, 15    /* tmp7 = (ac1 + 16384) >> 15 */
    shra_r.w    s4, s4, 2       /* tmp4 = (tmp4 + 2) >> 2 */
    shra_r.w    s5, s5, 2       /* tmp5 = (tmp5 + 2) >> 2 */
    sh          s4, -2(a0)
    sh          s5, 62(a0)
    sh          s6, 30(a0)
    bgtz        s8, 2b
     sh         s7, 94(a0)

    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8

    jr          ra
     nop

END(jsimd_fdct_islow_dspr2)
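
/*
 * Notes on the rounding above (a sketch of the jfdctint.c arithmetic
 * this routine follows; macro spellings are ours): the dot-product
 * coefficients are 13-bit fixed point (CONST_BITS = 13, PASS1_BITS = 2),
 * and extr_r.w is a rounding descale of the accumulator. In scalar C:
 *
 *   out = (int)((acc + (1 << 10)) >> 11);  // pass 1: extr_r.w acc, 11
 *                                          //   = DESCALE(acc, CONST_BITS - PASS1_BITS)
 *   out = (int)((acc + (1 << 14)) >> 15);  // pass 2: extr_r.w acc, 15
 *                                          //   = DESCALE(acc, CONST_BITS + PASS1_BITS)
 */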


/*****************************************************************************/
LEAF_DSPR2(jsimd_fdct_ifast_dspr2)
/*
 * a0 = data
 */
    .set at

    SAVE_REGS_ON_STACK 8, s0, s1

    li          a1, 0x014e014e  /* FIX_1_306562965 (334 << 16) |
                                                   (334 & 0xffff) */
    li          a2, 0x008b008b  /* FIX_0_541196100 (139 << 16) |
                                                   (139 & 0xffff) */
    li          a3, 0x00620062  /* FIX_0_382683433 (98 << 16) |
                                                   (98 & 0xffff) */
    li          s1, 0x00b500b5  /* FIX_0_707106781 (181 << 16) |
                                                   (181 & 0xffff) */

    move        v0, a0
    addiu       v1, v0, 128     /* end address */

0:
    lw          t0, 0(v0)       /* tmp0 = 1|0 */
    lw          t1, 4(v0)       /* tmp1 = 3|2 */
    lw          t2, 8(v0)       /* tmp2 = 5|4 */
    lw          t3, 12(v0)      /* tmp3 = 7|6 */
    packrl.ph   t1, t1, t1      /* tmp1 = 2|3 */
    packrl.ph   t3, t3, t3      /* tmp3 = 6|7 */
    subq.ph     t7, t1, t2      /* tmp7 = 2-5|3-4 = t5|t4 */
    subq.ph     t5, t0, t3      /* tmp5 = 1-6|0-7 = t6|t7 */
    addq.ph     t6, t1, t2      /* tmp6 = 2+5|3+4 = t2|t3 */
    addq.ph     t4, t0, t3      /* tmp4 = 1+6|0+7 = t1|t0 */
    addq.ph     t8, t4, t6      /* tmp5 = t1+t2|t0+t3 = t11|t10 */
    subq.ph     t9, t4, t6      /* tmp7 = t1-t2|t0-t3 = t12|t13 */
    sra         t4, t8, 16      /* tmp4 = t11 */
    mult        $0, $0          /* ac0  = 0 */
    dpa.w.ph    $ac0, t9, s1    /* ac0 += t12*181 + t13*181 */
    mult        $ac1, $0, $0    /* ac1  = 0 */
    dpa.w.ph    $ac1, t7, a3    /* ac1 += t4*98 + t5*98 */
    dpsx.w.ph   $ac1, t5, a3    /* ac1 -= t6*98 + t7*98 */
    mult        $ac2, $0, $0    /* ac2  = 0 */
    dpa.w.ph    $ac2, t7, a2    /* ac2 += t4*139 + t5*139 */
    mult        $ac3, $0, $0    /* ac3  = 0 */
    dpa.w.ph    $ac3, t5, a1    /* ac3 += t6*334 + t7*334 */
    precrq.ph.w t0, t5, t7      /* t0 = t5|t6 */
    addq.ph     t2, t8, t4      /* tmp2 = t10 + t11 */
    subq.ph     t3, t8, t4      /* tmp3 = t10 - t11 */
    extr.w      t4, $ac0, 8     /* t4 = z1 = MULTIPLY(tmp12+tmp13, 181) */
    mult        $0, $0          /* ac0  = 0 */
    dpa.w.ph    $ac0, t0, s1    /* ac0 += t5*181 + t6*181 */
    extr.w      t0, $ac1, 8     /* t0 = z5 */
    extr.w      t1, $ac2, 8     /* t1 = MULTIPLY(tmp10, 139) */
    extr.w      t7, $ac3, 8     /* t2 = MULTIPLY(tmp12, 334) */
    extr.w      t8, $ac0, 8     /* t8 = z3 = MULTIPLY(tmp11, 181) */
    add         t6, t1, t0      /* t6 = z2 */
    add         t7, t7, t0      /* t7 = z4 */
    subq.ph     t0, t5, t8      /* t0 = z13 = tmp7 - z3 */
    addq.ph     t8, t5, t8      /* t8 = z11 = tmp7 + z3 */
    addq.ph     t1, t0, t6      /* t1 = z13 + z2 */
    subq.ph     t6, t0, t6      /* t6 = z13 - z2 */
    addq.ph     t0, t8, t7      /* t0 = z11 + z4 */
    subq.ph     t7, t8, t7      /* t7 = z11 - z4 */
    addq.ph     t5, t4, t9      /* t5 = tmp13 + z1 */
    subq.ph     t4, t9, t4      /* t4 = tmp13 - z1 */
    sh          t2, 0(v0)       /* dataptr[0] */
    sh          t5, 4(v0)       /* dataptr[2] */
    sh          t3, 8(v0)       /* dataptr[4] */
    sh          t4, 12(v0)      /* dataptr[6] */
    sh          t1, 10(v0)      /* dataptr[5] */
    sh          t6, 6(v0)       /* dataptr[3] */
    sh          t0, 2(v0)       /* dataptr[1] */
    sh          t7, 14(v0)      /* dataptr[7] */
    addiu       v0, 16
    bne         v1, v0, 0b
     nop
    move        v0, a0
    addiu       v1, v0, 16

1:
    lh          t0, 0(v0)       /* 0 */
    lh          t1, 16(v0)      /* 8 */
    lh          t2, 32(v0)      /* 16 */
    lh          t3, 48(v0)      /* 24 */
    lh          t4, 64(v0)      /* 32 */
    lh          t5, 80(v0)      /* 40 */
    lh          t6, 96(v0)      /* 48 */
    lh          t7, 112(v0)     /* 56 */
    add         t8, t0, t7      /* t8 = tmp0 */
    sub         t7, t0, t7      /* t7 = tmp7 */
    add         t0, t1, t6      /* t0 = tmp1 */
    sub         t1, t1, t6      /* t1 = tmp6 */
    add         t6, t2, t5      /* t6 = tmp2 */
    sub         t5, t2, t5      /* t5 = tmp5 */
    add         t2, t3, t4      /* t2 = tmp3 */
    sub         t3, t3, t4      /* t3 = tmp4 */
    add         t4, t8, t2      /* t4 = tmp10 = tmp0 + tmp3 */
    sub         t8, t8, t2      /* t8 = tmp13 = tmp0 - tmp3 */
    sub         s0, t0, t6      /* s0 = tmp12 = tmp1 - tmp2 */
    ins         t8, s0, 16, 16  /* t8 = tmp12|tmp13 */
    add         t2, t0, t6      /* t2 = tmp11 = tmp1 + tmp2 */
    mult        $0, $0          /* ac0  = 0 */
    dpa.w.ph    $ac0, t8, s1    /* ac0 += t12*181 + t13*181 */
    add         s0, t4, t2      /* s0 = tmp10 + tmp11 */
    sub         t4, t4, t2      /* t4 = tmp10 - tmp11 */
    sh          s0, 0(v0)
    sh          t4, 64(v0)
    extr.w      t2, $ac0, 8     /* z1 = MULTIPLY(tmp12 + tmp13,
                                                 FIX_0_707106781) */
    addq.ph     t4, t8, t2      /* t4 = tmp13 + z1 */
    subq.ph     t8, t8, t2      /* t8 = tmp13 - z1 */
    sh          t4, 32(v0)
    sh          t8, 96(v0)
    add         t3, t3, t5      /* t3 = tmp10 = tmp4 + tmp5 */
    add         t0, t5, t1      /* t0 = tmp11 = tmp5 + tmp6 */
    add         t1, t1, t7      /* t1 = tmp12 = tmp6 + tmp7 */
    andi        t4, a1, 0xffff
    mul         s0, t1, t4
    sra         s0, s0, 8       /* s0 = z4 =
                                     MULTIPLY(tmp12, FIX_1_306562965) */
    ins         t1, t3, 16, 16  /* t1 = tmp10|tmp12 */
    mult        $0, $0          /* ac0  = 0 */
    mulsa.w.ph  $ac0, t1, a3    /* ac0 += t10*98 - t12*98 */
    extr.w      t8, $ac0, 8     /* z5 = MULTIPLY(tmp10 - tmp12,
                                                 FIX_0_382683433) */
    add         t2, t7, t8      /* t2 = tmp7 + z5 */
    sub         t7, t7, t8      /* t7 = tmp7 - z5 */
    andi        t4, a2, 0xffff
    mul         t8, t3, t4
    sra         t8, t8, 8       /* t8 = z2 =
                                     MULTIPLY(tmp10, FIX_0_541196100) */
    andi        t4, s1, 0xffff
    mul         t6, t0, t4
    sra         t6, t6, 8       /* t6 = z3 =
                                     MULTIPLY(tmp11, FIX_0_707106781) */
    add         t0, t6, t8      /* t0 = z3 + z2 */
    sub         t1, t6, t8      /* t1 = z3 - z2 */
    add         t3, t6, s0      /* t3 = z3 + z4 */
    sub         t4, t6, s0      /* t4 = z3 - z4 */
    sub         t5, t2, t1      /* t5 = dataptr[5] */
    sub         t6, t7, t0      /* t6 = dataptr[3] */
    add         t3, t2, t3      /* t3 = dataptr[1] */
    add         t4, t7, t4      /* t4 = dataptr[7] */
    sh          t5, 80(v0)
    sh          t6, 48(v0)
    sh          t3, 16(v0)
    sh          t4, 112(v0)
    addiu       v0, 2
    bne         v0, v1, 1b
     nop

    RESTORE_REGS_FROM_STACK 8, s0, s1

    j           ra
     nop
END(jsimd_fdct_ifast_dspr2)
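
/*
 * This is the AAN forward DCT of jfdctfst.c with 8-bit fixed-point
 * constants: 181 = round(0.707106781 * 256), 98 = round(0.382683433 *
 * 256), 139 = round(0.541196100 * 256) and 334 = round(1.306562965 *
 * 256); every extr.w/sra by 8 is the matching descale. A scalar C
 * sketch of one rotation (illustrative, after jfdctfst.c):
 *
 *   #define MULTIPLY(v, c)  (((v) * (c)) >> 8)   // c = round(x * 256)
 *
 *   z1 = MULTIPLY(tmp12 + tmp13, 181);           // FIX_0_707106781
 *   dataptr[2] = tmp13 + z1;
 *   dataptr[6] = tmp13 - z1;
 */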


/*****************************************************************************/
LEAF_DSPR2(jsimd_quantize_dspr2)
/*
 * a0 = coef_block
 * a1 = divisors
 * a2 = workspace
 */
    .set at

    SAVE_REGS_ON_STACK 16, s0, s1, s2

    addiu       v0, a2, 124     /* v0 = workspace_end */
    lh          t0, 0(a2)
    lh          t1, 0(a1)
    lh          t2, 128(a1)
    sra         t3, t0, 15
    sll         t3, t3, 1
    addiu       t3, t3, 1
    mul         t0, t0, t3
    lh          t4, 384(a1)
    lh          t5, 130(a1)
    lh          t6, 2(a2)
    lh          t7, 2(a1)
    lh          t8, 386(a1)

1:
    andi        t1, 0xffff
    add         t9, t0, t2
    andi        t9, 0xffff
    mul         v1, t9, t1
    sra         s0, t6, 15
    sll         s0, s0, 1
    addiu       s0, s0, 1
    addiu       t9, t4, 16
    srav        v1, v1, t9
    mul         v1, v1, t3
    mul         t6, t6, s0
    andi        t7, 0xffff
    addiu       a2, a2, 4
    addiu       a1, a1, 4
    add         s1, t6, t5
    andi        s1, 0xffff
    sh          v1, 0(a0)

    mul         s2, s1, t7
    addiu       s1, t8, 16
    srav        s2, s2, s1
    mul         s2, s2, s0
    lh          t0, 0(a2)
    lh          t1, 0(a1)
    sra         t3, t0, 15
    sll         t3, t3, 1
    addiu       t3, t3, 1
    mul         t0, t0, t3
    lh          t2, 128(a1)
    lh          t4, 384(a1)
    lh          t5, 130(a1)
    lh          t8, 386(a1)
    lh          t6, 2(a2)
    lh          t7, 2(a1)
    sh          s2, 2(a0)
    lh          t0, 0(a2)
    sra         t3, t0, 15
    sll         t3, t3, 1
    addiu       t3, t3, 1
    mul         t0, t0, t3
    bne         a2, v0, 1b
     addiu      a0, a0, 4

    andi        t1, 0xffff
    add         t9, t0, t2
    andi        t9, 0xffff
    mul         v1, t9, t1
    sra         s0, t6, 15
    sll         s0, s0, 1
    addiu       s0, s0, 1
    addiu       t9, t4, 16
    srav        v1, v1, t9
    mul         v1, v1, t3
    mul         t6, t6, s0
    andi        t7, 0xffff
    sh          v1, 0(a0)
    add         s1, t6, t5
    andi        s1, 0xffff
    mul         s2, s1, t7
    addiu       s1, t8, 16
    addiu       a2, a2, 4
    addiu       a1, a1, 4
    srav        s2, s2, s1
    mul         s2, s2, s0
    sh          s2, 2(a0)

    RESTORE_REGS_FROM_STACK 16, s0, s1, s2

    j           ra
     nop

END(jsimd_quantize_dspr2)
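
/*
 * Reference sketch of the per-coefficient math above (illustrative C;
 * the divisors layout -- reciprocal at byte offset 0, correction at
 * 128, shift at 384 -- is inferred from the lh offsets used above, so
 * treat the field names as assumptions):
 *
 *   sign = (temp >> 15) * 2 + 1;        // -1 or +1 (sra/sll/addiu trio)
 *   temp *= sign;                       // |coef|
 *   product = (unsigned)((temp + corr) & 0xffff) * (recip & 0xffff);
 *   product >>= shift + 16;             // srav by (shift + 16)
 *   output = (short)(product * sign);   // restore the sign
 */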


#ifndef __mips_soft_float

/*****************************************************************************/
LEAF_DSPR2(jsimd_quantize_float_dspr2)
/*
 * a0 = coef_block
 * a1 = divisors
 * a2 = workspace
 */
    .set at

    li          t1, 0x46800100  /* 16384.5f, as an IEEE 754 bit pattern */
    mtc1        t1, f0
    li          t0, 63          /* 64 coefficients, 8 per iteration */
0:
    lwc1        f2, 0(a2)
    lwc1        f10, 0(a1)
    lwc1        f4, 4(a2)
    lwc1        f12, 4(a1)
    lwc1        f6, 8(a2)
    lwc1        f14, 8(a1)
    lwc1        f8, 12(a2)
    lwc1        f16, 12(a1)
    madd.s      f2, f0, f2, f10
    madd.s      f4, f0, f4, f12
    madd.s      f6, f0, f6, f14
    madd.s      f8, f0, f8, f16
    lwc1        f10, 16(a1)
    lwc1        f12, 20(a1)
    trunc.w.s   f2, f2
    trunc.w.s   f4, f4
    trunc.w.s   f6, f6
    trunc.w.s   f8, f8
    lwc1        f14, 24(a1)
    lwc1        f16, 28(a1)
    mfc1        t1, f2
    mfc1        t2, f4
    mfc1        t3, f6
    mfc1        t4, f8
    lwc1        f2, 16(a2)
    lwc1        f4, 20(a2)
    lwc1        f6, 24(a2)
    lwc1        f8, 28(a2)
    madd.s      f2, f0, f2, f10
    madd.s      f4, f0, f4, f12
    madd.s      f6, f0, f6, f14
    madd.s      f8, f0, f8, f16
    addiu       t1, t1, -16384
    addiu       t2, t2, -16384
    addiu       t3, t3, -16384
    addiu       t4, t4, -16384
    trunc.w.s   f2, f2
    trunc.w.s   f4, f4
    trunc.w.s   f6, f6
    trunc.w.s   f8, f8
    sh          t1, 0(a0)
    sh          t2, 2(a0)
    sh          t3, 4(a0)
    sh          t4, 6(a0)
    mfc1        t1, f2
    mfc1        t2, f4
    mfc1        t3, f6
    mfc1        t4, f8
    addiu       t0, t0, -8
    addiu       a2, a2, 32
    addiu       a1, a1, 32
    addiu       t1, t1, -16384
    addiu       t2, t2, -16384
    addiu       t3, t3, -16384
    addiu       t4, t4, -16384
    sh          t1, 8(a0)
    sh          t2, 10(a0)
    sh          t3, 12(a0)
    sh          t4, 14(a0)
    bgez        t0, 0b
     addiu      a0, a0, 16

    j           ra
     nop

END(jsimd_quantize_float_dspr2)

#endif
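
/*
 * The float quantizer rounds via a bias trick rather than a rounding
 * conversion: madd.s computes workspace[i] * divisors[i] + 16384.5f in
 * one step (f0 holds 16384.5f), trunc.w.s chops toward zero, and the
 * integer addiu ..., -16384 removes the bias, giving round-to-nearest
 * over the JPEG coefficient range. In C (sketch):
 *
 *   result = (int)(workspace[i] * divisors[i] + 16384.5f) - 16384;
 */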


/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_2x2_dspr2)
/*
 * a0 = compptr->dct_table
 * a1 = coef_block
 * a2 = output_buf
 * a3 = output_col
 */
    .set at

    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5

    addiu       sp, sp, -40
    move        v0, sp
    addiu       s2, zero, 29692  /* FIX_3_624509785 */
    addiu       s3, zero, -10426 /* -FIX_1_272758580 */
    addiu       s4, zero, 6967   /* FIX_0_850430095 */
    addiu       s5, zero, -5906  /* -FIX_0_720959822 */
    lh          t0, 0(a1)       /* t0 = inptr[DCTSIZE*0] */
    lh          t5, 0(a0)       /* t5 = quantptr[DCTSIZE*0] */
    lh          t1, 48(a1)      /* t1 = inptr[DCTSIZE*3] */
    lh          t6, 48(a0)      /* t6 = quantptr[DCTSIZE*3] */
    mul         t4, t5, t0
    lh          t0, 16(a1)      /* t0 = inptr[DCTSIZE*1] */
    lh          t5, 16(a0)      /* t5 = quantptr[DCTSIZE*1] */
    mul         t6, t6, t1
    mul         t5, t5, t0
    lh          t2, 80(a1)      /* t2 = inptr[DCTSIZE*5] */
    lh          t7, 80(a0)      /* t7 = quantptr[DCTSIZE*5] */
    lh          t3, 112(a1)     /* t3 = inptr[DCTSIZE*7] */
    lh          t8, 112(a0)     /* t8 = quantptr[DCTSIZE*7] */
    mul         t7, t7, t2
    mult        zero, zero      /* ac0 = 0 */
    mul         t8, t8, t3
    li          s0, 0x73FCD746  /* s0 = (29692 << 16) | (-10426 & 0xffff) */
    li          s1, 0x1B37E8EE  /* s1 = (6967 << 16) | (-5906 & 0xffff) */
    ins         t6, t5, 16, 16  /* t6 = t5|t6 */
    sll         t4, t4, 15
    dpa.w.ph    $ac0, t6, s0
    lh          t1, 2(a1)
    lh          t6, 2(a0)
    ins         t8, t7, 16, 16  /* t8 = t7|t8 */
    dpa.w.ph    $ac0, t8, s1
    mflo        t0, $ac0
    mul         t5, t6, t1
    lh          t1, 18(a1)
    lh          t6, 18(a0)
    lh          t2, 50(a1)
    lh          t7, 50(a0)
    mul         t6, t6, t1
    subu        t8, t4, t0
    mul         t7, t7, t2
    addu        t0, t4, t0
    shra_r.w    t0, t0, 13
    lh          t1, 82(a1)
    lh          t2, 82(a0)
    lh          t3, 114(a1)
    lh          t4, 114(a0)
    shra_r.w    t8, t8, 13
    mul         t1, t1, t2
    mul         t3, t3, t4
    sw          t0, 0(v0)
    sw          t8, 20(v0)
    sll         t4, t5, 15
    ins         t7, t6, 16, 16
    mult        zero, zero      /* ac0 = 0 */
    dpa.w.ph    $ac0, t7, s0
    ins         t3, t1, 16, 16
    lh          t1, 6(a1)
    lh          t6, 6(a0)
    dpa.w.ph    $ac0, t3, s1
    mflo        t0, $ac0
    mul         t5, t6, t1
    lh          t1, 22(a1)
    lh          t6, 22(a0)
    lh          t2, 54(a1)
    lh          t7, 54(a0)
    mul         t6, t6, t1
    subu        t8, t4, t0
    mul         t7, t7, t2
    addu        t0, t4, t0
    shra_r.w    t0, t0, 13
    lh          t1, 86(a1)
    lh          t2, 86(a0)
    lh          t3, 118(a1)
    lh          t4, 118(a0)
    shra_r.w    t8, t8, 13
    mul         t1, t1, t2
    mul         t3, t3, t4
    sw          t0, 4(v0)
    sw          t8, 24(v0)
    sll         t4, t5, 15
    ins         t7, t6, 16, 16
    mult        zero, zero      /* ac0 = 0 */
    dpa.w.ph    $ac0, t7, s0
    ins         t3, t1, 16, 16
    lh          t1, 10(a1)
    lh          t6, 10(a0)
    dpa.w.ph    $ac0, t3, s1
    mflo        t0, $ac0
    mul         t5, t6, t1
    lh          t1, 26(a1)
    lh          t6, 26(a0)
    lh          t2, 58(a1)
    lh          t7, 58(a0)
    mul         t6, t6, t1
    subu        t8, t4, t0
    mul         t7, t7, t2
    addu        t0, t4, t0
    shra_r.w    t0, t0, 13
    lh          t1, 90(a1)
    lh          t2, 90(a0)
    lh          t3, 122(a1)
    lh          t4, 122(a0)
    shra_r.w    t8, t8, 13
    mul         t1, t1, t2
    mul         t3, t3, t4
    sw          t0, 8(v0)
    sw          t8, 28(v0)
    sll         t4, t5, 15
    ins         t7, t6, 16, 16
    mult        zero, zero      /* ac0 = 0 */
    dpa.w.ph    $ac0, t7, s0
    ins         t3, t1, 16, 16
    lh          t1, 14(a1)
    lh          t6, 14(a0)
    dpa.w.ph    $ac0, t3, s1
    mflo        t0, $ac0
    mul         t5, t6, t1
    lh          t1, 30(a1)
    lh          t6, 30(a0)
    lh          t2, 62(a1)
    lh          t7, 62(a0)
    mul         t6, t6, t1
    subu        t8, t4, t0
    mul         t7, t7, t2
    addu        t0, t4, t0
    shra_r.w    t0, t0, 13
    lh          t1, 94(a1)
    lh          t2, 94(a0)
    lh          t3, 126(a1)
    lh          t4, 126(a0)
    shra_r.w    t8, t8, 13
    mul         t1, t1, t2
    mul         t3, t3, t4
    sw          t0, 12(v0)
    sw          t8, 32(v0)
    sll         t4, t5, 15
    ins         t7, t6, 16, 16
    mult        zero, zero      /* ac0 = 0 */
    dpa.w.ph    $ac0, t7, s0
    ins         t3, t1, 16, 16
    dpa.w.ph    $ac0, t3, s1
    mflo        t0, $ac0
    lw          t9, 0(a2)
    lw          t3, 0(v0)
    lw          t7, 4(v0)
    lw          t1, 8(v0)
    addu        t9, t9, a3
    sll         t3, t3, 15
    subu        t8, t4, t0
    addu        t0, t4, t0
    shra_r.w    t0, t0, 13
    shra_r.w    t8, t8, 13
    sw          t0, 16(v0)
    sw          t8, 36(v0)
    lw          t5, 12(v0)
    lw          t6, 16(v0)
    mult        t7, s2
    madd        t1, s3
    madd        t5, s4
    madd        t6, s5
    lw          t5, 24(v0)
    lw          t7, 28(v0)
    mflo        t0, $ac0
    lw          t8, 32(v0)
    lw          t2, 36(v0)
    mult        $ac1, t5, s2
    madd        $ac1, t7, s3
    madd        $ac1, t8, s4
    madd        $ac1, t2, s5
    addu        t1, t3, t0
    subu        t6, t3, t0
    shra_r.w    t1, t1, 20
    shra_r.w    t6, t6, 20
    mflo        t4, $ac1
    shll_s.w    t1, t1, 24
    shll_s.w    t6, t6, 24
    sra         t1, t1, 24
    sra         t6, t6, 24
    addiu       t1, t1, 128
    addiu       t6, t6, 128
    lw          t0, 20(v0)
    sb          t1, 0(t9)
    sb          t6, 1(t9)
    sll         t0, t0, 15
    lw          t9, 4(a2)
    addu        t1, t0, t4
    subu        t6, t0, t4
    addu        t9, t9, a3
    shra_r.w    t1, t1, 20
    shra_r.w    t6, t6, 20
    shll_s.w    t1, t1, 24
    shll_s.w    t6, t6, 24
    sra         t1, t1, 24
    sra         t6, t6, 24
    addiu       t1, t1, 128
    addiu       t6, t6, 128
    sb          t1, 0(t9)
    sb          t6, 1(t9)
    addiu       sp, sp, 40

    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5

    j           ra
     nop

END(jsimd_idct_2x2_dspr2)
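
/*
 * The 2x2 reduced IDCT uses only the DCT coefficients in rows/columns
 * 0, 1, 3, 5 and 7 of the 8x8 block, as in jidctred.c. The immediates
 * loaded into s2..s5 (and packed into s0/s1 for dpa.w.ph) are 13-bit
 * fixed-point constants; a sketch of the correspondence:
 *
 *   #define CONST_BITS 13
 *   #define FIX(x)  ((int)((x) * (1 << CONST_BITS) + 0.5))
 *   // FIX(3.624509785) == 29692    FIX(1.272758580) == 10426
 *   // FIX(0.850430095) == 6967     FIX(0.720959822) == 5906
 *
 * The shll_s.w ..., 24 / sra ..., 24 pairs at the end saturate each
 * 32-bit result to the signed 8-bit range before the +128 level shift
 * (addiu ..., 128).
 */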
3138
3139
3140/*****************************************************************************/
3141LEAF_DSPR2(jsimd_idct_4x4_dspr2)
3142/*
3143 * a0     = compptr->dct_table
3144 * a1     = coef_block
3145 * a2     = output_buf
3146 * a3     = output_col
3147 * 16(sp) = workspace[DCTSIZE*4]  (buffers data between passes)
3148 */
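/*
 * Viewed from C, the interface above corresponds roughly to the following
 * prototype (a sketch; the type names are the usual libjpeg ones and are
 * assumptions, not taken from this file):
 *
 *   void jsimd_idct_4x4_dspr2(void *dct_table, short *coef_block,
 *                             unsigned char **output_buf,
 *                             unsigned int output_col, int *workspace);
 *
 * The routine emits a 4x4 block of range-limited samples into
 * output_buf[0..3] + output_col, using workspace to buffer the pass-1
 * results.
 */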
3149    .set at
3150
3151    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3152
3153    lw          v1, 48(sp)
3154    move        t0, a1
3155    move        t1, v1
3156    li          t9, 4
3157    li          s0, 0x2e75f93e
3158    li          s1, 0x21f9ba79
3159    li          s2, 0xecc2efb0
3160    li          s3, 0x52031ccd
3161
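/*
 * A note on the constants above: each packs two Q13 fixed-point multipliers
 * into halfword lanes for dpa.w.ph, which computes
 * ac += (rs[31:16] * rt[31:16]) + (rs[15:0] * rt[15:0]).  Numerically they
 * decompose as
 *
 *   s0 = { 11893, -1730 }   s1 = {  8697, -17799 }
 *   s2 = { -4926, -4176 }   s3 = { 20995,   7373 }
 *
 * which match the jidctred.c odd-part multipliers FIX_1_451774981,
 * -FIX_0_211164243, FIX_1_061594337, -FIX_2_172734803, -FIX_0_601344887,
 * -FIX_0_509795579, FIX_2_562915447, and FIX_0_899976223 (this mapping is
 * inferred from the values).  One dpa.w.ph pair per accumulator thus
 * evaluates a four-term odd-part dot product in two instructions.
 */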
31620:
3163    lh          s6, 32(t0)      /* inptr[DCTSIZE*2] */
3164    lh          t6, 32(a0)      /* quantptr[DCTSIZE*2] */
3165    lh          s7, 96(t0)      /* inptr[DCTSIZE*6] */
3166    lh          t7, 96(a0)      /* quantptr[DCTSIZE*6] */
3167    mul         t6, s6, t6      /* z2 = (inptr[DCTSIZE*2] *
3168                                         quantptr[DCTSIZE*2]) */
3169    lh          s4, 0(t0)       /* inptr[DCTSIZE*0] */
3170    mul         t7, s7, t7      /* z3 = (inptr[DCTSIZE*6] *
3171                                         quantptr[DCTSIZE*6]) */
3172    lh          s5, 0(a0)       /* quantptr[0] */
3173    li          s6, 15137
3174    li          s7, 6270
3175    mul         t2, s4, s5      /* tmp0 = (inptr[0] * quantptr[0]) */
3176    mul         t6, s6, t6      /* z2 = (inptr[DCTSIZE*2] *
3177                                         quantptr[DCTSIZE*2]) */
3178    lh          t5, 112(t0)     /* inptr[DCTSIZE*7] */
3179    mul         t7, s7, t7      /* z3 = (inptr[DCTSIZE*6] *
3180                                         quantptr[DCTSIZE*6]) */
3181    lh          s4, 112(a0)     /* quantptr[DCTSIZE*7] */
3182    lh          v0, 80(t0)      /* inptr[DCTSIZE*5] */
3183    lh          s5, 80(a0)      /* quantptr[DCTSIZE*5] */
3184    lh          s6, 48(a0)      /* quantptr[DCTSIZE*3] */
3185    sll         t2, t2, 14      /* tmp0 <<= (CONST_BITS+1) */
3186    lh          s7, 16(a0)      /* quantptr[DCTSIZE*1] */
3187    lh          t8, 16(t0)      /* inptr[DCTSIZE*1] */
3188    subu        t6, t6, t7      /* tmp2 = MULTIPLY(z2, FIX_1_847759065)
3189                                     - MULTIPLY(z3, FIX_0_765366865) */
3190    lh          t7, 48(t0)      /* inptr[DCTSIZE*3] */
3191    mul         t5, s4, t5      /* z1 = (inptr[DCTSIZE*7] *
3192                                         quantptr[DCTSIZE*7]) */
3193    mul         v0, s5, v0      /* z2 = (inptr[DCTSIZE*5] *
3194                                         quantptr[DCTSIZE*5]) */
3195    mul         t7, s6, t7      /* z3 = (inptr[DCTSIZE*3] *
3196                                         quantptr[DCTSIZE*3]) */
3197    mul         t8, s7, t8      /* z4 = (inptr[DCTSIZE*1] *
3198                                         quantptr[DCTSIZE*1]) */
3199    addu        t3, t2, t6      /* tmp10 = tmp0 + z2 */
3200    subu        t4, t2, t6      /* tmp12 = tmp0 - z2 */
3201    mult        $ac0, zero, zero
3202    mult        $ac1, zero, zero
3203    ins         t5, v0, 16, 16
3204    ins         t7, t8, 16, 16
3205    addiu       t9, t9, -1
3206    dpa.w.ph    $ac0, t5, s0
3207    dpa.w.ph    $ac0, t7, s1
3208    dpa.w.ph    $ac1, t5, s2
3209    dpa.w.ph    $ac1, t7, s3
3210    mflo        s4, $ac0
3211    mflo        s5, $ac1
3212    addiu       a0, a0, 2
3213    addiu       t1, t1, 4
3214    addiu       t0, t0, 2
3215    addu        t6, t4, s4
3216    subu        t5, t4, s4
3217    addu        s6, t3, s5
3218    subu        s7, t3, s5
3219    shra_r.w    t6, t6, 12      /* DESCALE(tmp12 + temp1, 12) */
3220    shra_r.w    t5, t5, 12      /* DESCALE(tmp12 - temp1, 12) */
3221    shra_r.w    s6, s6, 12      /* DESCALE(tmp10 + temp2, 12) */
3222    shra_r.w    s7, s7, 12      /* DESCALE(tmp10 - temp2, 12) */
3223    sw          t6, 28(t1)
3224    sw          t5, 60(t1)
3225    sw          s6, -4(t1)
3226    bgtz        t9, 0b
3227     sw         s7, 92(t1)
3228    /* second loop: three passes (columns 5-7; column 4 is skipped) */
3229    li          t9, 3
32301:
3231    lh          s6, 34(t0)      /* inptr[DCTSIZE*2] */
3232    lh          t6, 34(a0)      /* quantptr[DCTSIZE*2] */
3233    lh          s7, 98(t0)      /* inptr[DCTSIZE*6] */
3234    lh          t7, 98(a0)      /* quantptr[DCTSIZE*6] */
3235    mul         t6, s6, t6      /* z2 = (inptr[DCTSIZE*2] *
3236                                         quantptr[DCTSIZE*2]) */
3237    lh          s4, 2(t0)       /* inptr[DCTSIZE*0] */
3238    mul         t7, s7, t7      /* z3 = (inptr[DCTSIZE*6] *
3239                                         quantptr[DCTSIZE*6]) */
3240    lh          s5, 2(a0)       /* quantptr[DCTSIZE*0] */
3241    li          s6, 15137
3242    li          s7, 6270
3243    mul         t2, s4, s5      /* tmp0 = (inptr[0] * quantptr[0]) */
3244    mul         v0, s6, t6      /* z2 = (inptr[DCTSIZE*2] *
3245                                         quantptr[DCTSIZE*2]) */
3246    lh          t5, 114(t0)     /* inptr[DCTSIZE*7] */
3247    mul         t7, s7, t7      /* z3 = (inptr[DCTSIZE*6] *
3248                                         quantptr[DCTSIZE*6]) */
3249    lh          s4, 114(a0)     /* quantptr[DCTSIZE*7] */
3250    lh          s5, 82(a0)      /* quantptr[DCTSIZE*5] */
3251    lh          t6, 82(t0)      /* inptr[DCTSIZE*5] */
3252    sll         t2, t2, 14      /* tmp0 <<= (CONST_BITS+1) */
3253    lh          s6, 50(a0)      /* quantptr[DCTSIZE*3] */
3254    lh          t8, 18(t0)      /* inptr[DCTSIZE*1] */
3255    subu        v0, v0, t7      /* tmp2 = MULTIPLY(z2, FIX_1_847759065)
3256                                     - MULTIPLY(z3, FIX_0_765366865) */
3257    lh          t7, 50(t0)      /* inptr[DCTSIZE*3] */
3258    lh          s7, 18(a0)      /* quantptr[DCTSIZE*1] */
3259    mul         t5, s4, t5      /* z1 = (inptr[DCTSIZE*7] *
3260                                         quantptr[DCTSIZE*7]) */
3261    mul         t6, s5, t6      /* z2 = (inptr[DCTSIZE*5] *
3262                                         quantptr[DCTSIZE*5]) */
3263    mul         t7, s6, t7      /* z3 = (inptr[DCTSIZE*3] *
3264                                         quantptr[DCTSIZE*3]) */
3265    mul         t8, s7, t8      /* z4 = (inptr[DCTSIZE*1] *
3266                                         quantptr[DCTSIZE*1]) */
3267    addu        t3, t2, v0      /* tmp10 = tmp0 + z2 */
3268    subu        t4, t2, v0      /* tmp12 = tmp0 - z2 */
3269    mult        $ac0, zero, zero
3270    mult        $ac1, zero, zero
3271    ins         t5, t6, 16, 16
3272    ins         t7, t8, 16, 16
3273    dpa.w.ph    $ac0, t5, s0
3274    dpa.w.ph    $ac0, t7, s1
3275    dpa.w.ph    $ac1, t5, s2
3276    dpa.w.ph    $ac1, t7, s3
3277    mflo        t5, $ac0
3278    mflo        t6, $ac1
3279    addiu       t9, t9, -1
3280    addiu       t0, t0, 2
3281    addiu       a0, a0, 2
3282    addiu       t1, t1, 4
3283    addu        s5, t4, t5
3284    subu        s4, t4, t5
3285    addu        s6, t3, t6
3286    subu        s7, t3, t6
3287    shra_r.w    s5, s5, 12      /* DESCALE(tmp12 + temp1, 12) */
3288    shra_r.w    s4, s4, 12      /* DESCALE(tmp12 - temp1, 12) */
3289    shra_r.w    s6, s6, 12      /* DESCALE(tmp10 + temp2, 12) */
3290    shra_r.w    s7, s7, 12      /* DESCALE(tmp10 - temp2, 12) */
3291    sw          s5, 32(t1)
3292    sw          s4, 64(t1)
3293    sw          s6, 0(t1)
3294    bgtz        t9, 1b
3295     sw         s7, 96(t1)
3296    move        t1, v1
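    /* output row 1 */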
3297    li          s4, 15137
3298    lw          s6, 8(t1)       /* wsptr[2] */
3299    li          s5, 6270
3300    lw          s7, 24(t1)      /* wsptr[6] */
3301    mul         s4, s4, s6      /* MULTIPLY((JLONG)wsptr[2],
3302                                            FIX_1_847759065) */
3303    lw          t2, 0(t1)       /* wsptr[0] */
3304    mul         s5, s5, s7      /* MULTIPLY((JLONG)wsptr[6],
3305                                            -FIX_0_765366865) */
3306    lh          t5, 28(t1)      /* wsptr[7] */
3307    lh          t6, 20(t1)      /* wsptr[5] */
3308    lh          t7, 12(t1)      /* wsptr[3] */
3309    lh          t8, 4(t1)       /* wsptr[1] */
3310    ins         t5, t6, 16, 16
3311    ins         t7, t8, 16, 16
3312    mult        $ac0, zero, zero
3313    dpa.w.ph    $ac0, t5, s0
3314    dpa.w.ph    $ac0, t7, s1
3315    mult        $ac1, zero, zero
3316    dpa.w.ph    $ac1, t5, s2
3317    dpa.w.ph    $ac1, t7, s3
3318    sll         t2, t2, 14      /* tmp0 =
3319                                     ((JLONG)wsptr[0]) << (CONST_BITS+1) */
3320    mflo        s6, $ac0
3321    /* MULTIPLY(wsptr[2], FIX_1_847759065) +
3322       MULTIPLY(wsptr[6], -FIX_0_765366865) */
3323    subu        s4, s4, s5
3324    addu        t3, t2, s4      /* tmp10 = tmp0 + z2 */
3325    mflo        s7, $ac1
3326    subu        t4, t2, s4      /* tmp12 = tmp0 - z2 */
3327    addu        t7, t4, s6
3328    subu        t8, t4, s6
3329    addu        t5, t3, s7
3330    subu        t6, t3, s7
3331    shra_r.w    t5, t5, 19      /* DESCALE(tmp10 + temp2, 19) */
3332    shra_r.w    t6, t6, 19      /* DESCALE(tmp10 - temp2, 19) */
3333    shra_r.w    t7, t7, 19      /* DESCALE(tmp12 + temp1, 19) */
3334    shra_r.w    t8, t8, 19      /* DESCALE(tmp12 - temp1, 19) */
3335    sll         s4, t9, 2
3336    lw          v0, 0(a2)       /* output_buf[ctr] */
3337    shll_s.w    t5, t5, 24
3338    shll_s.w    t6, t6, 24
3339    shll_s.w    t7, t7, 24
3340    shll_s.w    t8, t8, 24
3341    sra         t5, t5, 24
3342    sra         t6, t6, 24
3343    sra         t7, t7, 24
3344    sra         t8, t8, 24
3345    addu        v0, v0, a3      /* outptr = output_buf[ctr] + output_col */
3346    addiu       t5, t5, 128
3347    addiu       t6, t6, 128
3348    addiu       t7, t7, 128
3349    addiu       t8, t8, 128
3350    sb          t5, 0(v0)
3351    sb          t7, 1(v0)
3352    sb          t8, 2(v0)
3353    sb          t6, 3(v0)
3354    /* output row 2 */
3355    li          s4, 15137
3356    lw          s6, 40(t1)      /* wsptr[2] */
3357    li          s5, 6270
3358    lw          s7, 56(t1)      /* wsptr[6] */
3359    mul         s4, s4, s6      /* MULTIPLY((JLONG)wsptr[2],
3360                                            FIX_1_847759065) */
3361    lw          t2, 32(t1)      /* wsptr[0] */
3362    mul         s5, s5, s7      /* MULTIPLY((JLONG)wsptr[6],
3363                                            -FIX_0_765366865) */
3364    lh          t5, 60(t1)      /* wsptr[7] */
3365    lh          t6, 52(t1)      /* wsptr[5] */
3366    lh          t7, 44(t1)      /* wsptr[3] */
3367    lh          t8, 36(t1)      /* wsptr[1] */
3368    ins         t5, t6, 16, 16
3369    ins         t7, t8, 16, 16
3370    mult        $ac0, zero, zero
3371    dpa.w.ph    $ac0, t5, s0
3372    dpa.w.ph    $ac0, t7, s1
3373    mult        $ac1, zero, zero
3374    dpa.w.ph    $ac1, t5, s2
3375    dpa.w.ph    $ac1, t7, s3
3376    sll         t2, t2, 14      /* tmp0 =
3377                                     ((JLONG)wsptr[0]) << (CONST_BITS+1) */
3378    mflo        s6, $ac0
3379    /* MULTIPLY(wsptr[2], FIX_1_847759065) +
3380       MULTIPLY(wsptr[6], -FIX_0_765366865) */
3381    subu        s4, s4, s5
3382    addu        t3, t2, s4      /* tmp10 = tmp0 + z2 */
3383    mflo        s7, $ac1
3384    subu        t4, t2, s4      /* tmp12 = tmp0 - z2 */
3385    addu        t7, t4, s6
3386    subu        t8, t4, s6
3387    addu        t5, t3, s7
3388    subu        t6, t3, s7
3389    shra_r.w    t5, t5, 19      /* DESCALE(tmp10 + temp2,
3390                                           CONST_BITS-PASS1_BITS+1) */
3391    shra_r.w    t6, t6, 19      /* DESCALE(tmp10 - temp2,
3392                                           CONST_BITS-PASS1_BITS+1) */
3393    shra_r.w    t7, t7, 19      /* DESCALE(tmp12 + temp1,
3394                                           CONST_BITS-PASS1_BITS+1) */
3395    shra_r.w    t8, t8, 19      /* DESCALE(tmp12 - temp1,
3396                                           CONST_BITS-PASS1_BITS+1) */
3397    sll         s4, t9, 2
3398    lw          v0, 4(a2)       /* output_buf[ctr] */
3399    shll_s.w    t5, t5, 24
3400    shll_s.w    t6, t6, 24
3401    shll_s.w    t7, t7, 24
3402    shll_s.w    t8, t8, 24
3403    sra         t5, t5, 24
3404    sra         t6, t6, 24
3405    sra         t7, t7, 24
3406    sra         t8, t8, 24
3407    addu        v0, v0, a3      /* outptr = output_buf[ctr] + output_col */
3408    addiu       t5, t5, 128
3409    addiu       t6, t6, 128
3410    addiu       t7, t7, 128
3411    addiu       t8, t8, 128
3412    sb          t5, 0(v0)
3413    sb          t7, 1(v0)
3414    sb          t8, 2(v0)
3415    sb          t6, 3(v0)
3416    /* output row 3 */
3417    li          s4, 15137
3418    lw          s6, 72(t1)      /* wsptr[2] */
3419    li          s5, 6270
3420    lw          s7, 88(t1)      /* wsptr[6] */
3421    mul         s4, s4, s6      /* MULTIPLY((JLONG)wsptr[2],
3422                                            FIX_1_847759065) */
3423    lw          t2, 64(t1)      /* wsptr[0] */
3424    mul         s5, s5, s7      /* MULTIPLY((JLONG)wsptr[6],
3425                                            -FIX_0_765366865) */
3426    lh          t5, 92(t1)      /* wsptr[7] */
3427    lh          t6, 84(t1)      /* wsptr[5] */
3428    lh          t7, 76(t1)      /* wsptr[3] */
3429    lh          t8, 68(t1)      /* wsptr[1] */
3430    ins         t5, t6, 16, 16
3431    ins         t7, t8, 16, 16
3432    mult        $ac0, zero, zero
3433    dpa.w.ph    $ac0, t5, s0
3434    dpa.w.ph    $ac0, t7, s1
3435    mult        $ac1, zero, zero
3436    dpa.w.ph    $ac1, t5, s2
3437    dpa.w.ph    $ac1, t7, s3
3438    sll         t2, t2, 14      /* tmp0 =
3439                                     ((JLONG)wsptr[0]) << (CONST_BITS+1) */
3440    mflo        s6, $ac0
3441    /* MULTIPLY(wsptr[2], FIX_1_847759065) +
3442       MULTIPLY(wsptr[6], -FIX_0_765366865) */
3443    subu        s4, s4, s5
3444    addu        t3, t2, s4      /* tmp10 = tmp0 + z2 */
3445    mflo        s7, $ac1
3446    subu        t4, t2, s4      /* tmp12 = tmp0 - z2 */
3447    addu        t7, t4, s6
3448    subu        t8, t4, s6
3449    addu        t5, t3, s7
3450    subu        t6, t3, s7
3451    shra_r.w    t5, t5, 19      /* DESCALE(tmp10 + temp2, 19) */
3452    shra_r.w    t6, t6, 19      /* DESCALE(tmp10 - temp2, 19) */
3453    shra_r.w    t7, t7, 19      /* DESCALE(tmp12 + temp1, 19) */
3454    shra_r.w    t8, t8, 19      /* DESCALE(tmp12 - temp1, 19) */
3455    sll         s4, t9, 2
3456    lw          v0, 8(a2)       /* output_buf[ctr] */
3457    shll_s.w    t5, t5, 24
3458    shll_s.w    t6, t6, 24
3459    shll_s.w    t7, t7, 24
3460    shll_s.w    t8, t8, 24
3461    sra         t5, t5, 24
3462    sra         t6, t6, 24
3463    sra         t7, t7, 24
3464    sra         t8, t8, 24
3465    addu        v0, v0, a3      /* outptr = output_buf[ctr] + output_col */
3466    addiu       t5, t5, 128
3467    addiu       t6, t6, 128
3468    addiu       t7, t7, 128
3469    addiu       t8, t8, 128
3470    sb          t5, 0(v0)
3471    sb          t7, 1(v0)
3472    sb          t8, 2(v0)
3473    sb          t6, 3(v0)
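    /* output row 4 */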
3474    li          s4, 15137
3475    lw          s6, 104(t1)     /* wsptr[2] */
3476    li          s5, 6270
3477    lw          s7, 120(t1)     /* wsptr[6] */
3478    mul         s4, s4, s6      /* MULTIPLY((JLONG)wsptr[2],
3479                                            FIX_1_847759065) */
3480    lw          t2, 96(t1)      /* wsptr[0] */
3481    mul         s5, s5, s7      /* MULTIPLY((JLONG)wsptr[6],
3482                                            -FIX_0_765366865) */
3483    lh          t5, 124(t1)     /* wsptr[7] */
3484    lh          t6, 116(t1)     /* wsptr[5] */
3485    lh          t7, 108(t1)     /* wsptr[3] */
3486    lh          t8, 100(t1)     /* wsptr[1] */
3487    ins         t5, t6, 16, 16
3488    ins         t7, t8, 16, 16
3489    mult        $ac0, zero, zero
3490    dpa.w.ph    $ac0, t5, s0
3491    dpa.w.ph    $ac0, t7, s1
3492    mult        $ac1, zero, zero
3493    dpa.w.ph    $ac1, t5, s2
3494    dpa.w.ph    $ac1, t7, s3
3495    sll         t2, t2, 14      /* tmp0 =
3496                                     ((JLONG)wsptr[0]) << (CONST_BITS+1) */
3497    mflo        s6, $ac0
3498    /* MULTIPLY(wsptr[2], FIX_1_847759065) +
3499       MULTIPLY(wsptr[6], -FIX_0_765366865) */
3500    subu        s4, s4, s5
3501    addu        t3, t2, s4      /* tmp10 = tmp0 + z2 */
3502    mflo        s7, $ac1
3503    subu        t4, t2, s4      /* tmp12 = tmp0 - z2 */
3504    addu        t7, t4, s6
3505    subu        t8, t4, s6
3506    addu        t5, t3, s7
3507    subu        t6, t3, s7
3508    shra_r.w    t5, t5, 19      /* DESCALE(tmp10 + temp2, 19) */
3509    shra_r.w    t6, t6, 19      /* DESCALE(tmp10 - temp2, 19) */
3510    shra_r.w    t7, t7, 19      /* DESCALE(tmp12 + temp1, 19) */
3511    shra_r.w    t8, t8, 19      /* DESCALE(tmp12 - temp1, 19) */
3512    sll         s4, t9, 2
3513    lw          v0, 12(a2)      /* output_buf[ctr] */
3514    shll_s.w    t5, t5, 24
3515    shll_s.w    t6, t6, 24
3516    shll_s.w    t7, t7, 24
3517    shll_s.w    t8, t8, 24
3518    sra         t5, t5, 24
3519    sra         t6, t6, 24
3520    sra         t7, t7, 24
3521    sra         t8, t8, 24
3522    addu        v0, v0, a3      /* outptr = output_buf[ctr] + output_col */
3523    addiu       t5, t5, 128
3524    addiu       t6, t6, 128
3525    addiu       t7, t7, 128
3526    addiu       t8, t8, 128
3527    sb          t5, 0(v0)
3528    sb          t7, 1(v0)
3529    sb          t8, 2(v0)
3530    sb          t6, 3(v0)
3531
3532    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3533
3534    j           ra
3535     nop
3536END(jsimd_idct_4x4_dspr2)
3537
3538
3539/*****************************************************************************/
3540LEAF_DSPR2(jsimd_idct_6x6_dspr2)
3541/*
3542 * a0 = compptr->dct_table
3543 * a1 = coef_block
3544 * a2 = output_buf
3545 * a3 = output_col
3546 */
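/*
 * Overall flow, as a C sketch (illustrative names; an outline inferred from
 * the code below, not the actual libjpeg-turbo C source): pass 1 walks the
 * 6 columns of the coefficient block into a 6x6 int workspace on the stack,
 * and pass 2 walks the 6 workspace rows into the output rows.
 *
 *   void idct_6x6(short *quant, short *coef, unsigned char **out, int col)
 *   {
 *     int ws[6 * 6], *wsp = ws;
 *     for (int c = 0; c < 6; c++, coef++, quant++, wsp++)
 *       column_pass(coef, quant, wsp);       // stores ws[r*6 + c]
 *     for (int r = 0; r < 6; r++)
 *       row_pass(ws + r * 6, out[r] + col);  // 6 clamped samples per row
 *   }
 */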
3547    .set at
3548
3549    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3550
3551    addiu       sp, sp, -144
3552    move        v0, sp
3553    addiu       v1, v0, 24
3554    addiu       t9, zero, 5793
3555    addiu       s0, zero, 10033
3556    addiu       s1, zero, 2998
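    /*
     * In jidctint.c-style Q13 notation, these constants equal (numerically)
     * 5793 ~= FIX(0.707106781), 10033 ~= FIX(1.224744871), and
     * 2998 ~= FIX(0.366025404), i.e. the multipliers of the scaled 6x6 IDCT.
     */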
3557
35581:
3559    lh          s2, 0(a0)       /* q0 = quantptr[ 0] */
3560    lh          s3, 32(a0)      /* q1 = quantptr[16] */
3561    lh          s4, 64(a0)      /* q2 = quantptr[32] */
3562    lh          t2, 64(a1)      /* tmp2 = inptr[32] */
3563    lh          t1, 32(a1)      /* tmp1 = inptr[16] */
3564    lh          t0, 0(a1)       /* tmp0 = inptr[ 0] */
3565    mul         t2, t2, s4      /* tmp2 = tmp2 * q2 */
3566    mul         t1, t1, s3      /* tmp1 = tmp1 * q1 */
3567    mul         t0, t0, s2      /* tmp0 = tmp0 * q0 */
3568    lh          t6, 16(a1)      /* z1 = inptr[ 8] */
3569    lh          t8, 80(a1)      /* z3 = inptr[40] */
3570    lh          t7, 48(a1)      /* z2 = inptr[24] */
3571    lh          s2, 16(a0)      /* q0 = quantptr[ 8] */
3572    lh          s4, 80(a0)      /* q2 = quantptr[40] */
3573    lh          s3, 48(a0)      /* q1 = quantptr[24] */
3574    mul         t2, t2, t9      /* tmp2 = tmp2 * 5793 */
3575    mul         t1, t1, s0      /* tmp1 = tmp1 * 10033 */
3576    sll         t0, t0, 13      /* tmp0 = tmp0 << 13 */
3577    mul         t6, t6, s2      /* z1 = z1 * q0 */
3578    mul         t8, t8, s4      /* z3 = z3 * q2 */
3579    mul         t7, t7, s3      /* z2 = z2 * q1 */
3580    addu        t3, t0, t2      /* tmp10 = tmp0 + tmp2 */
3581    sll         t2, t2, 1       /* tmp2 = tmp2 << 1 */
3582    subu        t4, t0, t2      /* tmp11 = tmp0 - tmp2 * 2 */
3583    subu        t5, t3, t1      /* tmp12 = tmp10 - tmp1 */
3584    addu        t3, t3, t1      /* tmp10 = tmp10 + tmp1 */
3585    addu        t1, t6, t8      /* tmp1 = z1 + z3 */
3586    mul         t1, t1, s1      /* tmp1 = tmp1 * 2998 */
3587    shra_r.w    t4, t4, 11      /* tmp11 = (tmp11 + 1024) >> 11 */
3588    subu        t2, t6, t8      /* tmp2 = z1 - z3 */
3589    subu        t2, t2, t7      /* tmp2 = tmp2 - z2 */
3590    sll         t2, t2, 2       /* tmp2 = tmp2 << 2 */
3591    addu        t0, t6, t7      /* tmp0 = z1 + z2 */
3592    sll         t0, t0, 13      /* tmp0 = tmp0 << 13 */
3593    subu        s2, t8, t7      /* q0 = z3 - z2 */
3594    sll         s2, s2, 13      /* q0 = q0 << 13 */
3595    addu        t0, t0, t1      /* tmp0 = tmp0 + tmp1 */
3596    addu        t1, s2, t1      /* tmp1 = q0 + tmp1 */
3597    addu        s2, t4, t2      /* q0 = tmp11 + tmp2 */
3598    subu        s3, t4, t2      /* q1 = tmp11 - tmp2 */
3599    addu        t6, t3, t0      /* z1 = tmp10 + tmp0 */
3600    subu        t7, t3, t0      /* z2 = tmp10 - tmp0 */
3601    addu        t4, t5, t1      /* tmp11 = tmp12 + tmp1 */
3602    subu        t5, t5, t1      /* tmp12 = tmp12 - tmp1 */
3603    shra_r.w    t6, t6, 11      /* z1 = (z1 + 1024) >> 11 */
3604    shra_r.w    t7, t7, 11      /* z2 = (z2 + 1024) >> 11 */
3605    shra_r.w    t4, t4, 11      /* tmp11 = (tmp11 + 1024) >> 11 */
3606    shra_r.w    t5, t5, 11      /* tmp12 = (tmp12 + 1024) >> 11 */
3607    sw          s2, 24(v0)
3608    sw          s3, 96(v0)
3609    sw          t6, 0(v0)
3610    sw          t7, 120(v0)
3611    sw          t4, 48(v0)
3612    sw          t5, 72(v0)
3613    addiu       v0, v0, 4
3614    addiu       a1, a1, 2
3615    bne         v0, v1, 1b
3616     addiu      a0, a0, 2
3617
3618    /* Pass 2: process 6 rows from work array, store into output array. */
3619    move        v0, sp
3620    addiu       v1, v0, 144
3621
36222:
3623    lw          t0, 0(v0)
3624    lw          t2, 16(v0)
3625    lw          s5, 0(a2)
3626    addiu       t0, t0, 16
3627    sll         t0, t0, 13
3628    mul         t3, t2, t9
3629    lw          t6, 4(v0)
3630    lw          t8, 20(v0)
3631    lw          t7, 12(v0)
3632    addu        s5, s5, a3
3633    addu        s6, t6, t8
3634    mul         s6, s6, s1
3635    addu        t1, t0, t3
3636    subu        t4, t0, t3
3637    subu        t4, t4, t3
3638    lw          t3, 8(v0)
3639    mul         t0, t3, s0
3640    addu        s7, t6, t7
3641    sll         s7, s7, 13
3642    addu        s7, s6, s7
3643    subu        t2, t8, t7
3644    sll         t2, t2, 13
3645    addu        t2, s6, t2
3646    subu        s6, t6, t7
3647    subu        s6, s6, t8
3648    sll         s6, s6, 13
3649    addu        t3, t1, t0
3650    subu        t5, t1, t0
3651    addu        t6, t3, s7
3652    subu        t3, t3, s7
3653    addu        t7, t4, s6
3654    subu        t4, t4, s6
3655    addu        t8, t5, t2
3656    subu        t5, t5, t2
3657    shll_s.w    t6, t6, 6
3658    shll_s.w    t3, t3, 6
3659    shll_s.w    t7, t7, 6
3660    shll_s.w    t4, t4, 6
3661    shll_s.w    t8, t8, 6
3662    shll_s.w    t5, t5, 6
3663    sra         t6, t6, 24
3664    addiu       t6, t6, 128
3665    sra         t3, t3, 24
3666    addiu       t3, t3, 128
3667    sb          t6, 0(s5)
3668    sra         t7, t7, 24
3669    addiu       t7, t7, 128
3670    sb          t3, 5(s5)
3671    sra         t4, t4, 24
3672    addiu       t4, t4, 128
3673    sb          t7, 1(s5)
3674    sra         t8, t8, 24
3675    addiu       t8, t8, 128
3676    sb          t4, 4(s5)
3677    addiu       v0, v0, 24
3678    sra         t5, t5, 24
3679    addiu       t5, t5, 128
3680    sb          t8, 2(s5)
3681    addiu       a2, a2, 4
3682    bne         v0, v1, 2b
3683     sb         t5, 3(s5)
3684
3685    addiu       sp, sp, 144
3686
3687    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3688
3689    j           ra
3690     nop
3691
3692END(jsimd_idct_6x6_dspr2)
3693
3694
3695/*****************************************************************************/
3696LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2)
3697/*
3698 * a0 = compptr->dct_table
3699 * a1 = coef_block
3700 * a2 = workspace
3701 */
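/*
 * As the store offsets below imply, the workspace is laid out as 12 rows x
 * 8 columns of 32-bit values (row stride 32 bytes).  Each iteration of the
 * loop below fills one column of 12 results; jsimd_idct_12x12_pass2_dspr2
 * then consumes the workspace row by row.
 */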
3702    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
3703
3704    li          a3, 8
3705
37061:
3707    /* odd part */
3708    lh          t0, 48(a1)
3709    lh          t1, 48(a0)
3710    lh          t2, 16(a1)
3711    lh          t3, 16(a0)
3712    lh          t4, 80(a1)
3713    lh          t5, 80(a0)
3714    lh          t6, 112(a1)
3715    lh          t7, 112(a0)
3716    mul         t0, t0, t1      /* z2 */
3717    mul         t1, t2, t3      /* z1 */
3718    mul         t2, t4, t5      /* z3 */
3719    mul         t3, t6, t7      /* z4 */
3720    li          t4, 10703       /* FIX(1.306562965) */
3721    li          t5, 4433        /* FIX_0_541196100 */
3722    li          t6, 7053        /* FIX(0.860918669) */
3723    mul         t4, t0, t4      /* tmp11 */
3724    mul         t5, t0, t5      /* -tmp14 */
3725    addu        t7, t1, t2      /* tmp10 */
3726    addu        t8, t7, t3      /* tmp10 + z4 */
3727    mul         t6, t6, t8      /* tmp15 */
3728    li          t8, 2139        /* FIX(0.261052384) */
3729    mul         t8, t7, t8      /* MULTIPLY(tmp10, FIX(0.261052384)) */
3730    li          t7, 2295        /* FIX(0.280143716) */
3731    mul         t7, t1, t7      /* MULTIPLY(z1, FIX(0.280143716)) */
3732    addu        t9, t2, t3      /* z3 + z4 */
3733    li          s0, 8565        /* FIX(1.045510580) */
3734    mul         t9, t9, s0      /* -tmp13 */
3735    li          s0, 12112       /* FIX(1.478575242) */
3736    mul         s0, t2, s0      /* MULTIPLY(z3, FIX(1.478575242)) */
3737    li          s1, 12998       /* FIX(1.586706681) */
3738    mul         s1, t3, s1      /* MULTIPLY(z4, FIX(1.586706681)) */
3739    li          s2, 5540        /* FIX(0.676326758) */
3740    mul         s2, t1, s2      /* MULTIPLY(z1, FIX(0.676326758)) */
3741    li          s3, 16244       /* FIX(1.982889723) */
3742    mul         s3, t3, s3      /* MULTIPLY(z4, FIX(1.982889723)) */
3743    subu        t1, t1, t3      /* z1 -= z4 */
3744    subu        t0, t0, t2      /* z2 -= z3 */
3745    addu        t2, t0, t1      /* z1+z2 */
3746    li          t3, 4433        /* FIX_0_541196100 */
3747    mul         t2, t2, t3      /* z3 */
3748    li          t3, 6270        /* FIX_0_765366865 */
3749    mul         t1, t1, t3      /* MULTIPLY(z1, FIX_0_765366865) */
3750    li          t3, 15137       /* FIX_1_847759065 */
3751    mul         t0, t0, t3      /* MULTIPLY(z2, FIX_1_847759065) */
3752    addu        t8, t6, t8      /* tmp12 */
3753    addu        t3, t8, t4      /* tmp12 + tmp11 */
3754    addu        t3, t3, t7      /* tmp10 */
3755    subu        t8, t8, t9      /* tmp12 + tmp13 */
3756    addu        s0, t5, s0
3757    subu        t8, t8, s0      /* tmp12 */
3758    subu        t9, t6, t9
3759    subu        s1, s1, t4
3760    addu        t9, t9, s1      /* tmp13 */
3761    subu        t6, t6, t5
3762    subu        t6, t6, s2
3763    subu        t6, t6, s3      /* tmp15 */
3764    /* even part start */
3765    lh          t4, 64(a1)
3766    lh          t5, 64(a0)
3767    lh          t7, 32(a1)
3768    lh          s0, 32(a0)
3769    lh          s1, 0(a1)
3770    lh          s2, 0(a0)
3771    lh          s3, 96(a1)
3772    lh          v0, 96(a0)
3773    mul         t4, t4, t5      /* DEQUANTIZE(inptr[DCTSIZE*4],
3774                                              quantptr[DCTSIZE*4]) */
3775    mul         t5, t7, s0      /* DEQUANTIZE(inptr[DCTSIZE*2],
3776                                              quantptr[DCTSIZE*2]) */
3777    mul         t7, s1, s2      /* DEQUANTIZE(inptr[DCTSIZE*0],
3778                                              quantptr[DCTSIZE*0]) */
3779    mul         s0, s3, v0      /* DEQUANTIZE(inptr[DCTSIZE*6],
3780                                              quantptr[DCTSIZE*6]) */
3781    /* odd part end */
3782    addu        t1, t2, t1      /* tmp11 */
3783    subu        t0, t2, t0      /* tmp14 */
3784    /* update counter and pointers */
3785    addiu       a3, a3, -1
3786    addiu       a0, a0, 2
3787    addiu       a1, a1, 2
3788    /* even part rest */
3789    li          s1, 10033
3790    li          s2, 11190
3791    mul         t4, t4, s1      /* z4 */
3792    mul         s1, t5, s2      /* z4 */
3793    sll         t5, t5, 13      /* z1 */
3794    sll         t7, t7, 13
3795    addiu       t7, t7, 1024    /* z3, +1024 = rounding for >> 11 */
3796    sll         s0, s0, 13      /* z2 */
3797    addu        s2, t7, t4      /* tmp10 */
3798    subu        t4, t7, t4      /* tmp11 */
3799    subu        s3, t5, s0      /* tmp12 */
3800    addu        t2, t7, s3      /* tmp21 */
3801    subu        s3, t7, s3      /* tmp24 */
3802    addu        t7, s1, s0      /* tmp12 */
3803    addu        v0, s2, t7      /* tmp20 */
3804    subu        s2, s2, t7      /* tmp25 */
3805    subu        s1, s1, t5      /* z4 - z1 */
3806    subu        s1, s1, s0      /* tmp12 */
3807    addu        s0, t4, s1      /* tmp22 */
3808    subu        t4, t4, s1      /* tmp23 */
3809    /* final output stage */
3810    addu        t5, v0, t3
3811    subu        v0, v0, t3
3812    addu        t3, t2, t1
3813    subu        t2, t2, t1
3814    addu        t1, s0, t8
3815    subu        s0, s0, t8
3816    addu        t8, t4, t9
3817    subu        t4, t4, t9
3818    addu        t9, s3, t0
3819    subu        s3, s3, t0
3820    addu        t0, s2, t6
3821    subu        s2, s2, t6
3822    sra         t5, t5, 11
3823    sra         t3, t3, 11
3824    sra         t1, t1, 11
3825    sra         t8, t8, 11
3826    sra         t9, t9, 11
3827    sra         t0, t0, 11
3828    sra         s2, s2, 11
3829    sra         s3, s3, 11
3830    sra         t4, t4, 11
3831    sra         s0, s0, 11
3832    sra         t2, t2, 11
3833    sra         v0, v0, 11
3834    sw          t5, 0(a2)
3835    sw          t3, 32(a2)
3836    sw          t1, 64(a2)
3837    sw          t8, 96(a2)
3838    sw          t9, 128(a2)
3839    sw          t0, 160(a2)
3840    sw          s2, 192(a2)
3841    sw          s3, 224(a2)
3842    sw          t4, 256(a2)
3843    sw          s0, 288(a2)
3844    sw          t2, 320(a2)
3845    sw          v0, 352(a2)
3846    bgtz        a3, 1b
3847     addiu      a2, a2, 4
3848
3849    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
3850
3851    j           ra
3852     nop
3853
3854END(jsimd_idct_12x12_pass1_dspr2)
3855
3856
3857/*****************************************************************************/
3858LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2)
3859/*
3860 * a0 = workspace
3861 * a1 = output
3862 */
3863    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
3864
3865    li          a3, 12
3866
38671:
3868    /* Odd part */
3869    lw          t0, 12(a0)
3870    lw          t1, 4(a0)
3871    lw          t2, 20(a0)
3872    lw          t3, 28(a0)
3873    li          t4, 10703       /* FIX(1.306562965) */
3874    li          t5, 4433        /* FIX_0_541196100 */
3875    mul         t4, t0, t4      /* tmp11 */
3876    mul         t5, t0, t5      /* -tmp14 */
3877    addu        t6, t1, t2      /* tmp10 */
3878    li          t7, 2139        /* FIX(0.261052384) */
3879    mul         t7, t6, t7      /* MULTIPLY(tmp10, FIX(0.261052384)) */
3880    addu        t6, t6, t3      /* tmp10 + z4 */
3881    li          t8, 7053        /* FIX(0.860918669) */
3882    mul         t6, t6, t8      /* tmp15 */
3883    li          t8, 2295        /* FIX(0.280143716) */
3884    mul         t8, t1, t8      /* MULTIPLY(z1, FIX(0.280143716)) */
3885    addu        t9, t2, t3      /* z3 + z4 */
3886    li          s0, 8565        /* FIX(1.045510580) */
3887    mul         t9, t9, s0      /* -tmp13 */
3888    li          s0, 12112       /* FIX(1.478575242) */
3889    mul         s0, t2, s0      /* MULTIPLY(z3, FIX(1.478575242)) */
3890    li          s1, 12998       /* FIX(1.586706681) */
3891    mul         s1, t3, s1      /* MULTIPLY(z4, FIX(1.586706681)) */
3892    li          s2, 5540        /* FIX(0.676326758) */
3893    mul         s2, t1, s2      /* MULTIPLY(z1, FIX(0.676326758)) */
3894    li          s3, 16244       /* FIX(1.982889723) */
3895    mul         s3, t3, s3      /* MULTIPLY(z4, FIX(1.982889723)) */
3896    subu        t1, t1, t3      /* z1 -= z4 */
3897    subu        t0, t0, t2      /* z2 -= z3 */
3898    addu        t2, t1, t0      /* z1 + z2 */
3899    li          t3, 4433        /* FIX_0_541196100 */
3900    mul         t2, t2, t3      /* z3 */
3901    li          t3, 6270        /* FIX_0_765366865 */
3902    mul         t1, t1, t3      /* MULTIPLY(z1, FIX_0_765366865) */
3903    li          t3, 15137       /* FIX_1_847759065 */
3904    mul         t0, t0, t3      /* MULTIPLY(z2, FIX_1_847759065) */
3905    addu        t3, t6, t7      /* tmp12 */
3906    addu        t7, t3, t4
3907    addu        t7, t7, t8      /* tmp10 */
3908    subu        t3, t3, t9
3909    subu        t3, t3, t5
3910    subu        t3, t3, s0      /* tmp12 */
3911    subu        t9, t6, t9
3912    subu        t9, t9, t4
3913    addu        t9, t9, s1      /* tmp13 */
3914    subu        t6, t6, t5
3915    subu        t6, t6, s2
3916    subu        t6, t6, s3      /* tmp15 */
3917    addu        t1, t2, t1      /* tmp11 */
3918    subu        t0, t2, t0      /* tmp14 */
3919    /* even part */
3920    lw          t2, 16(a0)      /* z4 */
3921    lw          t4, 8(a0)       /* z1 */
3922    lw          t5, 0(a0)       /* z3 */
3923    lw          t8, 24(a0)      /* z2 */
3924    li          s0, 10033       /* FIX(1.224744871) */
3925    li          s1, 11190       /* FIX(1.366025404) */
3926    mul         t2, t2, s0      /* z4 */
3927    mul         s0, t4, s1      /* z4 */
3928    addiu       t5, t5, 0x10    /* + rounding (16 << 13 == 1 << 17) */
3929    sll         t5, t5, 13      /* z3 */
3930    sll         t4, t4, 13      /* z1 */
3931    sll         t8, t8, 13      /* z2 */
3932    subu        s1, t4, t8      /* tmp12 */
3933    addu        s2, t5, t2      /* tmp10 */
3934    subu        t2, t5, t2      /* tmp11 */
3935    addu        s3, t5, s1      /* tmp21 */
3936    subu        s1, t5, s1      /* tmp24 */
3937    addu        t5, s0, t8      /* tmp12 */
3938    addu        v0, s2, t5      /* tmp20 */
3939    subu        t5, s2, t5      /* tmp25 */
3940    subu        t4, s0, t4
3941    subu        t4, t4, t8      /* tmp12 */
3942    addu        t8, t2, t4      /* tmp22 */
3943    subu        t2, t2, t4      /* tmp23 */
3944    /* update counter and pointers */
3945    addiu       a3, a3, -1
3946    addiu       a0, a0, 32
3947    /* Final stage */
3948    addu        t4, v0, t7
3949    subu        v0, v0, t7
3950    addu        t7, s3, t1
3951    subu        s3, s3, t1
3952    addu        t1, t8, t3
3953    subu        t8, t8, t3
3954    addu        t3, t2, t9
3955    subu        t2, t2, t9
3956    addu        t9, s1, t0
3957    subu        s1, s1, t0
3958    addu        t0, t5, t6
3959    subu        t5, t5, t6
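    /*
     * The sequence below descales and range-limits entirely in registers:
     * sll by 4 followed by shll_s.w by 2 saturates (x << 6) to 32 bits,
     * srl by 24 keeps the top byte (i.e. x >> 18 for in-range values,
     * matching the 1 << 17 rounding bias added in the even part), and the
     * +0x80 recenters the signed result to an unsigned sample.
     */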
3960    sll         t4, t4, 4
3961    sll         t7, t7, 4
3962    sll         t1, t1, 4
3963    sll         t3, t3, 4
3964    sll         t9, t9, 4
3965    sll         t0, t0, 4
3966    sll         t5, t5, 4
3967    sll         s1, s1, 4
3968    sll         t2, t2, 4
3969    sll         t8, t8, 4
3970    sll         s3, s3, 4
3971    sll         v0, v0, 4
3972    shll_s.w    t4, t4, 2
3973    shll_s.w    t7, t7, 2
3974    shll_s.w    t1, t1, 2
3975    shll_s.w    t3, t3, 2
3976    shll_s.w    t9, t9, 2
3977    shll_s.w    t0, t0, 2
3978    shll_s.w    t5, t5, 2
3979    shll_s.w    s1, s1, 2
3980    shll_s.w    t2, t2, 2
3981    shll_s.w    t8, t8, 2
3982    shll_s.w    s3, s3, 2
3983    shll_s.w    v0, v0, 2
3984    srl         t4, t4, 24
3985    srl         t7, t7, 24
3986    srl         t1, t1, 24
3987    srl         t3, t3, 24
3988    srl         t9, t9, 24
3989    srl         t0, t0, 24
3990    srl         t5, t5, 24
3991    srl         s1, s1, 24
3992    srl         t2, t2, 24
3993    srl         t8, t8, 24
3994    srl         s3, s3, 24
3995    srl         v0, v0, 24
3996    lw          t6, 0(a1)
3997    addiu       t4, t4, 0x80
3998    addiu       t7, t7, 0x80
3999    addiu       t1, t1, 0x80
4000    addiu       t3, t3, 0x80
4001    addiu       t9, t9, 0x80
4002    addiu       t0, t0, 0x80
4003    addiu       t5, t5, 0x80
4004    addiu       s1, s1, 0x80
4005    addiu       t2, t2, 0x80
4006    addiu       t8, t8, 0x80
4007    addiu       s3, s3, 0x80
4008    addiu       v0, v0, 0x80
4009    sb          t4, 0(t6)
4010    sb          t7, 1(t6)
4011    sb          t1, 2(t6)
4012    sb          t3, 3(t6)
4013    sb          t9, 4(t6)
4014    sb          t0, 5(t6)
4015    sb          t5, 6(t6)
4016    sb          s1, 7(t6)
4017    sb          t2, 8(t6)
4018    sb          t8, 9(t6)
4019    sb          s3, 10(t6)
4020    sb          v0, 11(t6)
4021    bgtz        a3, 1b
4022     addiu      a1, a1, 4
4023
4024    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
4025
4026    jr          ra
4027     nop
4028
4029END(jsimd_idct_12x12_pass2_dspr2)
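/*
 * A hypothetical C caller for the two passes above (an illustrative sketch;
 * the real dispatcher lives elsewhere in libjpeg-turbo):
 *
 *   void idct_12x12(void *dct_table, short *coef_block,
 *                   unsigned char **output_buf, unsigned int output_col)
 *   {
 *     int workspace[12 * 8];                 // pass-1 results, 12x8 words
 *     unsigned char *out[12];
 *     for (int i = 0; i < 12; i++)
 *       out[i] = output_buf[i] + output_col; // pass 2 takes row pointers
 *     jsimd_idct_12x12_pass1_dspr2(dct_table, coef_block, workspace);
 *     jsimd_idct_12x12_pass2_dspr2(workspace, out);
 *   }
 */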
4030
4031
4032/*****************************************************************************/
4033LEAF_DSPR2(jsimd_convsamp_dspr2)
4034/*
4035 * a0 = sample_data
4036 * a1 = start_col
4037 * a2 = workspace
4038 */
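/*
 * What this computes, as a C sketch (illustrative; it mirrors the generic
 * convsamp in jcdctmgr.c): load 8 rows of 8 samples and level-shift each by
 * CENTERJSAMPLE.  The 0xff80ff80 constant below is -128 replicated into
 * both 16-bit lanes, so each addu.ph subtracts 128 from two samples at once.
 *
 *   void convsamp(unsigned char **sample_data, int start_col, short *ws)
 *   {
 *     for (int r = 0; r < 8; r++) {
 *       unsigned char *p = sample_data[r] + start_col;
 *       for (int c = 0; c < 8; c++)
 *         *ws++ = (short)(p[c] - 128);   // 128 == CENTERJSAMPLE
 *     }
 *   }
 */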
4039    lw            t0, 0(a0)
4040    li            t7, 0xff80ff80
4041    addu          t0, t0, a1
4042    ulw           t1, 0(t0)
4043    ulw           t2, 4(t0)
4044    preceu.ph.qbr t3, t1
4045    preceu.ph.qbl t4, t1
4046    lw            t0, 4(a0)
4047    preceu.ph.qbr t5, t2
4048    preceu.ph.qbl t6, t2
4049    addu          t0, t0, a1
4050    addu.ph       t3, t3, t7
4051    addu.ph       t4, t4, t7
4052    ulw           t1, 0(t0)
4053    ulw           t2, 4(t0)
4054    addu.ph       t5, t5, t7
4055    addu.ph       t6, t6, t7
4056    usw           t3, 0(a2)
4057    usw           t4, 4(a2)
4058    preceu.ph.qbr t3, t1
4059    preceu.ph.qbl t4, t1
4060    usw           t5, 8(a2)
4061    usw           t6, 12(a2)
4062
4063    lw            t0, 8(a0)
4064    preceu.ph.qbr t5, t2
4065    preceu.ph.qbl t6, t2
4066    addu          t0, t0, a1
4067    addu.ph       t3, t3, t7
4068    addu.ph       t4, t4, t7
4069    ulw           t1, 0(t0)
4070    ulw           t2, 4(t0)
4071    addu.ph       t5, t5, t7
4072    addu.ph       t6, t6, t7
4073    usw           t3, 16(a2)
4074    usw           t4, 20(a2)
4075    preceu.ph.qbr t3, t1
4076    preceu.ph.qbl t4, t1
4077    usw           t5, 24(a2)
4078    usw           t6, 28(a2)
4079
4080    lw            t0, 12(a0)
4081    preceu.ph.qbr t5, t2
4082    preceu.ph.qbl t6, t2
4083    addu          t0, t0, a1
4084    addu.ph       t3, t3, t7
4085    addu.ph       t4, t4, t7
4086    ulw           t1, 0(t0)
4087    ulw           t2, 4(t0)
4088    addu.ph       t5, t5, t7
4089    addu.ph       t6, t6, t7
4090    usw           t3, 32(a2)
4091    usw           t4, 36(a2)
4092    preceu.ph.qbr t3, t1
4093    preceu.ph.qbl t4, t1
4094    usw           t5, 40(a2)
4095    usw           t6, 44(a2)
4096
4097    lw            t0, 16(a0)
4098    preceu.ph.qbr t5, t2
4099    preceu.ph.qbl t6, t2
4100    addu          t0, t0, a1
4101    addu.ph       t3, t3, t7
4102    addu.ph       t4, t4, t7
4103    ulw           t1, 0(t0)
4104    ulw           t2, 4(t0)
4105    addu.ph       t5, t5, t7
4106    addu.ph       t6, t6, t7
4107    usw           t3, 48(a2)
4108    usw           t4, 52(a2)
4109    preceu.ph.qbr t3, t1
4110    preceu.ph.qbl t4, t1
4111    usw           t5, 56(a2)
4112    usw           t6, 60(a2)
4113
4114    lw            t0, 20(a0)
4115    preceu.ph.qbr t5, t2
4116    preceu.ph.qbl t6, t2
4117    addu          t0, t0, a1
4118    addu.ph       t3, t3, t7
4119    addu.ph       t4, t4, t7
4120    ulw           t1, 0(t0)
4121    ulw           t2, 4(t0)
4122    addu.ph       t5, t5, t7
4123    addu.ph       t6, t6, t7
4124    usw           t3, 64(a2)
4125    usw           t4, 68(a2)
4126    preceu.ph.qbr t3, t1
4127    preceu.ph.qbl t4, t1
4128    usw           t5, 72(a2)
4129    usw           t6, 76(a2)
4130
4131    lw            t0, 24(a0)
4132    preceu.ph.qbr t5, t2
4133    preceu.ph.qbl t6, t2
4134    addu          t0, t0, a1
4135    addu.ph       t3, t3, t7
4136    addu.ph       t4, t4, t7
4137    ulw           t1, 0(t0)
4138    ulw           t2, 4(t0)
4139    addu.ph       t5, t5, t7
4140    addu.ph       t6, t6, t7
4141    usw           t3, 80(a2)
4142    usw           t4, 84(a2)
4143    preceu.ph.qbr t3, t1
4144    preceu.ph.qbl t4, t1
4145    usw           t5, 88(a2)
4146    usw           t6, 92(a2)
4147
4148    lw            t0, 28(a0)
4149    preceu.ph.qbr t5, t2
4150    preceu.ph.qbl t6, t2
4151    addu          t0, t0, a1
4152    addu.ph       t3, t3, t7
4153    addu.ph       t4, t4, t7
4154    ulw           t1, 0(t0)
4155    ulw           t2, 4(t0)
4156    addu.ph       t5, t5, t7
4157    addu.ph       t6, t6, t7
4158    usw           t3, 96(a2)
4159    usw           t4, 100(a2)
4160    preceu.ph.qbr t3, t1
4161    preceu.ph.qbl t4, t1
4162    usw           t5, 104(a2)
4163    usw           t6, 108(a2)
4164    preceu.ph.qbr t5, t2
4165    preceu.ph.qbl t6, t2
4166    addu.ph       t3, t3, t7
4167    addu.ph       t4, t4, t7
4168    addu.ph       t5, t5, t7
4169    addu.ph       t6, t6, t7
4170    usw           t3, 112(a2)
4171    usw           t4, 116(a2)
4172    usw           t5, 120(a2)
4173    usw           t6, 124(a2)
4174
4175    j             ra
4176     nop
4177
4178END(jsimd_convsamp_dspr2)
4179
4180
4181#ifndef __mips_soft_float
4182
4183/*****************************************************************************/
4184LEAF_DSPR2(jsimd_convsamp_float_dspr2)
4185/*
4186 * a0 = sample_data
4187 * a1 = start_col
4188 * a2 = workspace
4189 */
4190    .set at
4191
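/*
 * C sketch of the conversion done below (illustrative only): each sample is
 * level-shifted by 128 and converted to float, eight rows of eight, into a
 * contiguous 64-entry workspace.
 *
 *   void convsamp_float(unsigned char **sample_data, int start_col,
 *                       float *ws)
 *   {
 *     for (int r = 0; r < 8; r++)
 *       for (int c = 0; c < 8; c++)
 *         *ws++ = (float)(sample_data[r][start_col + c] - 128);
 *   }
 */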
4192    lw          t0, 0(a0)
4193    addu        t0, t0, a1
4194    lbu         t1, 0(t0)
4195    lbu         t2, 1(t0)
4196    lbu         t3, 2(t0)
4197    lbu         t4, 3(t0)
4198    lbu         t5, 4(t0)
4199    lbu         t6, 5(t0)
4200    lbu         t7, 6(t0)
4201    lbu         t8, 7(t0)
4202    addiu       t1, t1, -128
4203    addiu       t2, t2, -128
4204    addiu       t3, t3, -128
4205    addiu       t4, t4, -128
4206    addiu       t5, t5, -128
4207    addiu       t6, t6, -128
4208    addiu       t7, t7, -128
4209    addiu       t8, t8, -128
4210    mtc1        t1, f2
4211    mtc1        t2, f4
4212    mtc1        t3, f6
4213    mtc1        t4, f8
4214    mtc1        t5, f10
4215    mtc1        t6, f12
4216    mtc1        t7, f14
4217    mtc1        t8, f16
4218    cvt.s.w     f2, f2
4219    cvt.s.w     f4, f4
4220    cvt.s.w     f6, f6
4221    cvt.s.w     f8, f8
4222    cvt.s.w     f10, f10
4223    cvt.s.w     f12, f12
4224    cvt.s.w     f14, f14
4225    cvt.s.w     f16, f16
4226    lw          t0, 4(a0)
4227    swc1        f2, 0(a2)
4228    swc1        f4, 4(a2)
4229    swc1        f6, 8(a2)
4230    addu        t0, t0, a1
4231    swc1        f8, 12(a2)
4232    swc1        f10, 16(a2)
4233    swc1        f12, 20(a2)
4234    swc1        f14, 24(a2)
4235    swc1        f16, 28(a2)
4236    /* element row 1 */
4237    lbu         t1, 0(t0)
4238    lbu         t2, 1(t0)
4239    lbu         t3, 2(t0)
4240    lbu         t4, 3(t0)
4241    lbu         t5, 4(t0)
4242    lbu         t6, 5(t0)
4243    lbu         t7, 6(t0)
4244    lbu         t8, 7(t0)
4245    addiu       t1, t1, -128
4246    addiu       t2, t2, -128
4247    addiu       t3, t3, -128
4248    addiu       t4, t4, -128
4249    addiu       t5, t5, -128
4250    addiu       t6, t6, -128
4251    addiu       t7, t7, -128
4252    addiu       t8, t8, -128
4253    mtc1        t1, f2
4254    mtc1        t2, f4
4255    mtc1        t3, f6
4256    mtc1        t4, f8
4257    mtc1        t5, f10
4258    mtc1        t6, f12
4259    mtc1        t7, f14
4260    mtc1        t8, f16
4261    cvt.s.w     f2, f2
4262    cvt.s.w     f4, f4
4263    cvt.s.w     f6, f6
4264    cvt.s.w     f8, f8
4265    cvt.s.w     f10, f10
4266    cvt.s.w     f12, f12
4267    cvt.s.w     f14, f14
4268    cvt.s.w     f16, f16
4269    lw          t0, 8(a0)
4270    swc1        f2, 32(a2)
4271    swc1        f4, 36(a2)
4272    swc1        f6, 40(a2)
4273    addu        t0, t0, a1
4274    swc1        f8, 44(a2)
4275    swc1        f10, 48(a2)
4276    swc1        f12, 52(a2)
4277    swc1        f14, 56(a2)
4278    swc1        f16, 60(a2)
4279    /* element row 2 */
4280    lbu         t1, 0(t0)
4281    lbu         t2, 1(t0)
4282    lbu         t3, 2(t0)
4283    lbu         t4, 3(t0)
4284    lbu         t5, 4(t0)
4285    lbu         t6, 5(t0)
4286    lbu         t7, 6(t0)
4287    lbu         t8, 7(t0)
4288    addiu       t1, t1, -128
4289    addiu       t2, t2, -128
4290    addiu       t3, t3, -128
4291    addiu       t4, t4, -128
4292    addiu       t5, t5, -128
4293    addiu       t6, t6, -128
4294    addiu       t7, t7, -128
4295    addiu       t8, t8, -128
4296    mtc1        t1, f2
4297    mtc1        t2, f4
4298    mtc1        t3, f6
4299    mtc1        t4, f8
4300    mtc1        t5, f10
4301    mtc1        t6, f12
4302    mtc1        t7, f14
4303    mtc1        t8, f16
4304    cvt.s.w     f2, f2
4305    cvt.s.w     f4, f4
4306    cvt.s.w     f6, f6
4307    cvt.s.w     f8, f8
4308    cvt.s.w     f10, f10
4309    cvt.s.w     f12, f12
4310    cvt.s.w     f14, f14
4311    cvt.s.w     f16, f16
4312    lw          t0, 12(a0)
4313    swc1        f2, 64(a2)
4314    swc1        f4, 68(a2)
4315    swc1        f6, 72(a2)
4316    addu        t0, t0, a1
4317    swc1        f8, 76(a2)
4318    swc1        f10, 80(a2)
4319    swc1        f12, 84(a2)
4320    swc1        f14, 88(a2)
4321    swc1        f16, 92(a2)
4322    /* element row 3 */
4323    lbu         t1, 0(t0)
4324    lbu         t2, 1(t0)
4325    lbu         t3, 2(t0)
4326    lbu         t4, 3(t0)
4327    lbu         t5, 4(t0)
4328    lbu         t6, 5(t0)
4329    lbu         t7, 6(t0)
4330    lbu         t8, 7(t0)
4331    addiu       t1, t1, -128
4332    addiu       t2, t2, -128
4333    addiu       t3, t3, -128
4334    addiu       t4, t4, -128
4335    addiu       t5, t5, -128
4336    addiu       t6, t6, -128
4337    addiu       t7, t7, -128
4338    addiu       t8, t8, -128
4339    mtc1        t1, f2
4340    mtc1        t2, f4
4341    mtc1        t3, f6
4342    mtc1        t4, f8
4343    mtc1        t5, f10
4344    mtc1        t6, f12
4345    mtc1        t7, f14
4346    mtc1        t8, f16
4347    cvt.s.w     f2, f2
4348    cvt.s.w     f4, f4
4349    cvt.s.w     f6, f6
4350    cvt.s.w     f8, f8
4351    cvt.s.w     f10, f10
4352    cvt.s.w     f12, f12
4353    cvt.s.w     f14, f14
4354    cvt.s.w     f16, f16
4355    lw          t0, 16(a0)
4356    swc1        f2, 96(a2)
4357    swc1        f4, 100(a2)
4358    swc1        f6, 104(a2)
4359    addu        t0, t0, a1
4360    swc1        f8, 108(a2)
4361    swc1        f10, 112(a2)
4362    swc1        f12, 116(a2)
4363    swc1        f14, 120(a2)
4364    swc1        f16, 124(a2)
4365    /* element row 4 */
4366    lbu         t1, 0(t0)
4367    lbu         t2, 1(t0)
4368    lbu         t3, 2(t0)
4369    lbu         t4, 3(t0)
4370    lbu         t5, 4(t0)
4371    lbu         t6, 5(t0)
4372    lbu         t7, 6(t0)
4373    lbu         t8, 7(t0)
4374    addiu       t1, t1, -128
4375    addiu       t2, t2, -128
4376    addiu       t3, t3, -128
4377    addiu       t4, t4, -128
4378    addiu       t5, t5, -128
4379    addiu       t6, t6, -128
4380    addiu       t7, t7, -128
4381    addiu       t8, t8, -128
4382    mtc1        t1, f2
4383    mtc1        t2, f4
4384    mtc1        t3, f6
4385    mtc1        t4, f8
4386    mtc1        t5, f10
4387    mtc1        t6, f12
4388    mtc1        t7, f14
4389    mtc1        t8, f16
4390    cvt.s.w     f2, f2
4391    cvt.s.w     f4, f4
4392    cvt.s.w     f6, f6
4393    cvt.s.w     f8, f8
4394    cvt.s.w     f10, f10
4395    cvt.s.w     f12, f12
4396    cvt.s.w     f14, f14
4397    cvt.s.w     f16, f16
4398    lw          t0, 20(a0)
4399    swc1        f2, 128(a2)
4400    swc1        f4, 132(a2)
4401    swc1        f6, 136(a2)
4402    addu        t0, t0, a1
4403    swc1        f8, 140(a2)
4404    swc1        f10, 144(a2)
4405    swc1        f12, 148(a2)
4406    swc1        f14, 152(a2)
4407    swc1        f16, 156(a2)
4408    /* element row 5 */
4409    lbu         t1, 0(t0)
4410    lbu         t2, 1(t0)
4411    lbu         t3, 2(t0)
4412    lbu         t4, 3(t0)
4413    lbu         t5, 4(t0)
4414    lbu         t6, 5(t0)
4415    lbu         t7, 6(t0)
4416    lbu         t8, 7(t0)
4417    addiu       t1, t1, -128
4418    addiu       t2, t2, -128
4419    addiu       t3, t3, -128
4420    addiu       t4, t4, -128
4421    addiu       t5, t5, -128
4422    addiu       t6, t6, -128
4423    addiu       t7, t7, -128
4424    addiu       t8, t8, -128
4425    mtc1        t1, f2
4426    mtc1        t2, f4
4427    mtc1        t3, f6
4428    mtc1        t4, f8
4429    mtc1        t5, f10
4430    mtc1        t6, f12
4431    mtc1        t7, f14
4432    mtc1        t8, f16
4433    cvt.s.w     f2, f2
4434    cvt.s.w     f4, f4
4435    cvt.s.w     f6, f6
4436    cvt.s.w     f8, f8
4437    cvt.s.w     f10, f10
4438    cvt.s.w     f12, f12
4439    cvt.s.w     f14, f14
4440    cvt.s.w     f16, f16
4441    lw          t0, 24(a0)
4442    swc1        f2, 160(a2)
4443    swc1        f4, 164(a2)
4444    swc1        f6, 168(a2)
4445    addu        t0, t0, a1
4446    swc1        f8, 172(a2)
4447    swc1        f10, 176(a2)
4448    swc1        f12, 180(a2)
4449    swc1        f14, 184(a2)
4450    swc1        f16, 188(a2)
4451    /* element row 6 */
4452    lbu         t1, 0(t0)
4453    lbu         t2, 1(t0)
4454    lbu         t3, 2(t0)
4455    lbu         t4, 3(t0)
4456    lbu         t5, 4(t0)
4457    lbu         t6, 5(t0)
4458    lbu         t7, 6(t0)
4459    lbu         t8, 7(t0)
4460    addiu       t1, t1, -128
4461    addiu       t2, t2, -128
4462    addiu       t3, t3, -128
4463    addiu       t4, t4, -128
4464    addiu       t5, t5, -128
4465    addiu       t6, t6, -128
4466    addiu       t7, t7, -128
4467    addiu       t8, t8, -128
4468    mtc1        t1, f2
4469    mtc1        t2, f4
4470    mtc1        t3, f6
4471    mtc1        t4, f8
4472    mtc1        t5, f10
4473    mtc1        t6, f12
4474    mtc1        t7, f14
4475    mtc1        t8, f16
4476    cvt.s.w     f2, f2
4477    cvt.s.w     f4, f4
4478    cvt.s.w     f6, f6
4479    cvt.s.w     f8, f8
4480    cvt.s.w     f10, f10
4481    cvt.s.w     f12, f12
4482    cvt.s.w     f14, f14
4483    cvt.s.w     f16, f16
4484    lw          t0, 28(a0)
4485    swc1        f2, 192(a2)
4486    swc1        f4, 196(a2)
4487    swc1        f6, 200(a2)
4488    addu        t0, t0, a1
4489    swc1        f8, 204(a2)
4490    swc1        f10, 208(a2)
4491    swc1        f12, 212(a2)
4492    swc1        f14, 216(a2)
4493    swc1        f16, 220(a2)
4494    /* element row 7 */
4495    lbu         t1, 0(t0)
4496    lbu         t2, 1(t0)
4497    lbu         t3, 2(t0)
4498    lbu         t4, 3(t0)
4499    lbu         t5, 4(t0)
4500    lbu         t6, 5(t0)
4501    lbu         t7, 6(t0)
4502    lbu         t8, 7(t0)
4503    addiu       t1, t1, -128
4504    addiu       t2, t2, -128
4505    addiu       t3, t3, -128
4506    addiu       t4, t4, -128
4507    addiu       t5, t5, -128
4508    addiu       t6, t6, -128
4509    addiu       t7, t7, -128
4510    addiu       t8, t8, -128
4511    mtc1        t1, f2
4512    mtc1        t2, f4
4513    mtc1        t3, f6
4514    mtc1        t4, f8
4515    mtc1        t5, f10
4516    mtc1        t6, f12
4517    mtc1        t7, f14
4518    mtc1        t8, f16
4519    cvt.s.w     f2, f2
4520    cvt.s.w     f4, f4
4521    cvt.s.w     f6, f6
4522    cvt.s.w     f8, f8
4523    cvt.s.w     f10, f10
4524    cvt.s.w     f12, f12
4525    cvt.s.w     f14, f14
4526    cvt.s.w     f16, f16
4527    swc1        f2, 224(a2)
4528    swc1        f4, 228(a2)
4529    swc1        f6, 232(a2)
4530    swc1        f8, 236(a2)
4531    swc1        f10, 240(a2)
4532    swc1        f12, 244(a2)
4533    swc1        f14, 248(a2)
4534    swc1        f16, 252(a2)
4535
4536    j           ra
4537     nop
4538
4539END(jsimd_convsamp_float_dspr2)
4540
4541#endif
4542
4543/*****************************************************************************/
4544