/* (HTML code-viewer navigation chrome removed — not part of the source) */
/*
 * MIPS DSPr2 optimizations for libjpeg-turbo
 *
 * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
 *                          All Rights Reserved.
 * Authors:  Teodora Novkovic <teodora.novkovic@imgtec.com>
 *           Darko Laus       <darko.laus@imgtec.com>
 * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#include "jsimd_dspr2_asm.h"


/*****************************************************************************/
LEAF_DSPR2(jsimd_c_null_convert_dspr2)
/*
 * Null conversion for compression: de-interleaves each component of the
 * input rows into the per-component output planes, with no arithmetic.
 *
 * a0     = cinfo->image_width
 * a1     = input_buf
 * a2     = output_buf
 * a3     = output_row
 * 16(sp) = num_rows
 * 20(sp) = cinfo->num_components
 */
    SAVE_REGS_ON_STACK 8, s0, s1

    lw          t9, 24(sp)      // t9 = num_rows (stack args moved by 8-byte save)
    lw          s0, 28(sp)      // s0 = cinfo->num_components
    andi        t0, a0, 3       // t0 = cinfo->image_width & 3 (residual samples)
    beqz        t0, 4f          // width is a multiple of 4: no residual path
     nop
0:                              // per-row loop (variant with residual section)
    addiu       t9, t9, -1
    bltz        t9, 7f          // all rows done
     li         t1, 0           // t1 = ci (component index), set in delay slot
1:                              // per-component loop
    sll         t3, t1, 2
    lwx         t5, t3(a2)      // t5 = outptr = output_buf[ci]
    lw          t2, 0(a1)       // t2 = inptr = *input_buf
    sll         t4, a3, 2
    lwx         t5, t4(t5)      // t5 = outptr = output_buf[ci][output_row]
    addu        t2, t2, t1      // point inptr at the ci-th interleaved sample
    addu        s1, t5, a0      // s1 = end of output row
    addu        t6, t5, t0      // t6 = end of residual section
2:                              // residual loop: width & 3 samples, one at a time
    lbu         t3, 0(t2)
    addiu       t5, t5, 1
    sb          t3, -1(t5)
    bne         t6, t5, 2b
     addu       t2, t2, s0      // inptr += num_components (delay slot)
3:                              // main loop: 4 samples per iteration
    lbu         t3, 0(t2)
    addu        t4, t2, s0
    addu        t7, t4, s0
    addu        t8, t7, s0
    addu        t2, t8, s0      // inptr advanced by 4 * num_components
    lbu         t4, 0(t4)
    lbu         t7, 0(t7)
    lbu         t8, 0(t8)
    addiu       t5, t5, 4
    sb          t3, -4(t5)
    sb          t4, -3(t5)
    sb          t7, -2(t5)
    bne         s1, t5, 3b
     sb         t8, -1(t5)      // fourth store issued in the delay slot
    addiu       t1, t1, 1
    bne         t1, s0, 1b      // next component
     nop
    addiu       a1, a1, 4       // ++input_buf
    bgez        t9, 0b          // next row
     addiu      a3, a3, 1       // ++output_row (delay slot)
    b           7f
     nop
4:                              // per-row loop (width multiple of 4: no residual)
    addiu       t9, t9, -1
    bltz        t9, 7f
     li         t1, 0           // t1 = ci (component index)
5:                              // per-component loop
    sll         t3, t1, 2
    lwx         t5, t3(a2)      // t5 = outptr = output_buf[ci]
    lw          t2, 0(a1)       // t2 = inptr = *input_buf
    sll         t4, a3, 2
    lwx         t5, t4(t5)      // t5 = outptr = output_buf[ci][output_row]
    addu        t2, t2, t1
    addu        s1, t5, a0      // s1 = end of output row
    addu        t6, t5, t0
6:                              // main loop: 4 samples per iteration
    lbu         t3, 0(t2)
    addu        t4, t2, s0
    addu        t7, t4, s0
    addu        t8, t7, s0
    addu        t2, t8, s0      // inptr advanced by 4 * num_components
    lbu         t4, 0(t4)
    lbu         t7, 0(t7)
    lbu         t8, 0(t8)
    addiu       t5, t5, 4
    sb          t3, -4(t5)
    sb          t4, -3(t5)
    sb          t7, -2(t5)
    bne         s1, t5, 6b
     sb         t8, -1(t5)      // fourth store issued in the delay slot
    addiu       t1, t1, 1
    bne         t1, s0, 5b      // next component
     nop
    addiu       a1, a1, 4       // ++input_buf
    bgez        t9, 4b          // next row
     addiu      a3, a3, 1       // ++output_row (delay slot)
7:
    RESTORE_REGS_FROM_STACK 8, s0, s1

    j           ra
     nop

END(jsimd_c_null_convert_dspr2)


/*****************************************************************************/
/*
 * jsimd_extrgb_ycc_convert_dspr2
 * jsimd_extbgr_ycc_convert_dspr2
 * jsimd_extrgbx_ycc_convert_dspr2
 * jsimd_extbgrx_ycc_convert_dspr2
 * jsimd_extxbgr_ycc_convert_dspr2
 * jsimd_extxrgb_ycc_convert_dspr2
 *
 * Colorspace conversion RGB -> YCbCr
 */

.macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2  colorid, pixel_size, \
                                             r_offs, g_offs, b_offs

/* Load one pixel's R, G and B samples and advance \inptr by \pixel_size. */
.macro DO_RGB_TO_YCC  r, g, b, inptr
    lbu         \r, \r_offs(\inptr)
    lbu         \g, \g_offs(\inptr)
    lbu         \b, \b_offs(\inptr)
    addiu       \inptr, \pixel_size
.endm

LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2)
/*
 * RGB -> YCbCr colorspace conversion (one function generated per RGB
 * pixel layout).  Y, Cb and Cr are accumulated in parallel on the three
 * DSP accumulators ac0/ac1/ac2 using 16.16 fixed-point coefficients.
 *
 * a0     = cinfo->image_width
 * a1     = input_buf
 * a2     = output_buf
 * a3     = output_row
 * 16(sp) = num_rows
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw          t7, 48(sp)      // t7 = num_rows (16 + 32-byte register save)
    li          s0, 0x4c8b      // FIX(0.29900)
    li          s1, 0x9646      // FIX(0.58700)
    li          s2, 0x1d2f      // FIX(0.11400)
    li          s3, 0xffffd4cd  // -FIX(0.16874)
    li          s4, 0xffffab33  // -FIX(0.33126)
    li          s5, 0x8000      // FIX(0.50000)
    li          s6, 0xffff94d1  // -FIX(0.41869)
    li          s7, 0xffffeb2f  // -FIX(0.08131)
    li          t8, 0x807fff    // CBCR_OFFSET + ONE_HALF-1

0:                              // per-row loop
    addiu       t7, -1          // --num_rows
    lw          t6, 0(a1)       // t6 = inptr = input_buf[0]
    lw          t0, 0(a2)
    lw          t1, 4(a2)
    lw          t2, 8(a2)
    sll         t3, a3, 2
    lwx         t0, t3(t0)      // t0 = output_buf[0][output_row] (Y plane)
    lwx         t1, t3(t1)      // t1 = output_buf[1][output_row] (Cb plane)
    lwx         t2, t3(t2)      // t2 = output_buf[2][output_row] (Cr plane)

    addu        t9, t2, a0      // t9 = end address
    addiu       a3, 1           // ++output_row

1:                              // per-pixel loop
    DO_RGB_TO_YCC t3, t4, t5, t6

    mtlo        s5, $ac0        // ac0 = Y, seeded with ONE_HALF for rounding
    mtlo        t8, $ac1        // ac1 = Cb, seeded with offset + rounding
    mtlo        t8, $ac2        // ac2 = Cr, seeded with offset + rounding
    maddu       $ac0, s2, t5    // Y  += FIX(0.11400) * B
    maddu       $ac1, s5, t5    // Cb += FIX(0.50000) * B
    maddu       $ac2, s5, t3    // Cr += FIX(0.50000) * R
    maddu       $ac0, s0, t3    // Y  += FIX(0.29900) * R
    maddu       $ac1, s3, t3    // Cb -= FIX(0.16874) * R
    maddu       $ac2, s6, t4    // Cr -= FIX(0.41869) * G
    maddu       $ac0, s1, t4    // Y  += FIX(0.58700) * G
    maddu       $ac1, s4, t4    // Cb -= FIX(0.33126) * G
    maddu       $ac2, s7, t5    // Cr -= FIX(0.08131) * B
    extr.w      t3, $ac0, 16    // t3 = Y  >> SCALEBITS
    extr.w      t4, $ac1, 16    // t4 = Cb >> SCALEBITS
    extr.w      t5, $ac2, 16    // t5 = Cr >> SCALEBITS
    sb          t3, 0(t0)
    sb          t4, 0(t1)
    sb          t5, 0(t2)
    addiu       t0, 1
    addiu       t2, 1
    bne         t2, t9, 1b
     addiu      t1, 1           // advance Cb pointer in the delay slot
    bgtz        t7, 0b          // next row
     addiu      a1, 4           // ++input_buf (delay slot)

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_\colorid\()_ycc_convert_dspr2)

.purgem DO_RGB_TO_YCC

.endm
228
/* Instantiate one RGB -> YCbCr routine per supported pixel layout.       */
/*-------------------------------------id -- pix R  G  B */
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb,  3, 0, 1, 2
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr,  3, 2, 1, 0
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3


/*****************************************************************************/
/*
 * jsimd_ycc_extrgb_convert_dspr2
 * jsimd_ycc_extbgr_convert_dspr2
 * jsimd_ycc_extrgbx_convert_dspr2
 * jsimd_ycc_extbgrx_convert_dspr2
 * jsimd_ycc_extxbgr_convert_dspr2
 * jsimd_ycc_extxrgb_convert_dspr2
 *
 * Colorspace conversion YCbCr -> RGB
 */

.macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2  colorid, pixel_size, \
                                             r_offs, g_offs, b_offs, a_offs

/* Store one converted pixel to \outptr and advance it by \pixel_size;
   4-byte layouts also get an opaque (0xFF) alpha byte. */
.macro STORE_YCC_TO_RGB  scratch0 scratch1 scratch2 outptr
    sb          \scratch0, \r_offs(\outptr)
    sb          \scratch1, \g_offs(\outptr)
    sb          \scratch2, \b_offs(\outptr)
.if (\pixel_size == 4)
    li          t0, 0xFF
    sb          t0, \a_offs(\outptr)
.endif
    addiu       \outptr, \pixel_size
.endm

LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2)
/*
 * YCbCr -> RGB colorspace conversion (one function generated per RGB
 * pixel layout).  Clipping to [0, 255] is done with the saturating
 * paired-halfword shift instructions instead of a range-limit table.
 *
 * a0     = cinfo->image_width
 * a1     = input_buf
 * a2     = input_row
 * a3     = output_buf
 * 16(sp) = num_rows
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw          s1, 48(sp)      // s1 = num_rows (16 + 32-byte register save)
    li          t3, 0x8000      // ONE_HALF (rounding constant)
    li          t4, 0x166e9     // FIX(1.40200)
    li          t5, 0x1c5a2     // FIX(1.77200)
    li          t6, 0xffff492e  // -FIX(0.71414)
    li          t7, 0xffffa7e6  // -FIX(0.34414)
    repl.ph     t8, 128         // t8 = 128|128 bias for paired saturation

0:                              // per-row loop
    lw          s0, 0(a3)       // s0 = outptr = *output_buf
    lw          t0, 0(a1)       // input_buf[0] (Y plane)
    lw          t1, 4(a1)       // input_buf[1] (Cb plane)
    lw          t2, 8(a1)       // input_buf[2] (Cr plane)
    sll         s5, a2, 2
    addiu       s1, -1          // --num_rows
    lwx         s2, s5(t0)      // s2 = inptr0 = input_buf[0][input_row]
    lwx         s3, s5(t1)      // s3 = inptr1 = input_buf[1][input_row]
    lwx         s4, s5(t2)      // s4 = inptr2 = input_buf[2][input_row]
    addu        t9, s2, a0      // t9 = end of Y row
    addiu       a2, 1           // ++input_row

1:                              // per-pixel loop
    lbu         s7, 0(s4)       // cr
    lbu         s6, 0(s3)       // cb
    lbu         s5, 0(s2)       // y
    addiu       s2, 1
    addiu       s4, 1
    addiu       s7, -128        // cr - 128
    addiu       s6, -128        // cb - 128
    mul         t2, t7, s6      // -FIX(0.34414) * (cb - 128)
    mul         t0, t6, s7      // Crgtab[cr]
    sll         s7, 15
    mulq_rs.w   t1, t4, s7      // Crrtab[cr]
    sll         s6, 15
    addu        t2, t3          // Cbgtab[cb] (add ONE_HALF for rounding)
    addu        t2, t0

    mulq_rs.w   t0, t5, s6      // Cbbtab[cb]
    sra         t2, 16
    addu        t1, s5          // red   = y + cred
    addu        t2, s5          // green = y + cgreen
    ins         t2, t1, 16, 16  // pack red|green as paired halfwords
    subu.ph     t2, t2, t8      // bias down by 128 so shll_s.ph saturates
    addu        t0, s5          // blue  = y + cblue
    shll_s.ph   t2, t2, 8       // saturating shift clips red/green
    subu        t0, 128         // same biasing for blue, done in 32 bits
    shra.ph     t2, t2, 8
    shll_s.w    t0, t0, 24      // saturating clip for blue
    addu.ph     t2, t2, t8      // undo bias: red/green now in [0, 255]
    sra         t0, t0, 24
    sra         t1, t2, 16      // t1 = clipped red (t2 low half = green)
    addiu       t0, 128         // t0 = clipped blue

    STORE_YCC_TO_RGB t1, t2, t0, s0

    bne         s2, t9, 1b
     addiu      s3, 1           // ++inptr1 (delay slot)
    bgtz        s1, 0b          // next row
     addiu      a3, 4           // ++output_buf (delay slot)

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_ycc_\colorid\()_convert_dspr2)

.purgem STORE_YCC_TO_RGB

.endm
343
/* Instantiate one YCbCr -> RGB routine per supported pixel layout.       */
/*-------------------------------------id -- pix R  G  B  A */
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb,  3, 0, 1, 2, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr,  3, 2, 1, 0, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0


/*****************************************************************************/
/*
 * jsimd_extrgb_gray_convert_dspr2
 * jsimd_extbgr_gray_convert_dspr2
 * jsimd_extrgbx_gray_convert_dspr2
 * jsimd_extbgrx_gray_convert_dspr2
 * jsimd_extxbgr_gray_convert_dspr2
 * jsimd_extxrgb_gray_convert_dspr2
 *
 * Colorspace conversion RGB -> GRAY
 */

.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2  colorid, pixel_size, \
                                              r_offs, g_offs, b_offs

/* Load one pixel's R, G and B samples and advance \inptr by \pixel_size. */
.macro DO_RGB_TO_GRAY  r, g, b, inptr
    lbu         \r, \r_offs(\inptr)
    lbu         \g, \g_offs(\inptr)
    lbu         \b, \b_offs(\inptr)
    addiu       \inptr, \pixel_size
.endm

LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2)
/*
 * RGB -> grayscale conversion:
 *   Y = FIX(0.29900)*R + FIX(0.58700)*G + FIX(0.11400)*B
 * The main loop handles 4 pixels per iteration, interleaved across the
 * two DSP accumulators; a residual loop handles width & 3 pixels.
 *
 * a0     = cinfo->image_width
 * a1     = input_buf
 * a2     = output_buf
 * a3     = output_row
 * 16(sp) = num_rows
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    li          s0, 0x4c8b      // s0 = FIX(0.29900)
    li          s1, 0x9646      // s1 = FIX(0.58700)
    li          s2, 0x1d2f      // s2 = FIX(0.11400)
    li          s7, 0x8000      // s7 = ONE_HALF (rounding seed)
    lw          s6, 48(sp)      // s6 = num_rows (16 + 32-byte register save)
    andi        t7, a0, 3       // t7 = residual pixels after 4x unrolling

0:                              // per-row loop
    addiu       s6, -1          // --num_rows
    lw          t0, 0(a1)       // t0 = inptr = *input_buf
    lw          t1, 0(a2)
    sll         t3, a3, 2
    lwx         t1, t3(t1)      // t1 = outptr = output_buf[0][output_row]
    addiu       a3, 1           // ++output_row
    addu        t9, t1, a0      // t9 = end of output row
    subu        t8, t9, t7      // t8 = end of the 4x-unrolled section
    beq         t1, t8, 2f      // fewer than 4 pixels: skip unrolled loop
     nop

1:                              // main loop: 4 pixels per iteration
    DO_RGB_TO_GRAY t3, t4, t5, t0
    DO_RGB_TO_GRAY s3, s4, s5, t0

    mtlo        s7, $ac0        // seed accumulators with ONE_HALF
    maddu       $ac0, s2, t5
    maddu       $ac0, s1, t4
    maddu       $ac0, s0, t3
    mtlo        s7, $ac1
    maddu       $ac1, s2, s5
    maddu       $ac1, s1, s4
    maddu       $ac1, s0, s3
    extr.w      t6, $ac0, 16    // t6 = gray value of pixel 0

    DO_RGB_TO_GRAY t3, t4, t5, t0
    DO_RGB_TO_GRAY s3, s4, s5, t0

    mtlo        s7, $ac0
    maddu       $ac0, s2, t5
    maddu       $ac0, s1, t4
    extr.w      t2, $ac1, 16    // t2 = gray value of pixel 1
    maddu       $ac0, s0, t3
    mtlo        s7, $ac1
    maddu       $ac1, s2, s5
    maddu       $ac1, s1, s4
    maddu       $ac1, s0, s3
    extr.w      t5, $ac0, 16    // t5 = gray value of pixel 2
    sb          t6, 0(t1)
    sb          t2, 1(t1)
    extr.w      t3, $ac1, 16    // t3 = gray value of pixel 3
    addiu       t1, 4
    sb          t5, -2(t1)
    sb          t3, -1(t1)
    bne         t1, t8, 1b
     nop

2:
    beqz        t7, 4f          // no residual pixels
     nop

3:                              // residual loop: one pixel per iteration
    DO_RGB_TO_GRAY t3, t4, t5, t0

    mtlo        s7, $ac0
    maddu       $ac0, s2, t5
    maddu       $ac0, s1, t4
    maddu       $ac0, s0, t3
    extr.w      t6, $ac0, 16
    sb          t6, 0(t1)
    addiu       t1, 1
    bne         t1, t9, 3b
     nop

4:
    bgtz        s6, 0b          // next row
     addiu      a1, 4           // ++input_buf (delay slot)

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_\colorid\()_gray_convert_dspr2)

.purgem DO_RGB_TO_GRAY

.endm
470
/* Instantiate one RGB -> grayscale routine per supported pixel layout.   */
/*-------------------------------------id --  pix R  G  B */
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb,  3, 0, 1, 2
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr,  3, 2, 1, 0
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3


/*****************************************************************************/
/*
 * jsimd_h2v2_merged_upsample_dspr2
 * jsimd_h2v2_extrgb_merged_upsample_dspr2
 * jsimd_h2v2_extrgbx_merged_upsample_dspr2
 * jsimd_h2v2_extbgr_merged_upsample_dspr2
 * jsimd_h2v2_extbgrx_merged_upsample_dspr2
 * jsimd_h2v2_extxbgr_merged_upsample_dspr2
 * jsimd_h2v2_extxrgb_merged_upsample_dspr2
 *
 * Merged h2v2 upsample routines
 */
.macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2  colorid, pixel_size, \
                                            r1_offs, g1_offs, \
                                            b1_offs, a1_offs, \
                                            r2_offs, g2_offs, \
                                            b2_offs, a2_offs

/* Store two horizontally-adjacent converted pixels to \outptr and advance
   it by \pixel_size (which covers both pixels here); 4-byte-per-pixel
   layouts (pixel_size == 8) also get opaque (0xFF) alpha bytes. */
.macro STORE_H2V2_2_PIXELS  scratch0 scratch1 scratch2 scratch3 scratch4 \
                            scratch5 outptr
    sb          \scratch0, \r1_offs(\outptr)
    sb          \scratch1, \g1_offs(\outptr)
    sb          \scratch2, \b1_offs(\outptr)
    sb          \scratch3, \r2_offs(\outptr)
    sb          \scratch4, \g2_offs(\outptr)
    sb          \scratch5, \b2_offs(\outptr)
.if (\pixel_size == 8)
    li          \scratch0, 0xFF
    sb          \scratch0, \a1_offs(\outptr)
    sb          \scratch0, \a2_offs(\outptr)
.endif
    addiu       \outptr, \pixel_size
.endm

/* Store a single converted pixel (trailing odd column).  Does NOT advance
   \outptr.  Clobbers t0 for 4-byte-per-pixel layouts. */
.macro STORE_H2V2_1_PIXEL  scratch0 scratch1 scratch2 outptr
    sb          \scratch0, \r1_offs(\outptr)
    sb          \scratch1, \g1_offs(\outptr)
    sb          \scratch2, \b1_offs(\outptr)

.if (\pixel_size == 8)
    li          t0, 0xFF
    sb          t0, \a1_offs(\outptr)
.endif
.endm

LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
/*
 * Merged h2v2 upsampling + YCbCr->RGB conversion: each chroma sample is
 * shared by a 2x2 block of luma samples, so two output rows are produced
 * per call.  Clipping uses the cinfo->sample_range_limit lookup table.
 *
 * a0     = cinfo->output_width
 * a1     = input_buf
 * a2     = in_row_group_ctr
 * a3     = output_buf
 * 16(sp) = cinfo->sample_range_limit
 */
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    lw          t9, 56(sp)      // t9 = sample_range_limit (16 + 40-byte save)
    lw          v0, 0(a1)
    lw          v1, 4(a1)
    lw          t0, 8(a1)
    sll         t1, a2, 3
    addiu       t2, t1, 4
    sll         t3, a2, 2
    lw          t4, 0(a3)       // t4 = output_buf[0]
    lwx         t1, t1(v0)      // t1 = input_buf[0][in_row_group_ctr*2]
    lwx         t2, t2(v0)      // t2 = input_buf[0][in_row_group_ctr*2 + 1]
    lwx         t5, t3(v1)      // t5 = input_buf[1][in_row_group_ctr] (Cb)
    lwx         t6, t3(t0)      // t6 = input_buf[2][in_row_group_ctr] (Cr)
    lw          t7, 4(a3)       // t7 = output_buf[1]
    // Constants are synthesized arithmetically; they do not fit a single
    // 16-bit immediate:
    li          s1, 0xe6ea
    addiu       t8, s1, 0x7fff    // t8 = 0x166e9 [FIX(1.40200)]
    addiu       s0, t8, 0x5eb9    // s0 = 0x1c5a2 [FIX(1.77200)]
    addiu       s1, zero, 0xa7e6  // s1 = 0xffffa7e6 [-FIX(0.34414)] (sign-ext.)
    xori        s2, s1, 0xeec8    // s2 = 0xffff492e [-FIX(0.71414)]
    srl         t3, a0, 1         // t3 = output_width / 2 (pixel pairs)
    blez        t3, 2f            // width < 2: only the odd-column path
     addu       t0, t5, t3        // t0 = end address (delay slot)
1:                                // per-pixel-pair loop (both output rows)
    lbu         t3, 0(t5)         // cb
    lbu         s3, 0(t6)         // cr
    addiu       t5, t5, 1
    addiu       t3, t3, -128    // (cb - 128)
    addiu       s3, s3, -128    // (cr - 128)
    mult        $ac1, s1, t3    // ac1 = -FIX(0.34414) * (cb - 128)
    madd        $ac1, s2, s3    //     - FIX(0.71414) * (cr - 128) -> cgreen
    sll         s3, s3, 15
    sll         t3, t3, 15
    mulq_rs.w   s4, t8, s3      // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
    extr_r.w    s5, $ac1, 16    // s5 = cgreen
    mulq_rs.w   s6, s0, t3      // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
    lbu         v0, 0(t1)       // y (row 0, even column)
    addiu       t6, t6, 1
    addiu       t1, t1, 2
    addu        t3, v0, s4      // y+cred
    addu        s3, v0, s5      // y+cgreen
    addu        v1, v0, s6      // y+cblue
    addu        t3, t9, t3      // range-limit table lookup addresses
    addu        s3, t9, s3
    addu        v1, t9, v1
    lbu         AT, 0(t3)
    lbu         s7, 0(s3)
    lbu         ra, 0(v1)
    lbu         v0, -1(t1)      // y (row 0, odd column)
    addu        t3, v0, s4      // y+cred
    addu        s3, v0, s5      // y+cgreen
    addu        v1, v0, s6      // y+cblue
    addu        t3, t9, t3
    addu        s3, t9, s3
    addu        v1, t9, v1
    lbu         t3, 0(t3)
    lbu         s3, 0(s3)
    lbu         v1, 0(v1)
    lbu         v0, 0(t2)       // y (row 1, even column), prefetched

    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4

    addu        t3, v0, s4      // y+cred
    addu        s3, v0, s5      // y+cgreen
    addu        v1, v0, s6      // y+cblue
    addu        t3, t9, t3
    addu        s3, t9, s3
    addu        v1, t9, v1
    lbu         AT, 0(t3)
    lbu         s7, 0(s3)
    lbu         ra, 0(v1)
    lbu         v0, 1(t2)       // y (row 1, odd column)
    addiu       t2, t2, 2
    addu        t3, v0, s4      // y+cred
    addu        s3, v0, s5      // y+cgreen
    addu        v1, v0, s6      // y+cblue
    addu        t3, t9, t3
    addu        s3, t9, s3
    addu        v1, t9, v1
    lbu         t3, 0(t3)
    lbu         s3, 0(s3)
    lbu         v1, 0(v1)

    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7

    bne         t0, t5, 1b
     nop
2:                              // odd output width: one trailing column
    andi        t0, a0, 1
    beqz        t0, 4f
     lbu        t3, 0(t5)       // cb (delay slot; harmless if branch taken)
    lbu         s3, 0(t6)       // cr
    addiu       t3, t3, -128    // (cb - 128)
    addiu       s3, s3, -128    // (cr - 128)
    mult        $ac1, s1, t3
    madd        $ac1, s2, s3
    sll         s3, s3, 15
    sll         t3, t3, 15
    lbu         v0, 0(t1)       // y (row 0)
    extr_r.w    s5, $ac1, 16    // s5 = cgreen
    mulq_rs.w   s4, t8, s3      // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
    mulq_rs.w   s6, s0, t3      // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
    addu        t3, v0, s4      // y+cred
    addu        s3, v0, s5      // y+cgreen
    addu        v1, v0, s6      // y+cblue
    addu        t3, t9, t3
    addu        s3, t9, s3
    addu        v1, t9, v1
    lbu         t3, 0(t3)
    lbu         s3, 0(s3)
    lbu         v1, 0(v1)
    lbu         v0, 0(t2)       // y (row 1)

    STORE_H2V2_1_PIXEL t3, s3, v1, t4

    addu        t3, v0, s4      // y+cred
    addu        s3, v0, s5      // y+cgreen
    addu        v1, v0, s6      // y+cblue
    addu        t3, t9, t3
    addu        s3, t9, s3
    addu        v1, t9, v1
    lbu         t3, 0(t3)
    lbu         s3, 0(s3)
    lbu         v1, 0(v1)

    STORE_H2V2_1_PIXEL t3, s3, v1, t7
4:
    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    j           ra
     nop

END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)

.purgem STORE_H2V2_1_PIXEL
.purgem STORE_H2V2_2_PIXELS
.endm
670
/* Instantiate one merged h2v2 routine per pixel layout.  The A1/A2 offsets
   are only used when pix == 8 (4 bytes per pixel). */
/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4


/*****************************************************************************/
/*
 * jsimd_h2v1_merged_upsample_dspr2
 * jsimd_h2v1_extrgb_merged_upsample_dspr2
 * jsimd_h2v1_extrgbx_merged_upsample_dspr2
 * jsimd_h2v1_extbgr_merged_upsample_dspr2
 * jsimd_h2v1_extbgrx_merged_upsample_dspr2
 * jsimd_h2v1_extxbgr_merged_upsample_dspr2
 * jsimd_h2v1_extxrgb_merged_upsample_dspr2
 *
 * Merged h2v1 upsample routines
 */

.macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2  colorid, pixel_size, \
                                            r1_offs, g1_offs, \
                                            b1_offs, a1_offs, \
                                            r2_offs, g2_offs, \
                                            b2_offs, a2_offs

/* Store two horizontally-adjacent converted pixels to \outptr and advance
   it by \pixel_size (which covers both pixels); 4-byte-per-pixel layouts
   (pixel_size == 8) also get opaque (0xFF) alpha bytes.  Clobbers t0 in
   that case. */
.macro STORE_H2V1_2_PIXELS  scratch0 scratch1 scratch2 scratch3 scratch4 \
                            scratch5 outptr
    sb          \scratch0, \r1_offs(\outptr)
    sb          \scratch1, \g1_offs(\outptr)
    sb          \scratch2, \b1_offs(\outptr)
    sb          \scratch3, \r2_offs(\outptr)
    sb          \scratch4, \g2_offs(\outptr)
    sb          \scratch5, \b2_offs(\outptr)
.if (\pixel_size == 8)
    li          t0, 0xFF
    sb          t0, \a1_offs(\outptr)
    sb          t0, \a2_offs(\outptr)
.endif
    addiu       \outptr, \pixel_size
.endm

/* Store a single converted pixel (trailing odd column).  Does NOT advance
   \outptr.  Clobbers t0 for 4-byte-per-pixel layouts. */
.macro STORE_H2V1_1_PIXEL  scratch0 scratch1 scratch2 outptr
    sb          \scratch0, \r1_offs(\outptr)
    sb          \scratch1, \g1_offs(\outptr)
    sb          \scratch2, \b1_offs(\outptr)
.if (\pixel_size == 8)
    li          t0, 0xFF
    sb          t0, \a1_offs(\outptr)
.endif
.endm

LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
/*
 * Merged h2v1 upsampling + YCbCr->RGB conversion: each chroma sample is
 * shared by two horizontally-adjacent luma samples.  Clipping uses the
 * range_limit lookup table.
 *
 * a0     = cinfo->output_width
 * a1     = input_buf
 * a2     = in_row_group_ctr
 * a3     = output_buf
 * 16(sp) = range_limit
 */
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    li          t0, 0xe6ea
    lw          t1, 0(a1)         // t1 = input_buf[0]
    lw          t2, 4(a1)         // t2 = input_buf[1]
    lw          t3, 8(a1)         // t3 = input_buf[2]
    lw          t8, 56(sp)        // t8 = range_limit (16 + 40-byte save)
    // Constants synthesized arithmetically (too wide for one immediate):
    addiu       s1, t0, 0x7fff    // s1 = 0x166e9 [FIX(1.40200)]
    addiu       s2, s1, 0x5eb9    // s2 = 0x1c5a2 [FIX(1.77200)]
    addiu       s0, t0, 0x9916    // s0 = 0x8000 [ONE_HALF]
    addiu       s4, zero, 0xa7e6  // s4 = 0xffffa7e6 [-FIX(0.34414)]
    xori        s3, s4, 0xeec8    // s3 = 0xffff492e [-FIX(0.71414)]
    srl         t0, a0, 1         // t0 = output_width / 2 (pixel pairs)
    sll         t4, a2, 2
    lwx         s5, t4(t1)      // s5 = inptr0 (Y)
    lwx         s6, t4(t2)      // s6 = inptr1 (Cb)
    lwx         s7, t4(t3)      // s7 = inptr2 (Cr)
    lw          t7, 0(a3)       // t7 = outptr
    blez        t0, 2f          // width < 2: only the odd-column path
     addu       t9, s6, t0      // t9 = end address (delay slot)
1:                              // per-pixel-pair loop
    lbu         t2, 0(s6)       // t2 = cb
    lbu         t0, 0(s7)       // t0 = cr
    lbu         t1, 0(s5)       // t1 = y (even column)
    addiu       t2, t2, -128    // t2 = cb - 128
    addiu       t0, t0, -128    // t0 = cr - 128
    mult        $ac1, s4, t2    // ac1 = -FIX(0.34414) * (cb - 128)
    madd        $ac1, s3, t0    //     - FIX(0.71414) * (cr - 128) -> cgreen
    sll         t0, t0, 15
    sll         t2, t2, 15
    mulq_rs.w   t0, s1, t0      // t0 = cred  = (C1*cr + ONE_HALF)>> SCALEBITS
    extr_r.w    t5, $ac1, 16    // t5 = cgreen
    mulq_rs.w   t6, s2, t2      // t6 = cblue = (C2*cb + ONE_HALF)>> SCALEBITS
    addiu       s7, s7, 1
    addiu       s6, s6, 1
    addu        t2, t1, t0      // t2 = y + cred
    addu        t3, t1, t5      // t3 = y + cgreen
    addu        t4, t1, t6      // t4 = y + cblue
    addu        t2, t8, t2      // range-limit table lookup addresses
    addu        t3, t8, t3
    addu        t4, t8, t4
    lbu         t1, 1(s5)       // y (odd column)
    lbu         v0, 0(t2)
    lbu         v1, 0(t3)
    lbu         ra, 0(t4)
    addu        t2, t1, t0
    addu        t3, t1, t5
    addu        t4, t1, t6
    addu        t2, t8, t2
    addu        t3, t8, t3
    addu        t4, t8, t4
    lbu         t2, 0(t2)
    lbu         t3, 0(t3)
    lbu         t4, 0(t4)

    STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7

    bne         t9, s6, 1b
     addiu      s5, s5, 2       // advance Y pointer in the delay slot
2:
    andi        t0, a0, 1       // odd output width?
    beqz        t0, 4f
     nop
3:                              // trailing odd column
    lbu         t2, 0(s6)
    lbu         t0, 0(s7)
    lbu         t1, 0(s5)
    addiu       t2, t2, -128    // (cb - 128)
    addiu       t0, t0, -128    // (cr - 128)
    mul         t3, s4, t2      // cgreen terms computed with plain muls here
    mul         t4, s3, t0
    sll         t0, t0, 15
    sll         t2, t2, 15
    mulq_rs.w   t0, s1, t0      // (C1*cr + ONE_HALF)>> SCALEBITS
    mulq_rs.w   t6, s2, t2      // (C2*cb + ONE_HALF)>> SCALEBITS
    addu        t3, t3, s0      // + ONE_HALF
    addu        t3, t4, t3
    sra         t5, t3, 16      // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS
    addu        t2, t1, t0      // y + cred
    addu        t3, t1, t5      // y + cgreen
    addu        t4, t1, t6      // y + cblue
    addu        t2, t8, t2      // range-limit table lookup addresses
    addu        t3, t8, t3
    addu        t4, t8, t4
    lbu         t2, 0(t2)
    lbu         t3, 0(t3)
    lbu         t4, 0(t4)

    STORE_H2V1_1_PIXEL t2, t3, t4, t7
4:
    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    j           ra
     nop

END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)

.purgem STORE_H2V1_1_PIXEL
.purgem STORE_H2V1_2_PIXELS
.endm
833
/* Instantiate one merged h2v1 routine per pixel layout.  The A1/A2 offsets
   are only used when pix == 8 (4 bytes per pixel). */
/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4


/*****************************************************************************/
/*
 * jsimd_h2v2_fancy_upsample_dspr2
 *
 * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
 */
LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2)
/*
 * Fancy (triangle-filter) h2v2 upsampling.  For each pair of input rows,
 * column sums (3 * nearer-row sample + farther-row sample) are formed and
 * then blended horizontally with 3:1 weights to produce two output
 * samples per input column; edge columns are replicated with 4:0 weight.
 *
 * a0 = cinfo->max_v_samp_factor
 * a1 = downsampled_width
 * a2 = input_data
 * a3 = output_data_ptr
 */
    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5

    li            s4, 0           // s4 = output row index * 4 (byte offset)
    lw            s2, 0(a3)       // s2 = *output_data_ptr
0:                                // per-input-row loop
    li            t9, 2           // each input row yields 2 output rows
    lw            s1, -4(a2)      // s1 = inptr1 = row above (first pass)

1:                                // per-output-row loop (t9 = 2, then 1)
    lw            s0, 0(a2)       // s0 = inptr0 = current row
    lwx           s3, s4(s2)      // s3 = outptr = output row pointer
    addiu         s5, a1, -2      // s5 = downsampled_width - 2
    srl           t4, s5, 1
    sll           t4, t4, 1       // t4 = (downsampled_width - 2) & ~1
    lbu           t0, 0(s0)
    lbu           t1, 1(s0)
    lbu           t2, 0(s1)
    lbu           t3, 1(s1)
    addiu         s0, 2
    addiu         s1, 2
    addu          t8, s0, t4      // t8 = end of the 2x-unrolled section
    andi          s5, s5, 1       // s5 = residual column count (0 or 1)
    sll           t4, t0, 1
    sll           t6, t1, 1
    addu          t0, t0, t4      // t0 = (*inptr0++) * 3
    addu          t1, t1, t6      // t1 = (*inptr0++) * 3
    addu          t7, t0, t2      // t7 = thiscolsum (column 0)
    addu          t6, t1, t3      // t6 = nextcolsum (column 1)
    // Leftmost output pair: edge column uses thiscolsum only
    sll           t0, t7, 2       // t0 = thiscolsum * 4
    subu          t1, t0, t7      // t1 = thiscolsum * 3
    shra_r.w      t0, t0, 4       // (thiscolsum * 4 + 8) >> 4
    addiu         t1, 7
    addu          t1, t1, t6
    srl           t1, t1, 4       // (thiscolsum * 3 + nextcolsum + 7) >> 4
    sb            t0, 0(s3)
    sb            t1, 1(s3)
    beq           t8, s0, 22f     // skip to final iteration if width == 3
     addiu        s3, 2
2:                                // main loop: 2 input columns -> 4 outputs
    lh            t0, 0(s0)       // t0 = A3|A2 (two inptr0 samples)
    lh            t2, 0(s1)       // t2 = B3|B2 (two inptr1 samples)
    addiu         s0, 2
    addiu         s1, 2
    preceu.ph.qbr t0, t0          // t0 = 0|A3|0|A2
    preceu.ph.qbr t2, t2          // t2 = 0|B3|0|B2
    shll.ph       t1, t0, 1
    sll           t3, t6, 1
    addu.ph       t0, t1, t0      // t0 = A3*3|A2*3
    addu          t3, t3, t6      // t3 = this * 3
    addu.ph       t0, t0, t2      // t0 = next2|next1 (new column sums)
    addu          t1, t3, t7
    andi          t7, t0, 0xFFFF  // t7 = next1
    sll           t2, t7, 1
    addu          t2, t7, t2      // t2 = next1*3
    addu          t4, t2, t6
    srl           t6, t0, 16      // t6 = next2
    shra_r.w      t1, t1, 4       // t1 = (this*3 + last + 8) >> 4
    addu          t0, t3, t7
    addiu         t0, 7
    srl           t0, t0, 4       // t0 = (this*3 + next1 + 7) >> 4
    shra_r.w      t4, t4, 4       // t4 = (next1*3 + this + 8) >> 4
    addu          t2, t2, t6
    addiu         t2, 7
    srl           t2, t2, 4       // t2 = (next1*3 + next2 + 7) >> 4
    sb            t1, 0(s3)
    sb            t0, 1(s3)
    sb            t4, 2(s3)
    sb            t2, 3(s3)
    bne           t8, s0, 2b
     addiu        s3, 4
22:                               // optional single residual column
    beqz          s5, 4f
     addu         t8, s0, s5
3:
    lbu           t0, 0(s0)
    lbu           t2, 0(s1)
    addiu         s0, 1
    addiu         s1, 1
    sll           t3, t6, 1
    sll           t1, t0, 1
    addu          t1, t0, t1      // t1 = inptr0 * 3
    addu          t3, t3, t6      // t3 = thiscolsum * 3
    addu          t5, t1, t2      // t5 = new column sum
    addu          t1, t3, t7
    shra_r.w      t1, t1, 4
    addu          t0, t3, t5
    addiu         t0, 7
    srl           t0, t0, 4
    sb            t1, 0(s3)
    sb            t0, 1(s3)
    addiu         s3, 2
    move          t7, t6          // slide the column-sum window
    bne           t8, s0, 3b
     move         t6, t5
4:                                // rightmost output pair (edge column)
    sll           t0, t6, 2       // t0 = thiscolsum * 4
    subu          t1, t0, t6      // t1 = thiscolsum * 3
    addu          t1, t1, t7
    addiu         s4, 4           // advance output row byte offset
    shra_r.w      t1, t1, 4
    addiu         t0, 7
    srl           t0, t0, 4
    sb            t1, 0(s3)
    sb            t0, 1(s3)
    addiu         t9, -1
    addiu         s3, 2
    bnez          t9, 1b          // second output row for this input row
     lw           s1, 4(a2)       // inptr1 = row below for the 2nd pass
    srl           t0, s4, 2
    subu          t0, a0, t0      // rows remaining = max_v_samp_factor - s4/4
    bgtz          t0, 0b          // next input row
     addiu        a2, 4

    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5

    j             ra
     nop
END(jsimd_h2v2_fancy_upsample_dspr2)
974
975
976/*****************************************************************************/
LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2)
/*
 * Fancy (triangle-filter) 2:1 horizontal upsampling, no vertical scaling.
 * Each input pixel expands to two outputs weighted 3:1 with its neighbors;
 * the first and last output columns duplicate the edge pixels.
 *
 * a0 = cinfo->max_v_samp_factor
 * a1 = downsampled_width
 * a2 = input_data
 * a3 = output_data_ptr
 */
    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    .set at

    beqz          a0, 3f          // no rows to process
     sll          t0, a0, 2       // t0 = max_v_samp_factor * sizeof(JSAMPROW)
    lw            s1, 0(a3)       // s1 = output_data
    li            s3, 0x10001     // s3 = packed per-halfword rounding bias |1|1|
    addu          s0, s1, t0      // s0 = end of output row-pointer array
0:
    addiu         t8, a1, -2      // t8 = downsampled_width - 2 (interior pixels)
    srl           t9, t8, 2       // t9 = number of 4-pixel SIMD iterations
    lw            t7, 0(a2)       // t7 = inptr = *input_data
    lw            s2, 0(s1)       // s2 = outptr = *output_data
    lbu           t0, 0(t7)       // t0 = inptr[0]
    lbu           t1, 1(t7)       // t1 = inptr[1]
    sll           t2, t0, 1
    addu          t2, t2, t0      // t2 = invalue*3
    addu          t2, t2, t1
    shra_r.w      t2, t2, 2       // t2 = (invalue*3 + inptr[1] + 2) >> 2
    sb            t0, 0(s2)       // first column: duplicate edge pixel
    sb            t2, 1(s2)
    beqz          t9, 11f         // fewer than 4 interior pixels: scalar only
     addiu        s2, 2
1:
/* Main SIMD loop: 4 input pixels -> 8 output pixels per iteration */
    ulw           t0, 0(t7)       // t0 = |P3|P2|P1|P0|
    ulw           t1, 1(t7)
    ulh           t2, 4(t7)       // t2 = |0|0|P5|P4|
    preceu.ph.qbl t3, t0          // t3 = |0|P3|0|P2|
    preceu.ph.qbr t0, t0          // t0 = |0|P1|0|P0|
    preceu.ph.qbr t2, t2          // t2 = |0|P5|0|P4|
    preceu.ph.qbl t4, t1          // t4 = |0|P4|0|P3|
    preceu.ph.qbr t1, t1          // t1 = |0|P2|0|P1|
    shll.ph       t5, t4, 1
    shll.ph       t6, t1, 1
    addu.ph       t5, t5, t4      // t5 = |P4*3|P3*3|
    addu.ph       t6, t6, t1      // t6 = |P2*3|P1*3|
    addu.ph       t4, t3, s3      // add +1 bias for the left-hand members
    addu.ph       t0, t0, s3
    addu.ph       t4, t4, t5
    addu.ph       t0, t0, t6
    shrl.ph       t4, t4, 2       // t4 = |0|P3|0|P2| left output members
    shrl.ph       t0, t0, 2       // t0 = |0|P1|0|P0| left output members
    addu.ph       t2, t2, t5
    addu.ph       t3, t3, t6
    shra_r.ph     t2, t2, 2       // t2 = |0|P5|0|P4| right members, +2 rounding
    shra_r.ph     t3, t3, 2       // t3 = |0|P3|0|P2| right members, +2 rounding
    shll.ph       t2, t2, 8
    shll.ph       t3, t3, 8
    or            t2, t4, t2      // interleave left/right members into bytes
    or            t3, t3, t0
    addiu         t9, -1
    usw           t3, 0(s2)       // store 8 output pixels
    usw           t2, 4(s2)
    addiu         s2, 8
    bgtz          t9, 1b
     addiu        t7, 4
11:
    andi          t8, 3           // t8 = residual interior pixels (0..3)
    beqz          t8, 22f
     addiu        t7, 1

2:
/* Scalar loop for the residual interior pixels */
    lbu           t0, 0(t7)
    addiu         t7, 1
    sll           t1, t0, 1
    addu          t2, t0, t1      // t2 = invalue*3
    lbu           t3, -2(t7)      // t3 = left neighbor
    lbu           t4, 0(t7)       // t4 = right neighbor
    addiu         t3, 1           // +1 bias for left output member
    addiu         t4, 2           // +2 rounding for right output member
    addu          t3, t3, t2
    addu          t4, t4, t2
    srl           t3, 2
    srl           t4, 2
    sb            t3, 0(s2)
    sb            t4, 1(s2)
    addiu         t8, -1
    bgtz          t8, 2b
     addiu        s2, 2

22:
/* Special case for last column: duplicate the edge pixel */
    lbu           t0, 0(t7)       // t0 = last input pixel
    lbu           t2, -1(t7)      // t2 = its left neighbor
    sll           t1, t0, 1
    addu          t1, t1, t0      // t1 = invalue * 3
    addu          t1, t1, t2
    addiu         t1, 1
    srl           t1, t1, 2       // t1 = (invalue*3 + left + 1) >> 2
    sb            t1, 0(s2)
    sb            t0, 1(s2)       // last column: edge pixel unchanged
    addiu         s1, 4           // advance to next output row pointer
    bne           s1, s0, 0b
     addiu        a2, 4           // advance to next input row pointer
3:
    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j             ra
     nop
END(jsimd_h2v1_fancy_upsample_dspr2)
1084
1085
1086/*****************************************************************************/
LEAF_DSPR2(jsimd_h2v1_downsample_dspr2)
/*
 * 2:1 horizontal downsampling, no vertical scaling.  Each output pixel is
 * the average of two adjacent input pixels with an alternating rounding
 * bias (0,1,0,1,...); the row is then padded out to
 * output_cols = width_in_blocks * DCTSIZE by replicating the last value.
 *
 * a0     = cinfo->image_width
 * a1     = cinfo->max_v_samp_factor
 * a2     = compptr->v_samp_factor
 * a3     = compptr->width_in_blocks
 * 16(sp) = input_data       (40(sp) after SAVE_REGS_ON_STACK 24)
 * 20(sp) = output_data      (44(sp) after SAVE_REGS_ON_STACK 24)
 */
    .set at

    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4

    beqz        a2, 7f          // no component rows to produce
     lw         s1, 44(sp)      // s1 = output_data
    lw          s0, 40(sp)      // s0 = input_data
    srl         s2, a0, 2
    andi        t9, a0, 2       // t9 != 0: one extra odd output column
    srl         t7, t9, 1
    addu        s2, t7, s2      // s2 = ceil(image_width / 2) rounded per above
    sll         t0, a3, 3       // t0 = output_cols = width_in_blocks * DCTSIZE
    srl         t7, t0, 1
    subu        s2, t7, s2      // s2 = padding column pairs to replicate
                                // NOTE(review): s2/t9 are set up once here but
                                // consumed inside the row loop below — confirm
                                // intended behavior for v_samp_factor > 1
0:
    andi        t6, a0, 1       // t6 = temp_index (offset of last input pixel)
    addiu       t6, -1
    lw          t4, 0(s1)       // t4 = outptr
    lw          t5, 0(s0)       // t5 = inptr0
    li          s3, 0           // s3 = bias (alternates 0/1)
    srl         t7, a0, 1       // t7 = image_width1 (output pixels per row)
    srl         s4, t7, 2       // s4 = 4-output-pixel iterations
    andi        t8, t7, 3       // t8 = residual output pixels
1:
/* 4 output pixels (8 input pixels) per iteration; shra.ph truncates
 * (bias 0) and shra_r.ph rounds (bias 1), giving the 0,1,0,1 pattern */
    ulhu        t0, 0(t5)
    ulhu        t1, 2(t5)
    ulhu        t2, 4(t5)
    ulhu        t3, 6(t5)
    raddu.w.qb  t0, t0          // sum the two bytes of each pair
    raddu.w.qb  t1, t1
    raddu.w.qb  t2, t2
    raddu.w.qb  t3, t3
    shra.ph     t0, t0, 1
    shra_r.ph   t1, t1, 1
    shra.ph     t2, t2, 1
    shra_r.ph   t3, t3, 1
    sb          t0, 0(t4)
    sb          t1, 1(t4)
    sb          t2, 2(t4)
    sb          t3, 3(t4)
    addiu       s4, -1
    addiu       t4, 4
    bgtz        s4, 1b
     addiu      t5, 8
    beqz        t8, 3f
     addu       s4, t4, t8      // s4 = end address for residual loop
2:
/* Residual output pixels, one at a time with alternating bias */
    ulhu        t0, 0(t5)
    raddu.w.qb  t0, t0
    addqh.w     t0, t0, s3      // t0 = (sum + bias) >> 1
    xori        s3, s3, 1       // toggle bias 0 <-> 1
    sb          t0, 0(t4)
    addiu       t4, 1
    bne         t4, s4, 2b
     addiu      t5, 2
3:
/* Compute the replicated edge value for the padding columns */
    lbux        t1, t6(t5)      // t1 = last valid input pixel
    sll         t1, 1
    addqh.w     t2, t1, s3      // t2 = pixval1
    xori        s3, s3, 1
    addqh.w     t3, t1, s3      // t3 = pixval2
    blez        s2, 5f          // no padding needed
     append     t3, t2,  8      // pack |pixval2|pixval1| into halfword
    addu        t5, t4, s2      // t5 = loop_end2
4:
/* Replicate the edge pair across the padding columns */
    ush         t3, 0(t4)
    addiu       s2, -1
    bgtz        s2, 4b
     addiu      t4,  2
5:
    beqz        t9, 6f          // odd output width: one extra edge pixel
     nop
    sb          t2, 0(t4)
6:
    addiu       s1, 4           // next output row pointer
    addiu       a2, -1
    bnez        a2, 0b
     addiu      s0, 4           // next input row pointer
7:
    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4

    j           ra
    nop
END(jsimd_h2v1_downsample_dspr2)
1180
1181
1182/*****************************************************************************/
LEAF_DSPR2(jsimd_h2v2_downsample_dspr2)
/*
 * 2:1 horizontal and 2:1 vertical downsampling.  Each output pixel is the
 * average of a 2x2 input block with an alternating rounding bias (1,2,1,2);
 * the row is padded out to output_cols = width_in_blocks * DCTSIZE by
 * replicating the last value.
 *
 * a0     = cinfo->image_width
 * a1     = cinfo->max_v_samp_factor
 * a2     = compptr->v_samp_factor
 * a3     = compptr->width_in_blocks
 * 16(sp) = input_data       (48(sp) after SAVE_REGS_ON_STACK 32)
 * 20(sp) = output_data      (52(sp) after SAVE_REGS_ON_STACK 32)
 */
    .set at

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    beqz        a2, 8f          // no component rows to produce
     lw         s1, 52(sp)      // s1 = output_data
    lw          s0, 48(sp)      // s0 = input_data

    andi        t6, a0, 1       // t6 = temp_index (offset of last input pixel)
    addiu       t6, -1
    srl         t7, a0, 1       // t7 = image_width1 (output pixels per row)
    srl         s4, t7, 2       // s4 = 4-output-pixel iterations
    andi        t8, t7, 3       // t8 = residual output pixels
    andi        t9, a0, 2       // t9 != 0: one extra odd output column
    srl         s2, a0, 2
    srl         t7, t9, 1
    addu        s2, t7, s2
    sll         t0, a3, 3       // t0 = output_cols = width_in_blocks * DCTSIZE
    srl         t7, t0, 1
    subu        s2, t7, s2      // s2 = padding column pairs to replicate
                                // NOTE(review): s4/t8/s2/t9 are set up once here
                                // but consumed inside the row loop — confirm
                                // intended behavior for v_samp_factor > 1
0:
    lw          t4, 0(s1)       // t4 = outptr
    lw          t5, 0(s0)       // t5 = inptr0
    lw          s7, 4(s0)       // s7 = inptr1 (next input row)
    li          s6, 1           // s6 = bias (alternates 1/2 via xor 3)
2:
/* 4 output pixels (two 2x4 input blocks) per iteration */
    ulw         t0, 0(t5)       // t0 = |P3|P2|P1|P0|
    ulw         t1, 0(s7)       // t1 = |Q3|Q2|Q1|Q0|
    ulw         t2, 4(t5)
    ulw         t3, 4(s7)
    precrq.ph.w t7, t0, t1      // t7 = |P3|P2|Q3|Q2|
    ins         t0, t1, 16, 16  // t0 = |Q1|Q0|P1|P0|
    raddu.w.qb  t1, t7          // sum of the 2x2 block P2,P3,Q2,Q3
    raddu.w.qb  t0, t0          // sum of the 2x2 block P0,P1,Q0,Q1
    shra_r.w    t1, t1, 2       // bias 2 (round)
    addiu       t0, 1           // bias 1
    srl         t0, 2
    precrq.ph.w t7, t2, t3
    ins         t2, t3, 16, 16
    raddu.w.qb  t7, t7
    raddu.w.qb  t2, t2
    shra_r.w    t7, t7, 2       // bias 2 (round)
    addiu       t2, 1           // bias 1
    srl         t2, 2
    sb          t0, 0(t4)
    sb          t1, 1(t4)
    sb          t2, 2(t4)
    sb          t7, 3(t4)
    addiu       t4, 4
    addiu       t5, 8
    addiu       s4, s4, -1
    bgtz        s4, 2b
     addiu      s7, 8
    beqz        t8, 4f
     addu       t8, t4, t8      // t8 = end address for residual loop
3:
/* Residual output pixels, one 2x2 block at a time */
    ulhu        t0, 0(t5)
    ulhu        t1, 0(s7)
    ins         t0, t1, 16, 16
    raddu.w.qb  t0, t0          // 2x2 block sum
    addu        t0, t0, s6      // add alternating bias
    srl         t0, 2
    xori        s6, s6, 3       // toggle bias 1 <-> 2
    sb          t0, 0(t4)
    addiu       t5, 2
    addiu       t4, 1
    bne         t8, t4, 3b
     addiu      s7, 2
4:
/* Compute the replicated edge values for the padding columns */
    lbux        t1, t6(t5)      // last valid pixel, row 0
    sll         t1, 1
    lbux        t0, t6(s7)      // last valid pixel, row 1
    sll         t0, 1
    addu        t1, t1, t0      // 2x2 block sum with column duplicated
    addu        t3, t1, s6
    srl         t0, t3, 2       // t0 = pixval1
    xori        s6, s6, 3
    addu        t2, t1, s6
    srl         t1, t2, 2       // t1 = pixval2
    blez        s2, 6f          // no padding needed
     append     t1, t0, 8       // pack |pixval2|pixval1| into halfword
5:
/* Replicate the edge pair across the padding columns */
    ush         t1, 0(t4)
    addiu       s2, -1
    bgtz        s2, 5b
     addiu      t4, 2
6:
    beqz        t9, 7f          // odd output width: one extra edge pixel
     nop
    sb          t0, 0(t4)
7:
    addiu       s1, 4           // next output row pointer
    addiu       a2, -1
    bnez        a2, 0b
     addiu      s0, 8           // skip two input rows
8:
    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_h2v2_downsample_dspr2)
1293
1294
1295/*****************************************************************************/
LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2)
/*
 * 2:1 horizontal / 2:1 vertical downsampling with inter-pixel smoothing.
 * Each output pixel blends the sum of its 2x2 input block ("membersum")
 * with the sum of the surrounding ring of neighbor pixels ("neighsum"):
 *   out = (membersum * (16384 - smoothing_factor*80)
 *          + neighsum * (smoothing_factor*16) + rounding) >> 16
 * computed on accumulator $ac1 with extr_r.w for the rounded shift.
 *
 * a0     = input_data
 * a1     = output_data
 * a2     = compptr->v_samp_factor
 * a3     = cinfo->max_v_samp_factor
 * 16(sp) = cinfo->smoothing_factor   (48(sp) after SAVE_REGS_ON_STACK 32)
 * 20(sp) = compptr->width_in_blocks  (52(sp) after SAVE_REGS_ON_STACK 32)
 * 24(sp) = cinfo->image_width        (56(sp) after SAVE_REGS_ON_STACK 32)
 */
    .set at

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw          s7, 52(sp)      // compptr->width_in_blocks
    lw          s0, 56(sp)      // cinfo->image_width
    lw          s6, 48(sp)      // cinfo->smoothing_factor
    sll         s7, 3           // output_cols = width_in_blocks * DCTSIZE
    sll         v0, s7, 1
    subu        v0, v0, s0      // v0 = 2*output_cols - image_width (pad count)
    blez        v0, 2f          // rows already wide enough
    move        v1, zero        // v1 = row index, starting at row -1
    addiu       t0, a3, 2       // t0 = cinfo->max_v_samp_factor + 2 rows to pad
0:
/* Edge-expand each row (rows -1 .. max_v_samp_factor) out to 2*output_cols
 * by replicating the last pixel */
    addiu       t1, a0, -4      // start from input_data[-1]
    sll         t2, v1, 2
    lwx         t1, t2(t1)      // t1 = input_data[v1 - 1]
    move        t3, v0          // t3 = pixels to replicate
    addu        t1, t1, s0      // t1 = end of valid pixels in this row
    lbu         t2, -1(t1)      // t2 = last valid pixel
1:
    addiu       t3, t3, -1
    sb          t2, 0(t1)
    bgtz        t3, 1b
    addiu       t1, t1, 1
    addiu       v1, v1, 1
    bne         v1, t0, 0b
    nop
2:
/* Precompute the two blending weights */
    li          v0, 80
    mul         v0, s6, v0
    li          v1, 16384
    move        t4, zero        // t4 = outrow
    move        t5, zero        // t5 = inrow index
    subu        t6, v1, v0      // t6 = 16384 - tmp_smoot_f * 80
    sll         t7, s6, 4       // t7 = tmp_smoot_f * 16
3:
/* Special case for first column: pretend column -1 is same as column 0 */
    sll         v0, t4, 2
    lwx         t8, v0(a1)      //  outptr = output_data[outrow]
    sll         v1, t5, 2
    addiu       t9, v1, 4
    addiu       s0, v1, -4
    addiu       s1, v1, 8
    lwx         s2, v1(a0)      // inptr0 = input_data[inrow]
    lwx         t9, t9(a0)      // inptr1 = input_data[inrow+1]
    lwx         s0, s0(a0)      // above_ptr = input_data[inrow-1]
    lwx         s1, s1(a0)      // below_ptr = input_data[inrow+2]
    lh          v0, 0(s2)       // two pixels from inptr0
    lh          v1, 0(t9)       // two pixels from inptr1
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16  // pack the 2x2 member block into one word
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0          // t2 = membersum (sum of 2x2 block)
    raddu.w.qb  s3, t0          // s3 = above+below column sums
    lbu         v0, 0(s2)       // column 0 duplicated as column -1
    lbu         v1, 2(s2)
    lbu         t0, 0(t9)
    lbu         t1, 2(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6    // ac1 = membersum * (16384 - sf*80)
    addu        t0, t0, t1
    lbu         t2, 2(s0)
    addu        t0, t0, v0
    lbu         t3, 2(s1)
    addu        s3, t0, s3
    lbu         v0, 0(s0)
    lbu         t0, 0(s1)
    sll         s3, s3, 1       // double: the duplicated edge counts twice
    addu        v0, v0, t2
    addu        t0, t0, t3
    addu        t0, t0, v0
    addu        s3, t0, s3      // s3 = neighsum
    madd        $ac1, s3, t7    // ac1 += neighsum * (sf*16)
    extr_r.w    v0, $ac1, 16    // rounded >> 16
    addiu       t8, t8, 1
    addiu       s2, s2, 2
    addiu       t9, t9, 2
    addiu       s0, s0, 2
    addiu       s1, s1, 2
    sb          v0, -1(t8)
    addiu       s4, s7, -2
    and         s4, s4, 3       // s4 = residual interior columns (mod 4)
    addu        s5, s4, t8      // end address
4:
/* Interior columns, one at a time, until the count is a multiple of 4 */
    lh          v0, 0(s2)
    lh          v1, 0(t9)
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0          // t2 = membersum
    raddu.w.qb  s3, t0
    lbu         v0, -1(s2)
    lbu         v1, 2(s2)
    lbu         t0, -1(t9)
    lbu         t1, 2(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6
    addu        t0, t0, t1
    lbu         t2, 2(s0)
    addu        t0, t0, v0
    lbu         t3, 2(s1)
    addu        s3, t0, s3
    lbu         v0, -1(s0)
    lbu         t0, -1(s1)
    sll         s3, s3, 1
    addu        v0, v0, t2
    addu        t0, t0, t3
    addu        t0, t0, v0
    addu        s3, t0, s3      // s3 = neighsum
    madd        $ac1, s3, t7
    extr_r.w    t2, $ac1, 16
    addiu       t8, t8, 1
    addiu       s2, s2, 2
    addiu       t9, t9, 2
    addiu       s0, s0, 2
    sb          t2, -1(t8)
    bne         s5, t8, 4b
    addiu       s1, s1, 2
    addiu       s5, s7, -2
    subu        s5, s5, s4      // remaining interior columns (multiple of 4)
    addu        s5, s5, t8      // end address
5:
/* Main unrolled loop: 4 interior output columns per iteration, with loads
 * for the next column interleaved into the accumulator latency */
    lh          v0, 0(s2)
    lh          v1, 0(t9)
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0
    raddu.w.qb  s3, t0
    lbu         v0, -1(s2)
    lbu         v1, 2(s2)
    lbu         t0, -1(t9)
    lbu         t1, 2(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6
    addu        t0, t0, t1
    lbu         t2, 2(s0)
    addu        t0, t0, v0
    lbu         t3, 2(s1)
    addu        s3, t0, s3
    lbu         v0, -1(s0)
    lbu         t0, -1(s1)
    sll         s3, s3, 1
    addu        v0, v0, t2
    addu        t0, t0, t3
    lh          v1, 2(t9)
    addu        t0, t0, v0
    lh          v0, 2(s2)
    addu        s3, t0, s3
    lh          t0, 2(s0)
    lh          t1, 2(s1)
    madd        $ac1, s3, t7
    extr_r.w    t2, $ac1, 16
    ins         t0, t1, 16, 16
    ins         v0, v1, 16, 16
    raddu.w.qb  s3, t0
    lbu         v1, 4(s2)
    lbu         t0, 1(t9)
    lbu         t1, 4(t9)
    sb          t2, 0(t8)       // store column 0 of this group
    raddu.w.qb  t3, v0
    lbu         v0, 1(s2)
    addu        t0, t0, t1
    mult        $ac1, t3, t6
    addu        v0, v0, v1
    lbu         t2, 4(s0)
    addu        t0, t0, v0
    lbu         v0, 1(s0)
    addu        s3, t0, s3
    lbu         t0, 1(s1)
    lbu         t3, 4(s1)
    addu        v0, v0, t2
    sll         s3, s3, 1
    addu        t0, t0, t3
    lh          v1, 4(t9)
    addu        t0, t0, v0
    lh          v0, 4(s2)
    addu        s3, t0, s3
    lh          t0, 4(s0)
    lh          t1, 4(s1)
    madd        $ac1, s3, t7
    extr_r.w    t2, $ac1, 16
    ins         t0, t1, 16, 16
    ins         v0, v1, 16, 16
    raddu.w.qb  s3, t0
    lbu         v1, 6(s2)
    lbu         t0, 3(t9)
    lbu         t1, 6(t9)
    sb          t2, 1(t8)       // store column 1 of this group
    raddu.w.qb  t3, v0
    lbu         v0, 3(s2)
    addu        t0, t0, t1
    mult        $ac1, t3, t6
    addu        v0, v0, v1
    lbu         t2, 6(s0)
    addu        t0, t0, v0
    lbu         v0, 3(s0)
    addu        s3, t0, s3
    lbu         t0, 3(s1)
    lbu         t3, 6(s1)
    addu        v0, v0, t2
    sll         s3, s3, 1
    addu        t0, t0, t3
    lh          v1, 6(t9)
    addu        t0, t0, v0
    lh          v0, 6(s2)
    addu        s3, t0, s3
    lh          t0, 6(s0)
    lh          t1, 6(s1)
    madd        $ac1, s3, t7
    extr_r.w    t3, $ac1, 16
    ins         t0, t1, 16, 16
    ins         v0, v1, 16, 16
    raddu.w.qb  s3, t0
    lbu         v1, 8(s2)
    lbu         t0, 5(t9)
    lbu         t1, 8(t9)
    sb          t3, 2(t8)       // store column 2 of this group
    raddu.w.qb  t2, v0
    lbu         v0, 5(s2)
    addu        t0, t0, t1
    mult        $ac1, t2, t6
    addu        v0, v0, v1
    lbu         t2, 8(s0)
    addu        t0, t0, v0
    lbu         v0, 5(s0)
    addu        s3, t0, s3
    lbu         t0, 5(s1)
    lbu         t3, 8(s1)
    addu        v0, v0, t2
    sll         s3, s3, 1
    addu        t0, t0, t3
    addiu       t8, t8, 4       // advance pointers for the next group
    addu        t0, t0, v0
    addiu       s2, s2, 8
    addu        s3, t0, s3
    addiu       t9, t9, 8
    madd        $ac1, s3, t7
    extr_r.w    t1, $ac1, 16
    addiu       s0, s0, 8
    addiu       s1, s1, 8
    bne         s5, t8, 5b
    sb          t1, -1(t8)      // store column 3 of this group
/* Special case for last column */
    lh          v0, 0(s2)
    lh          v1, 0(t9)
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0          // t2 = membersum
    raddu.w.qb  s3, t0
    lbu         v0, -1(s2)
    lbu         v1, 1(s2)       // last column duplicated as its right neighbor
    lbu         t0, -1(t9)
    lbu         t1, 1(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6
    addu        t0, t0, t1
    lbu         t2, 1(s0)
    addu        t0, t0, v0
    lbu         t3, 1(s1)
    addu        s3, t0, s3
    lbu         v0, -1(s0)
    lbu         t0, -1(s1)
    sll         s3, s3, 1
    addu        v0, v0, t2
    addu        t0, t0, t3
    addu        t0, t0, v0
    addu        s3, t0, s3      // s3 = neighsum
    madd        $ac1, s3, t7
    extr_r.w    t0, $ac1, 16
    addiu       t5, t5, 2       // NOTE(review): t5 (inrow) is incremented here
                                // AND in the branch delay slot below, i.e. by 4
                                // per output row — confirm this is intended
    sb          t0, 0(t8)
    addiu       t4, t4, 1       // next output row
    bne         t4, a2, 3b
    addiu       t5, t5, 2

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop

END(jsimd_h2v2_smooth_downsample_dspr2)
1594
1595
1596/*****************************************************************************/
LEAF_DSPR2(jsimd_int_upsample_dspr2)
/*
 * Integer (pixel-replication) upsampling by arbitrary h_expand/v_expand
 * factors: each input pixel is written h_expand times, then the completed
 * output row is copied v_expand-1 more times.
 *
 * a0     = upsample->h_expand[compptr->component_index]
 * a1     = upsample->v_expand[compptr->component_index]
 * a2     = input_data
 * a3     = output_data_ptr
 * 16(sp) = cinfo->output_width       (32(sp) after SAVE_REGS_ON_STACK 16)
 * 20(sp) = cinfo->max_v_samp_factor  (36(sp) after SAVE_REGS_ON_STACK 16)
 */
    .set at

    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    lw          s0, 0(a3)       // s0 = output_data
    lw          s1, 32(sp)      // s1 = cinfo->output_width
    lw          s2, 36(sp)      // s2 = cinfo->max_v_samp_factor
    li          t6, 0           // t6 = inrow
    beqz        s2, 10f         // no rows to produce
     li         s3, 0           // s3 = outrow
0:
/* NOTE(review): t6/s3 are used directly as byte offsets into the
 * row-pointer arrays here — confirm the index scaling against the caller */
    addu        t0, a2, t6
    addu        t7, s0, s3
    lw          t3, 0(t0)       // t3 = inptr
    lw          t8, 0(t7)       // t8 = outptr
    beqz        s1, 4f          // zero-width row: nothing to replicate
     addu       t5, t8, s1      // t5 = outend
1:
    lb          t2, 0(t3)       // t2 = invalue = *inptr++
    addiu       t3, 1
    beqz        a0, 3f          // h_expand == 0: emit nothing for this pixel
     move       t0, a0          // t0 = h_expand
2:
/* Write the pixel h_expand times */
    sb          t2, 0(t8)
    addiu       t0, -1
    bgtz        t0, 2b
     addiu      t8, 1
3:
    bgt         t5, t8, 1b      // until the output row is full
     nop
4:
    addiu       t9, a1, -1      // t9 = v_expand - 1 row copies to make
    blez        t9, 9f
     nop
5:
/* Duplicate the completed row into the next output row */
    lw          t3, 0(s0)       // t3 = source row
    lw          t4, 4(s0)       // t4 = destination row
    subu        t0, s1, 0xF
    blez        t0, 7f          // fewer than 16 pixels: byte copy only
     addu       t5, t3, s1      // t5 = end address
    andi        t7, s1, 0xF     // t7 = residual
    subu        t8, t5, t7      // t8 = end of the 16-byte copy region
6:
/* Copy 16 bytes per iteration (unaligned-safe) */
    ulw         t0, 0(t3)
    ulw         t1, 4(t3)
    ulw         t2, 8(t3)
    usw         t0, 0(t4)
    ulw         t0, 12(t3)
    usw         t1, 4(t4)
    usw         t2, 8(t4)
    usw         t0, 12(t4)
    addiu       t3, 16
    bne         t3, t8, 6b
     addiu      t4, 16
    beqz        t7, 8f          // no residual bytes
     nop
7:
/* Byte-at-a-time copy for the residual */
    lbu         t0, 0(t3)
    sb          t0, 0(t4)
    addiu       t3, 1
    bne         t3, t5, 7b
     addiu      t4, 1
8:
    addiu       t9, -1
    bgtz        t9, 5b
     addiu      s0, 8
9:
    addu        s3, s3, a1      // outrow += v_expand
    bne         s3, s2, 0b
     addiu      t6, 1           // inrow++
10:
    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j           ra
     nop
END(jsimd_int_upsample_dspr2)
1682
1683
1684/*****************************************************************************/
LEAF_DSPR2(jsimd_h2v1_upsample_dspr2)
/*
 * Simple (pixel-doubling) 1:2 horizontal upsampling, no vertical scaling.
 * Each input pixel is written twice; 8 input pixels -> 16 output pixels
 * per SIMD iteration.
 *
 * a0 = cinfo->max_v_samp_factor
 * a1 = cinfo->output_width
 * a2 = input_data
 * a3 = output_data_ptr
 */
    lw          t7, 0(a3)       // t7 = output_data
    andi        t8, a1, 0xf     // t8 = residual
    sll         t0, a0, 2
    blez        a0, 4f          // no rows to process
     addu       t9, t7, t0      // t9 = output_data end address
0:
    lw          t5, 0(t7)       // t5 = outptr
    lw          t6, 0(a2)       // t6 = inptr
    addu        t3, t5, a1      // t3 = outptr + output_width (end address)
    subu        t3, t8          // t3 = end address - residual
    beq         t5, t3, 2f      // row shorter than 16: residual loop only
     move       t4, t8
1:
/* Double 8 input pixels into 16 output pixels per iteration */
    ulw         t0, 0(t6)       // t0 = |P3|P2|P1|P0|
    ulw         t2, 4(t6)       // t2 = |P7|P6|P5|P4|
    srl         t1, t0, 16      // t1 = |X|X|P3|P2|
    ins         t0, t0, 16, 16  // t0 = |P1|P0|P1|P0|
    ins         t1, t1, 16, 16  // t1 = |P3|P2|P3|P2|
    ins         t0, t0, 8, 16   // t0 = |P1|P1|P0|P0|
    ins         t1, t1, 8, 16   // t1 = |P3|P3|P2|P2|
    usw         t0, 0(t5)
    usw         t1, 4(t5)
    srl         t0, t2, 16      // t0 = |X|X|P7|P6|
    ins         t2, t2, 16, 16  // t2 = |P5|P4|P5|P4|
    ins         t0, t0, 16, 16  // t0 = |P7|P6|P7|P6|
    ins         t2, t2, 8, 16   // t2 = |P5|P5|P4|P4|
    ins         t0, t0, 8, 16   // t0 = |P7|P7|P6|P6|
    usw         t2, 8(t5)
    usw         t0, 12(t5)
    addiu       t5, 16
    bne         t5, t3, 1b
     addiu      t6, 8
    beqz        t8, 3f          // no residual pixels
     move       t4, t8
2:
/* Residual: double one pixel at a time */
    lbu         t1, 0(t6)
    sb          t1, 0(t5)
    sb          t1, 1(t5)
    addiu       t4, -2
    addiu       t6, 1
    bgtz        t4, 2b
     addiu      t5, 2
3:
    addiu       t7, 4           // next output row pointer
    bne         t9, t7, 0b
     addiu      a2, 4           // next input row pointer
4:
    j           ra
     nop
END(jsimd_h2v1_upsample_dspr2)
1742
1743
1744/*****************************************************************************/
LEAF_DSPR2(jsimd_h2v2_upsample_dspr2)
/*
 * Simple (pixel-doubling) 1:2 horizontal and 1:2 vertical upsampling.
 * Each input pixel is written twice horizontally, then the completed output
 * row is copied into the following output row; two output rows are produced
 * per iteration of the outer loop.
 *
 * a0 = cinfo->max_v_samp_factor
 * a1 = cinfo->output_width
 * a2 = input_data
 * a3 = output_data_ptr
 */
    lw          t7, 0(a3)       // t7 = output_data
    blez        a0, 7f          // no rows to process
     andi       t9, a1, 0xf     // t9 = residual
0:
    lw          t6, 0(a2)       // t6 = inptr
    lw          t5, 0(t7)       // t5 = outptr
    addu        t8, t5, a1      // t8 = outptr end address
    subu        t8, t9          // t8 = end address - residual
    beq         t5, t8, 2f      // row shorter than 16: residual loop only
     move       t4, t9
1:
/* Double 8 input pixels into 16 output pixels per iteration */
    ulw         t0, 0(t6)       // t0 = |P3|P2|P1|P0|
    srl         t1, t0, 16      // t1 = |X|X|P3|P2|
    ins         t0, t0, 16, 16  // t0 = |P1|P0|P1|P0|
    ins         t0, t0, 8, 16   // t0 = |P1|P1|P0|P0|
    ins         t1, t1, 16, 16  // t1 = |P3|P2|P3|P2|
    ins         t1, t1, 8, 16   // t1 = |P3|P3|P2|P2|
    ulw         t2, 4(t6)       // t2 = |P7|P6|P5|P4|
    usw         t0, 0(t5)
    usw         t1, 4(t5)
    srl         t3, t2, 16      // t3 = |X|X|P7|P6|
    ins         t2, t2, 16, 16  // t2 = |P5|P4|P5|P4|
    ins         t2, t2, 8, 16   // t2 = |P5|P5|P4|P4|
    ins         t3, t3, 16, 16  // t3 = |P7|P6|P7|P6|
    ins         t3, t3, 8, 16   // t3 = |P7|P7|P6|P6|
    usw         t2, 8(t5)
    usw         t3, 12(t5)
    addiu       t5, 16
    bne         t5, t8, 1b
     addiu      t6, 8
    beqz        t9, 3f          // no residual pixels
     move       t4, t9
2:
/* Residual: double one pixel at a time */
    lbu         t0, 0(t6)
    sb          t0, 0(t5)
    sb          t0, 1(t5)
    addiu       t4, -2
    addiu       t6, 1
    bgtz        t4, 2b
     addiu      t5, 2
3:
/* Duplicate the completed row into the second output row */
    lw          t6, 0(t7)       // t6 = outptr[0]
    lw          t5, 4(t7)       // t5 = outptr[1]
    addu        t4, t6, a1      // t4 = new end address
    beq         a1, t9, 5f      // whole row is residual: byte copy only
     subu       t8, t4, t9
4:
/* Copy 16 bytes per iteration (unaligned-safe) */
    ulw         t0, 0(t6)
    ulw         t1, 4(t6)
    ulw         t2, 8(t6)
    usw         t0, 0(t5)
    ulw         t0, 12(t6)
    usw         t1, 4(t5)
    usw         t2, 8(t5)
    usw         t0, 12(t5)
    addiu       t6, 16
    bne         t6, t8, 4b
     addiu      t5, 16
    beqz        t9, 6f          // no residual bytes
     nop
5:
/* Byte-at-a-time copy for the residual */
    lbu         t0, 0(t6)
    sb          t0, 0(t5)
    addiu       t6, 1
    bne         t6, t4, 5b
     addiu      t5, 1
6:
    addiu       t7, 8           // advance two output row pointers
    addiu       a0, -2          // two output rows produced per input row
    bgtz        a0, 0b
     addiu      a2, 4           // next input row pointer
7:
    j           ra
     nop
END(jsimd_h2v2_upsample_dspr2)
1827
1828
1829/*****************************************************************************/
1830LEAF_DSPR2(jsimd_idct_islow_dspr2)
1831/*
1832 * a0 = coef_block
1833 * a1 = compptr->dcttable
1834 * a2 = output
1835 * a3 = range_limit
1836 */
1837    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1838
1839    addiu       sp, sp, -256
1840    move        v0, sp
1841    addiu       v1, zero, 8     // v1 = DCTSIZE = 8
18421:
1843    lh          s4, 32(a0)      // s4 = inptr[16]
1844    lh          s5, 64(a0)      // s5 = inptr[32]
1845    lh          s6, 96(a0)      // s6 = inptr[48]
1846    lh          t1, 112(a0)     // t1 = inptr[56]
1847    lh          t7, 16(a0)      // t7 = inptr[8]
1848    lh          t5, 80(a0)      // t5 = inptr[40]
1849    lh          t3, 48(a0)      // t3 = inptr[24]
1850    or          s4, s4, t1
1851    or          s4, s4, t3
1852    or          s4, s4, t5
1853    or          s4, s4, t7
1854    or          s4, s4, s5
1855    or          s4, s4, s6
1856    bnez        s4, 2f
1857     addiu      v1, v1, -1
1858    lh          s5, 0(a1)       // quantptr[DCTSIZE*0]
1859    lh          s6, 0(a0)       // inptr[DCTSIZE*0]
1860    mul         s5, s5, s6      // DEQUANTIZE(inptr[0], quantptr[0])
1861    sll         s5, s5, 2
1862    sw          s5, 0(v0)
1863    sw          s5, 32(v0)
1864    sw          s5, 64(v0)
1865    sw          s5, 96(v0)
1866    sw          s5, 128(v0)
1867    sw          s5, 160(v0)
1868    sw          s5, 192(v0)
1869    b           3f
1870     sw         s5, 224(v0)
18712:
1872    lh          t0, 112(a1)
1873    lh          t2, 48(a1)
1874    lh          t4, 80(a1)
1875    lh          t6, 16(a1)
1876    mul         t0, t0, t1      // DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7])
1877    mul         t1, t2, t3      // DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3])
1878    mul         t2, t4, t5      // DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5])
1879    mul         t3, t6, t7      // DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1])
1880    lh          t4, 32(a1)
1881    lh          t5, 32(a0)
1882    lh          t6, 96(a1)
1883    lh          t7, 96(a0)
1884    addu        s0, t0, t1       // z3 = tmp0 + tmp2
1885    addu        s1, t1, t2       // z2 = tmp1 + tmp2
1886    addu        s2, t2, t3       // z4 = tmp1 + tmp3
1887    addu        s3, s0, s2       // z3 + z4
1888    addiu       t9, zero, 9633   // FIX_1_175875602
1889    mul         s3, s3, t9       // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
1890    addu        t8, t0, t3       // z1 = tmp0 + tmp3
1891    addiu       t9, zero, 2446   // FIX_0_298631336
1892    mul         t0, t0, t9       // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
1893    addiu       t9, zero, 16819  // FIX_2_053119869
1894    mul         t2, t2, t9       // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
1895    addiu       t9, zero, 25172  // FIX_3_072711026
1896    mul         t1, t1, t9       // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
1897    addiu       t9, zero, 12299  // FIX_1_501321110
1898    mul         t3, t3, t9       // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
1899    addiu       t9, zero, 16069  // FIX_1_961570560
1900    mul         s0, s0, t9       // -z3 = MULTIPLY(z3, FIX_1_961570560)
1901    addiu       t9, zero, 3196   // FIX_0_390180644
1902    mul         s2, s2, t9       // -z4 = MULTIPLY(z4, FIX_0_390180644)
1903    addiu       t9, zero, 7373   // FIX_0_899976223
1904    mul         t8, t8, t9       // -z1 = MULTIPLY(z1, FIX_0_899976223)
1905    addiu       t9, zero, 20995  // FIX_2_562915447
1906    mul         s1, s1, t9       // -z2 = MULTIPLY(z2, FIX_2_562915447)
1907    subu        s0, s3, s0       // z3 += z5
1908    addu        t0, t0, s0       // tmp0 += z3
1909    addu        t1, t1, s0       // tmp2 += z3
1910    subu        s2, s3, s2       // z4 += z5
1911    addu        t2, t2, s2       // tmp1 += z4
1912    addu        t3, t3, s2       // tmp3 += z4
1913    subu        t0, t0, t8       // tmp0 += z1
1914    subu        t1, t1, s1       // tmp2 += z2
1915    subu        t2, t2, s1       // tmp1 += z2
1916    subu        t3, t3, t8       // tmp3 += z1
1917    mul         s0, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2])
1918    addiu       t9, zero, 6270   // FIX_0_765366865
1919    mul         s1, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6])
1920    lh          t4, 0(a1)
1921    lh          t5, 0(a0)
1922    lh          t6, 64(a1)
1923    lh          t7, 64(a0)
1924    mul         s2, t9, s0       // MULTIPLY(z2, FIX_0_765366865)
1925    mul         t5, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0])
1926    mul         t6, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4])
1927    addiu       t9, zero, 4433   // FIX_0_541196100
1928    addu        s3, s0, s1       // z2 + z3
1929    mul         s3, s3, t9       // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
1930    addiu       t9, zero, 15137  // FIX_1_847759065
1931    mul         t8, s1, t9       // MULTIPLY(z3, FIX_1_847759065)
1932    addu        t4, t5, t6
1933    subu        t5, t5, t6
1934    sll         t4, t4, 13      // tmp0 = (z2 + z3) << CONST_BITS
1935    sll         t5, t5, 13      // tmp1 = (z2 - z3) << CONST_BITS
1936    addu        t7, s3, s2      // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
1937    subu        t6, s3, t8      // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065)
1938    addu        s0, t4, t7
1939    subu        s1, t4, t7
1940    addu        s2, t5, t6
1941    subu        s3, t5, t6
1942    addu        t4, s0, t3
1943    subu        s0, s0, t3
1944    addu        t3, s2, t1
1945    subu        s2, s2, t1
1946    addu        t1, s3, t2
1947    subu        s3, s3, t2
1948    addu        t2, s1, t0
1949    subu        s1, s1, t0
1950    shra_r.w    t4, t4, 11
1951    shra_r.w    t3, t3, 11
1952    shra_r.w    t1, t1, 11
1953    shra_r.w    t2, t2, 11
1954    shra_r.w    s1, s1, 11
1955    shra_r.w    s3, s3, 11
1956    shra_r.w    s2, s2, 11
1957    shra_r.w    s0, s0, 11
1958    sw          t4, 0(v0)
1959    sw          t3, 32(v0)
1960    sw          t1, 64(v0)
1961    sw          t2, 96(v0)
1962    sw          s1, 128(v0)
1963    sw          s3, 160(v0)
1964    sw          s2, 192(v0)
1965    sw          s0, 224(v0)
19663:
1967    addiu       a1, a1, 2
1968    addiu       a0, a0, 2
1969    bgtz        v1, 1b
1970     addiu      v0, v0, 4
1971    move        v0, sp
1972    addiu       v1, zero, 8
19734:
1974    lw          t0, 8(v0)       // z2 = (JLONG)wsptr[2]
1975    lw          t1, 24(v0)      // z3 = (JLONG)wsptr[6]
1976    lw          t2, 0(v0)       // (JLONG)wsptr[0]
1977    lw          t3, 16(v0)      // (JLONG)wsptr[4]
1978    lw          s4, 4(v0)       // (JLONG)wsptr[1]
1979    lw          s5, 12(v0)      // (JLONG)wsptr[3]
1980    lw          s6, 20(v0)      // (JLONG)wsptr[5]
1981    lw          s7, 28(v0)      // (JLONG)wsptr[7]
1982    or          s4, s4, t0
1983    or          s4, s4, t1
1984    or          s4, s4, t3
1985    or          s4, s4, s7
1986    or          s4, s4, s5
1987    or          s4, s4, s6
1988    bnez        s4, 5f
1989     addiu      v1, v1, -1
1990    shra_r.w    s5, t2, 5
1991    andi        s5, s5, 0x3ff
1992    lbux        s5, s5(a3)
1993    lw          s1, 0(a2)
1994    replv.qb    s5, s5
1995    usw         s5, 0(s1)
1996    usw         s5, 4(s1)
1997    b           6f
1998     nop
19995:
2000    addu        t4, t0, t1       // z2 + z3
2001    addiu       t8, zero, 4433   // FIX_0_541196100
2002    mul         t5, t4, t8       // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
2003    addiu       t8, zero, 15137  // FIX_1_847759065
2004    mul         t1, t1, t8       // MULTIPLY(z3, FIX_1_847759065)
2005    addiu       t8, zero, 6270   // FIX_0_765366865
2006    mul         t0, t0, t8       // MULTIPLY(z2, FIX_0_765366865)
2007    addu        t4, t2, t3       // (JLONG)wsptr[0] + (JLONG)wsptr[4]
2008    subu        t2, t2, t3       // (JLONG)wsptr[0] - (JLONG)wsptr[4]
2009    sll         t4, t4, 13       // tmp0 = (wsptr[0] + wsptr[4]) << CONST_BITS
2010    sll         t2, t2, 13       // tmp1 = (wsptr[0] - wsptr[4]) << CONST_BITS
2011    subu        t1, t5, t1       // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065)
2012    subu        t3, t2, t1       // tmp12 = tmp1 - tmp2
2013    addu        t2, t2, t1       // tmp11 = tmp1 + tmp2
2014    addu        t5, t5, t0       // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
2015    subu        t1, t4, t5       // tmp13 = tmp0 - tmp3
2016    addu        t0, t4, t5       // tmp10 = tmp0 + tmp3
2017    lw          t4, 28(v0)       // tmp0 = (JLONG)wsptr[7]
2018    lw          t6, 12(v0)       // tmp2 = (JLONG)wsptr[3]
2019    lw          t5, 20(v0)       // tmp1 = (JLONG)wsptr[5]
2020    lw          t7, 4(v0)        // tmp3 = (JLONG)wsptr[1]
2021    addu        s0, t4, t6       // z3 = tmp0 + tmp2
2022    addiu       t8, zero, 9633   // FIX_1_175875602
2023    addu        s1, t5, t7       // z4 = tmp1 + tmp3
2024    addu        s2, s0, s1       // z3 + z4
2025    mul         s2, s2, t8       // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
2026    addu        s3, t4, t7       // z1 = tmp0 + tmp3
2027    addu        t9, t5, t6       // z2 = tmp1 + tmp2
2028    addiu       t8, zero, 16069  // FIX_1_961570560
2029    mul         s0, s0, t8       // -z3 = MULTIPLY(z3, FIX_1_961570560)
2030    addiu       t8, zero, 3196   // FIX_0_390180644
2031    mul         s1, s1, t8       // -z4 = MULTIPLY(z4, FIX_0_390180644)
2032    addiu       t8, zero, 2446   // FIX_0_298631336
2033    mul         t4, t4, t8       // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
2034    addiu       t8, zero, 7373   // FIX_0_899976223
2035    mul         s3, s3, t8       // -z1 = MULTIPLY(z1, FIX_0_899976223)
2036    addiu       t8, zero, 16819  // FIX_2_053119869
2037    mul         t5, t5, t8       // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
2038    addiu       t8, zero, 20995  // FIX_2_562915447
2039    mul         t9, t9, t8       // -z2 = MULTIPLY(z2, FIX_2_562915447)
2040    addiu       t8, zero, 25172  // FIX_3_072711026
2041    mul         t6, t6, t8       // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
2042    addiu       t8, zero, 12299  // FIX_1_501321110
2043    mul         t7, t7, t8       // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
2044    subu        s0, s2, s0       // z3 += z5
2045    subu        s1, s2, s1       // z4 += z5
2046    addu        t4, t4, s0
2047    subu        t4, t4, s3      // tmp0
2048    addu        t5, t5, s1
2049    subu        t5, t5, t9      // tmp1
2050    addu        t6, t6, s0
2051    subu        t6, t6, t9      // tmp2
2052    addu        t7, t7, s1
2053    subu        t7, t7, s3      // tmp3
2054    addu        s0, t0, t7
2055    subu        t0, t0, t7
2056    addu        t7, t2, t6
2057    subu        t2, t2, t6
2058    addu        t6, t3, t5
2059    subu        t3, t3, t5
2060    addu        t5, t1, t4
2061    subu        t1, t1, t4
2062    shra_r.w    s0, s0, 18
2063    shra_r.w    t7, t7, 18
2064    shra_r.w    t6, t6, 18
2065    shra_r.w    t5, t5, 18
2066    shra_r.w    t1, t1, 18
2067    shra_r.w    t3, t3, 18
2068    shra_r.w    t2, t2, 18
2069    shra_r.w    t0, t0, 18
2070    andi        s0, s0, 0x3ff
2071    andi        t7, t7, 0x3ff
2072    andi        t6, t6, 0x3ff
2073    andi        t5, t5, 0x3ff
2074    andi        t1, t1, 0x3ff
2075    andi        t3, t3, 0x3ff
2076    andi        t2, t2, 0x3ff
2077    andi        t0, t0, 0x3ff
2078    lw          s1, 0(a2)
2079    lbux        s0, s0(a3)
2080    lbux        t7, t7(a3)
2081    lbux        t6, t6(a3)
2082    lbux        t5, t5(a3)
2083    lbux        t1, t1(a3)
2084    lbux        t3, t3(a3)
2085    lbux        t2, t2(a3)
2086    lbux        t0, t0(a3)
2087    sb          s0, 0(s1)
2088    sb          t7, 1(s1)
2089    sb          t6, 2(s1)
2090    sb          t5, 3(s1)
2091    sb          t1, 4(s1)
2092    sb          t3, 5(s1)
2093    sb          t2, 6(s1)
2094    sb          t0, 7(s1)
20956:
2096    addiu       v0, v0, 32
2097    bgtz        v1, 4b
2098     addiu      a2, a2, 4
2099    addiu       sp, sp, 256
2100
2101    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2102
2103    j           ra
2104     nop
2105
2106END(jsimd_idct_islow_dspr2)
2107
2108
2109/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2)
/*
 * Column pass of the "ifast" (AA&N) inverse DCT, using DSPr2 packed
 * halfword arithmetic to process two 16-bit columns per iteration.
 *
 * a0 = inptr     (8x8 block of DCT coefficients, 16-bit each)
 * a1 = quantptr  (dequantization multipliers, same layout as inptr)
 * a2 = wsptr     (workspace receiving the column-pass results)
 * a3 = mips_idct_ifast_coefs (packed fixed-point constants; word offsets
 *      0/4/8/12 hold FIX(1.082392200)/FIX(1.414213562)/FIX(1.847759065)/
 *      FIX(-2.613125930) per the loads below)
 *
 * NOTE(review): per this file's convention, an instruction indented one
 * extra space sits in a branch delay slot and executes unconditionally.
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    addiu         t9, a0, 16      // end address: 8 column pairs, 2 bytes apart
    or            AT, a3, zero    // AT = constant-table base

0:
    lw            s0, 0(a1)       // quantptr[DCTSIZE*0]
    lw            t0, 0(a0)       // inptr[DCTSIZE*0]
    lw            t1, 16(a0)      // inptr[DCTSIZE*1]
    muleq_s.w.phl v0, t0, s0      // tmp0 ... (dequantize DC, upper halfword)
    lw            t2, 32(a0)      // inptr[DCTSIZE*2]
    lw            t3, 48(a0)      // inptr[DCTSIZE*3]
    lw            t4, 64(a0)      // inptr[DCTSIZE*4]
    lw            t5, 80(a0)      // inptr[DCTSIZE*5]
    muleq_s.w.phr t0, t0, s0      // ... tmp0 ... (dequantize DC, lower halfword)
    lw            t6, 96(a0)      // inptr[DCTSIZE*6]
    lw            t7, 112(a0)     // inptr[DCTSIZE*7]
    // Shortcut: if every AC coefficient in this column pair is zero, the
    // IDCT output is constant -- replicate the dequantized DC to all 8 rows.
    or            s4, t1, t2
    or            s5, t3, t4
    bnez          s4, 1f
     ins          t0, v0, 16, 16  // ... tmp0 (merge upper result into t0)
    bnez          s5, 1f
     or           s6, t5, t6
    or            s6, s6, t7
    bnez          s6, 1f
     sw           t0, 0(a2)       // wsptr[DCTSIZE*0]
    sw            t0, 16(a2)      // wsptr[DCTSIZE*1]
    sw            t0, 32(a2)      // wsptr[DCTSIZE*2]
    sw            t0, 48(a2)      // wsptr[DCTSIZE*3]
    sw            t0, 64(a2)      // wsptr[DCTSIZE*4]
    sw            t0, 80(a2)      // wsptr[DCTSIZE*5]
    sw            t0, 96(a2)      // wsptr[DCTSIZE*6]
    sw            t0, 112(a2)     // wsptr[DCTSIZE*7]
    addiu         a0, a0, 4
    b             2f
     addiu        a1, a1, 4

1:
    // General case: dequantize the remaining rows and run the even part
    // (tmp0-tmp3 from rows 0/2/4/6) and odd part (tmp4-tmp7 from rows
    // 1/3/5/7) of the AA&N butterfly on packed halfword pairs.
    lw            s1, 32(a1)      // quantptr[DCTSIZE*2]
    lw            s2, 64(a1)      // quantptr[DCTSIZE*4]
    muleq_s.w.phl v0, t2, s1      // tmp1 ...
    muleq_s.w.phr t2, t2, s1      // ... tmp1 ...
    lw            s0, 16(a1)      // quantptr[DCTSIZE*1]
    lw            s1, 48(a1)      // quantptr[DCTSIZE*3]
    lw            s3, 96(a1)      // quantptr[DCTSIZE*6]
    muleq_s.w.phl v1, t4, s2      // tmp2 ...
    muleq_s.w.phr t4, t4, s2      // ... tmp2 ...
    lw            s2, 80(a1)      // quantptr[DCTSIZE*5]
    lw            t8, 4(AT)       // FIX(1.414213562)
    ins           t2, v0, 16, 16  // ... tmp1
    muleq_s.w.phl v0, t6, s3      // tmp3 ...
    muleq_s.w.phr t6, t6, s3      // ... tmp3 ...
    ins           t4, v1, 16, 16  // ... tmp2
    addq.ph       s4, t0, t4      // tmp10
    subq.ph       s5, t0, t4      // tmp11
    ins           t6, v0, 16, 16  // ... tmp3
    subq.ph       s6, t2, t6      // tmp12 ...
    addq.ph       s7, t2, t6      // tmp13
    mulq_s.ph     s6, s6, t8      // ... tmp12 ... (Q15 multiply keeps 1/2x)
    addq.ph       t0, s4, s7      // tmp0
    subq.ph       t6, s4, s7      // tmp3
    muleq_s.w.phl v0, t1, s0      // tmp4 ...
    muleq_s.w.phr t1, t1, s0      // ... tmp4 ...
    shll_s.ph     s6, s6, 1       // x2 (compensate Q15 halving, saturating)
    lw            s3, 112(a1)     // quantptr[DCTSIZE*7]
    subq.ph       s6, s6, s7      // ... tmp12
    muleq_s.w.phl v1, t7, s3      // tmp7 ...
    muleq_s.w.phr t7, t7, s3      // ... tmp7 ...
    ins           t1, v0, 16, 16  // ... tmp4
    addq.ph       t2, s5, s6      // tmp1
    subq.ph       t4, s5, s6      // tmp2
    muleq_s.w.phl v0, t5, s2      // tmp6 ...
    muleq_s.w.phr t5, t5, s2      // ... tmp6 ...
    ins           t7, v1, 16, 16  // ... tmp7
    addq.ph       s5, t1, t7      // z11
    subq.ph       s6, t1, t7      // z12
    muleq_s.w.phl v1, t3, s1      // tmp5 ...
    muleq_s.w.phr t3, t3, s1      // ... tmp5 ...
    ins           t5, v0, 16, 16  // ... tmp6
    ins           t3, v1, 16, 16  // ... tmp5
    addq.ph       s7, t5, t3      // z13
    subq.ph       v0, t5, t3      // z10
    addq.ph       t7, s5, s7      // tmp7
    subq.ph       s5, s5, s7      // tmp11 ...
    addq.ph       v1, v0, s6      // z5 ...
    mulq_s.ph     s5, s5, t8      // ... tmp11
    lw            t8, 8(AT)       // FIX(1.847759065)
    lw            s4, 0(AT)       // FIX(1.082392200)
    addq.ph       s0, t0, t7
    subq.ph       s1, t0, t7
    mulq_s.ph     v1, v1, t8      // ... z5
    shll_s.ph     s5, s5, 1       // x2
    lw            t8, 12(AT)      // FIX(-2.613125930)
    sw            s0, 0(a2)       // wsptr[DCTSIZE*0]
    shll_s.ph     v0, v0, 1       // x4
    mulq_s.ph     v0, v0, t8      // tmp12 ...
    mulq_s.ph     s4, s6, s4      // tmp10 ...
    shll_s.ph     v1, v1, 1       // x2
    addiu         a0, a0, 4
    addiu         a1, a1, 4
    sw            s1, 112(a2)     // wsptr[DCTSIZE*7]
    shll_s.ph     s6, v0, 1       // x4
    shll_s.ph     s4, s4, 1       // x2
    addq.ph       s6, s6, v1      // ... tmp12
    subq.ph       t5, s6, t7      // tmp6
    subq.ph       s4, s4, v1      // ... tmp10
    subq.ph       t3, s5, t5      // tmp5
    addq.ph       s2, t2, t5
    addq.ph       t1, s4, t3      // tmp4
    subq.ph       s3, t2, t5
    sw            s2, 16(a2)      // wsptr[DCTSIZE*1]
    sw            s3, 96(a2)      // wsptr[DCTSIZE*6]
    addq.ph       v0, t4, t3
    subq.ph       v1, t4, t3
    sw            v0, 32(a2)      // wsptr[DCTSIZE*2]
    sw            v1, 80(a2)      // wsptr[DCTSIZE*5]
    addq.ph       v0, t6, t1
    subq.ph       v1, t6, t1
    sw            v0, 64(a2)      // wsptr[DCTSIZE*4]
    sw            v1, 48(a2)      // wsptr[DCTSIZE*3]

2:
    bne           a0, t9, 0b
     addiu        a2, a2, 4

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j             ra
     nop

END(jsimd_idct_ifast_cols_dspr2)
2248
2249
2250/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2)
/*
 * Row pass of the "ifast" (AA&N) inverse DCT.  Consumes the workspace
 * written by the column pass, processing two rows per iteration with
 * DSPr2 packed halfword arithmetic, and stores 8 output bytes per row.
 *
 * a0 = wsptr      (workspace from the column pass; 2 rows = 32 bytes)
 * a1 = output_buf (array of row pointers)
 * a2 = output_col (byte offset added to each row pointer)
 * a3 = mips_idct_ifast_coefs (packed constants; also spilled to the
 *      stack by SAVE_REGS_ON_STACK and reloaded each iteration because
 *      a3/AT are reused as scratch inside the loop)
 *
 * Lowercase letters in the comments (a..h) tag row-0 samples and
 * uppercase (A..H) row-1 samples of the current pair.
 * NOTE(review): per this file's convention, an instruction indented one
 * extra space sits in a branch delay slot and executes unconditionally.
 */
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3

    addiu         t9, a0, 128     // end address (4 iterations x 32 bytes)
    lui           s8, 0x8080      // s8 = 0x80808080: per-byte bias added to
    ori           s8, s8, 0x8080  // results -- presumably recenters samples
                                  // to the unsigned range; confirm vs CENTERJSAMPLE

0:
    lw            AT, 36(sp)      // restore $a3 (mips_idct_ifast_coefs)
    lw            t0, 0(a0)       // wsptr[DCTSIZE*0+0/1]  b a
    lw            s0, 16(a0)      // wsptr[DCTSIZE*1+0/1]  B A
    lw            t2, 4(a0)       // wsptr[DCTSIZE*0+2/3]  d c
    lw            s2, 20(a0)      // wsptr[DCTSIZE*1+2/3]  D C
    lw            t4, 8(a0)       // wsptr[DCTSIZE*0+4/5]  f e
    lw            s4, 24(a0)      // wsptr[DCTSIZE*1+4/5]  F E
    lw            t6, 12(a0)      // wsptr[DCTSIZE*0+6/7]  h g
    lw            s6, 28(a0)      // wsptr[DCTSIZE*1+6/7]  H G
    precrq.ph.w   t1, s0, t0      // B b
    ins           t0, s0, 16, 16  // A a
    // Shortcut: if columns 1..7 of both rows are all zero, each output row
    // is 8 copies of its (scaled) DC sample.
    bnez          t1, 1f
     or           s0, t2, s2
    bnez          s0, 1f
     or           s0, t4, s4
    bnez          s0, 1f
     or           s0, t6, s6
    bnez          s0, 1f
     shll_s.ph    s0, t0, 2       // A a (saturating final descale)
    lw            a3, 0(a1)       // row-0 output pointer
    lw            AT, 4(a1)       // row-1 output pointer
    precrq.ph.w   t0, s0, s0      // A A
    ins           s0, s0, 16, 16  // a a
    addu          a3, a3, a2
    addu          AT, AT, a2
    precrq.qb.ph  t0, t0, t0      // A A A A
    precrq.qb.ph  s0, s0, s0      // a a a a
    addu.qb       s0, s0, s8
    addu.qb       t0, t0, s8
    sw            s0, 0(a3)
    sw            s0, 4(a3)
    sw            t0, 0(AT)
    sw            t0, 4(AT)
    addiu         a0, a0, 32
    bne           a0, t9, 0b
     addiu        a1, a1, 8
    b             2f
     nop

1:
    // General case: interleave the two rows into packed pairs, then run
    // the AA&N even/odd butterflies exactly as in the column pass.
    precrq.ph.w   t3, s2, t2
    ins           t2, s2, 16, 16
    precrq.ph.w   t5, s4, t4
    ins           t4, s4, 16, 16
    precrq.ph.w   t7, s6, t6
    ins           t6, s6, 16, 16
    lw            t8, 4(AT)       // FIX(1.414213562)
    addq.ph       s4, t0, t4      // tmp10
    subq.ph       s5, t0, t4      // tmp11
    subq.ph       s6, t2, t6      // tmp12 ...
    addq.ph       s7, t2, t6      // tmp13
    mulq_s.ph     s6, s6, t8      // ... tmp12 ... (Q15 multiply keeps 1/2x)
    addq.ph       t0, s4, s7      // tmp0
    subq.ph       t6, s4, s7      // tmp3
    shll_s.ph     s6, s6, 1       // x2 (compensate Q15 halving, saturating)
    subq.ph       s6, s6, s7      // ... tmp12
    addq.ph       t2, s5, s6      // tmp1
    subq.ph       t4, s5, s6      // tmp2
    addq.ph       s5, t1, t7      // z11
    subq.ph       s6, t1, t7      // z12
    addq.ph       s7, t5, t3      // z13
    subq.ph       v0, t5, t3      // z10
    addq.ph       t7, s5, s7      // tmp7
    subq.ph       s5, s5, s7      // tmp11 ...
    addq.ph       v1, v0, s6      // z5 ...
    mulq_s.ph     s5, s5, t8      // ... tmp11
    lw            t8, 8(AT)       // FIX(1.847759065)
    lw            s4, 0(AT)       // FIX(1.082392200)
    addq.ph       s0, t0, t7      // tmp0 + tmp7
    subq.ph       s7, t0, t7      // tmp0 - tmp7
    mulq_s.ph     v1, v1, t8      // ... z5
    lw            a3, 0(a1)       // row-0 output pointer
    lw            t8, 12(AT)      // FIX(-2.613125930)
    shll_s.ph     s5, s5, 1       // x2
    addu          a3, a3, a2
    shll_s.ph     v0, v0, 1       // x4
    mulq_s.ph     v0, v0, t8      // tmp12 ...
    mulq_s.ph     s4, s6, s4      // tmp10 ...
    shll_s.ph     v1, v1, 1       // x2
    addiu         a0, a0, 32
    addiu         a1, a1, 8
    shll_s.ph     s6, v0, 1       // x4
    shll_s.ph     s4, s4, 1       // x2
    addq.ph       s6, s6, v1      // ... tmp12
    shll_s.ph     s0, s0, 2
    subq.ph       t5, s6, t7      // tmp6
    subq.ph       s4, s4, v1      // ... tmp10
    subq.ph       t3, s5, t5      // tmp5
    shll_s.ph     s7, s7, 2
    addq.ph       t1, s4, t3      // tmp4
    addq.ph       s1, t2, t5      // tmp1 + tmp6
    subq.ph       s6, t2, t5      // tmp1 - tmp6
    addq.ph       s2, t4, t3      // tmp2 + tmp5
    subq.ph       s5, t4, t3      // tmp2 - tmp5
    addq.ph       s4, t6, t1      // tmp3 + tmp4
    subq.ph       s3, t6, t1      // tmp3 - tmp4
    // Final saturating descale, then repack halfwords to bytes and bias.
    shll_s.ph     s1, s1, 2
    shll_s.ph     s2, s2, 2
    shll_s.ph     s3, s3, 2
    shll_s.ph     s4, s4, 2
    shll_s.ph     s5, s5, 2
    shll_s.ph     s6, s6, 2
    precrq.ph.w   t0, s1, s0      // B A
    ins           s0, s1, 16, 16  // b a
    precrq.ph.w   t2, s3, s2      // D C
    ins           s2, s3, 16, 16  // d c
    precrq.ph.w   t4, s5, s4      // F E
    ins           s4, s5, 16, 16  // f e
    precrq.ph.w   t6, s7, s6      // H G
    ins           s6, s7, 16, 16  // h g
    precrq.qb.ph  t0, t2, t0      // D C B A
    precrq.qb.ph  s0, s2, s0      // d c b a
    precrq.qb.ph  t4, t6, t4      // H G F E
    precrq.qb.ph  s4, s6, s4      // h g f e
    addu.qb       s0, s0, s8
    addu.qb       s4, s4, s8
    sw            s0, 0(a3)       // outptr[0/1/2/3]       d c b a
    sw            s4, 4(a3)       // outptr[4/5/6/7]       h g f e
    lw            a3, -4(a1)      // row-1 pointer (a1 already advanced by 8)
    addu.qb       t0, t0, s8
    addu          a3, a3, a2
    addu.qb       t4, t4, s8
    sw            t0, 0(a3)       // outptr[0/1/2/3]       D C B A
    bne           a0, t9, 0b
     sw           t4, 4(a3)       // outptr[4/5/6/7]       H G F E

2:

    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3

    j             ra
     nop

END(jsimd_idct_ifast_rows_dspr2)
2399
2400
2401/*****************************************************************************/
LEAF_DSPR2(jsimd_fdct_islow_dspr2)
/*
 * Accurate ("islow") forward DCT on one 8x8 block of 16-bit samples,
 * in place.  Pass 1 (label 1:) transforms rows two packed samples at a
 * time using the four DSP accumulators; pass 2 (label 2:) transforms
 * columns with scalar multiply-accumulate.
 *
 * a0 = data (8x8 int16 block, row-major, modified in place)
 *
 * t0..t9 first hold packed 16-bit constant pairs for the row pass and
 * are then reloaded as scalar constants for the column pass.
 * NOTE(review): an instruction indented one extra space sits in a branch
 * delay slot and executes unconditionally.
 */
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8

    // Packed constant pairs (hi|lo halfwords) for dpa.w.ph in the row
    // pass; negative halfwords are written in hex (e.g. 0xd39e = -11362).
    lui         t0, 6437
    ori         t0, 2260
    lui         t1, 9633
    ori         t1, 11363
    lui         t2, 0xd39e
    ori         t2, 0xe6dc
    lui         t3, 0xf72d
    ori         t3, 9633
    lui         t4, 2261
    ori         t4, 9633
    lui         t5, 0xd39e
    ori         t5, 6437
    lui         t6, 9633
    ori         t6, 0xd39d
    lui         t7, 0xe6dc
    ori         t7, 2260
    lui         t8, 4433
    ori         t8, 10703
    lui         t9, 0xd630
    ori         t9, 4433
    li          s8, 8           // row counter
    move        a1, a0
1:
    lw          s0, 0(a1)       // tmp0 = 1|0
    lw          s1, 4(a1)       // tmp1 = 3|2
    lw          s2, 8(a1)       // tmp2 = 5|4
    lw          s3, 12(a1)      // tmp3 = 7|6
    packrl.ph   s1, s1, s1      // tmp1 = 2|3 (swap so pairs line up end-to-end)
    packrl.ph   s3, s3, s3      // tmp3 = 6|7
    subq.ph     s7, s1, s2      // tmp7 = 2-5|3-4 = t5|t4
    subq.ph     s5, s0, s3      // tmp5 = 1-6|0-7 = t6|t7
    // Odd-part outputs: four dot products accumulated in ac0..ac3.
    mult        $0, $0          // ac0  = 0
    dpa.w.ph    $ac0, s7, t0    // ac0 += t5*  6437 + t4*  2260
    dpa.w.ph    $ac0, s5, t1    // ac0 += t6*  9633 + t7* 11363
    mult        $ac1, $0, $0    // ac1  = 0
    dpa.w.ph    $ac1, s7, t2    // ac1 += t5*-11362 + t4* -6436
    dpa.w.ph    $ac1, s5, t3    // ac1 += t6* -2259 + t7*  9633
    mult        $ac2, $0, $0    // ac2  = 0
    dpa.w.ph    $ac2, s7, t4    // ac2 += t5*  2261 + t4*  9633
    dpa.w.ph    $ac2, s5, t5    // ac2 += t6*-11362 + t7*  6437
    mult        $ac3, $0, $0    // ac3  = 0
    dpa.w.ph    $ac3, s7, t6    // ac3 += t5*  9633 + t4*-11363
    dpa.w.ph    $ac3, s5, t7    // ac3 += t6* -6436 + t7*  2260
    addq.ph     s6, s1, s2      // tmp6 = 2+5|3+4 = t2|t3
    addq.ph     s4, s0, s3      // tmp4 = 1+6|0+7 = t1|t0
    extr_r.w    s0, $ac0, 11    // tmp0 = (ac0 + 1024) >> 11
    extr_r.w    s1, $ac1, 11    // tmp1 = (ac1 + 1024) >> 11
    extr_r.w    s2, $ac2, 11    // tmp2 = (ac2 + 1024) >> 11
    extr_r.w    s3, $ac3, 11    // tmp3 = (ac3 + 1024) >> 11
    addq.ph     s5, s4, s6      // tmp5 = t1+t2|t0+t3 = t11|t10
    subq.ph     s7, s4, s6      // tmp7 = t1-t2|t0-t3 = t12|t13
    // Odd row outputs (columns 1, 3, 5, 7).
    sh          s0, 2(a1)
    sh          s1, 6(a1)
    sh          s2, 10(a1)
    sh          s3, 14(a1)
    mult        $0, $0          // ac0  = 0
    dpa.w.ph    $ac0, s7, t8    // ac0 += t12*  4433 + t13* 10703
    mult        $ac1, $0, $0    // ac1  = 0
    dpa.w.ph    $ac1, s7, t9    // ac1 += t12*-10704 + t13*  4433
    sra         s4, s5, 16      // tmp4 = t11
    addiu       a1, a1, 16      // next row
    addiu       s8, s8, -1
    extr_r.w    s0, $ac0, 11    // tmp0 = (ac0 + 1024) >> 11
    extr_r.w    s1, $ac1, 11    // tmp1 = (ac1 + 1024) >> 11
    addu        s2, s5, s4      // tmp2 = t10 + t11
    subu        s3, s5, s4      // tmp3 = t10 - t11
    sll         s2, s2, 2       // tmp2 = (t10 + t11) << 2
    sll         s3, s3, 2       // tmp3 = (t10 - t11) << 2
    // Even row outputs (columns 0, 4, 2, 6); a1 already advanced.
    sh          s2, -16(a1)
    sh          s3, -8(a1)
    sh          s0, -12(a1)
    bgtz        s8, 1b
     sh         s1, -4(a1)
    // Scalar constants for the column pass (c0..c10 in the comments).
    li          t0, 2260
    li          t1, 11363
    li          t2, 9633
    li          t3, 6436
    li          t4, 6437
    li          t5, 2261
    li          t6, 11362
    li          t7, 2259
    li          t8, 4433
    li          t9, 10703
    li          a1, 10704
    li          s8, 8           // column counter

2:
    lh          a2, 0(a0)       // 0
    lh          a3, 16(a0)      // 8
    lh          v0, 32(a0)      // 16
    lh          v1, 48(a0)      // 24
    lh          s4, 64(a0)      // 32
    lh          s5, 80(a0)      // 40
    lh          s6, 96(a0)      // 48
    lh          s7, 112(a0)     // 56
    addu        s2, v0, s5      // tmp2 = 16 + 40
    subu        s5, v0, s5      // tmp5 = 16 - 40
    addu        s3, v1, s4      // tmp3 = 24 + 32
    subu        s4, v1, s4      // tmp4 = 24 - 32
    addu        s0, a2, s7      // tmp0 =  0 + 56
    subu        s7, a2, s7      // tmp7 =  0 - 56
    addu        s1, a3, s6      // tmp1 =  8 + 48
    subu        s6, a3, s6      // tmp6 =  8 - 48
    addu        a2, s0, s3      // tmp10 = tmp0 + tmp3
    subu        v1, s0, s3      // tmp13 = tmp0 - tmp3
    addu        a3, s1, s2      // tmp11 = tmp1 + tmp2
    subu        v0, s1, s2      // tmp12 = tmp1 - tmp2
    // Odd part: four multiply-accumulate chains, one per accumulator.
    mult        s7, t1          // ac0  = tmp7 * c1
    madd        s4, t0          // ac0 += tmp4 * c0
    madd        s5, t4          // ac0 += tmp5 * c4
    madd        s6, t2          // ac0 += tmp6 * c2
    mult        $ac1, s7, t2    // ac1  = tmp7 * c2
    msub        $ac1, s4, t3    // ac1 -= tmp4 * c3
    msub        $ac1, s5, t6    // ac1 -= tmp5 * c6
    msub        $ac1, s6, t7    // ac1 -= tmp6 * c7
    mult        $ac2, s7, t4    // ac2  = tmp7 * c4
    madd        $ac2, s4, t2    // ac2 += tmp4 * c2
    madd        $ac2, s5, t5    // ac2 += tmp5 * c5
    msub        $ac2, s6, t6    // ac2 -= tmp6 * c6
    mult        $ac3, s7, t0    // ac3  = tmp7 * c0
    msub        $ac3, s4, t1    // ac3 -= tmp4 * c1
    madd        $ac3, s5, t2    // ac3 += tmp5 * c2
    msub        $ac3, s6, t3    // ac3 -= tmp6 * c3
    extr_r.w    s0, $ac0, 15    // tmp0 = (ac0 + 16384) >> 15
    extr_r.w    s1, $ac1, 15    // tmp1 = (ac1 + 16384) >> 15
    extr_r.w    s2, $ac2, 15    // tmp2 = (ac2 + 16384) >> 15
    extr_r.w    s3, $ac3, 15    // tmp3 = (ac3 + 16384) >> 15
    addiu       s8, s8, -1
    addu        s4, a2, a3      // tmp4 = tmp10 + tmp11
    subu        s5, a2, a3      // tmp5 = tmp10 - tmp11
    // Odd column outputs (rows 1, 3, 5, 7).
    sh          s0, 16(a0)
    sh          s1, 48(a0)
    sh          s2, 80(a0)
    sh          s3, 112(a0)
    mult        v0, t8          // ac0  = tmp12 * c8
    madd        v1, t9          // ac0 += tmp13 * c9
    mult        $ac1, v1, t8    // ac1  = tmp13 * c8
    msub        $ac1, v0, a1    // ac1 -= tmp12 * c10
    addiu       a0, a0, 2       // next column
    extr_r.w    s6, $ac0, 15    // tmp6 = (ac0 + 16384) >> 15
    extr_r.w    s7, $ac1, 15    // tmp7 = (ac1 + 16384) >> 15
    shra_r.w    s4, s4, 2       // tmp4 = (tmp4 + 2) >> 2
    shra_r.w    s5, s5, 2       // tmp5 = (tmp5 + 2) >> 2
    // Even column outputs (rows 0, 4, 2, 6); a0 already advanced.
    sh          s4, -2(a0)
    sh          s5, 62(a0)
    sh          s6, 30(a0)
    bgtz        s8, 2b
     sh         s7, 94(a0)

    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8

    jr          ra
     nop

END(jsimd_fdct_islow_dspr2)
2563
2564
2565/**************************************************************************/
2566LEAF_DSPR2(jsimd_fdct_ifast_dspr2)
2567/*
2568 * a0 = data
2569 */
2570    .set at
2571
2572    SAVE_REGS_ON_STACK 8, s0, s1
2573
2574    li          a1, 0x014e014e  // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
2575    li          a2, 0x008b008b  // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
2576    li          a3, 0x00620062  // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
2577    li          s1, 0x00b500b5  // FIX_0_707106781 (181 << 16)|(181 & 0xffff)
2578
2579    move        v0, a0
2580    addiu       v1, v0, 128     // end address
2581
25820:
2583    lw          t0, 0(v0)       // tmp0 = 1|0
2584    lw          t1, 4(v0)       // tmp1 = 3|2
2585    lw          t2, 8(v0)       // tmp2 = 5|4
2586    lw          t3, 12(v0)      // tmp3 = 7|6
2587    packrl.ph   t1, t1, t1      // tmp1 = 2|3
2588    packrl.ph   t3, t3, t3      // tmp3 = 6|7
2589    subq.ph     t7, t1, t2      // tmp7 = 2-5|3-4 = t5|t4
2590    subq.ph     t5, t0, t3      // tmp5 = 1-6|0-7 = t6|t7
2591    addq.ph     t6, t1, t2      // tmp6 = 2+5|3+4 = t2|t3
2592    addq.ph     t4, t0, t3      // tmp4 = 1+6|0+7 = t1|t0
2593    addq.ph     t8, t4, t6      // tmp5 = t1+t2|t0+t3 = t11|t10
2594    subq.ph     t9, t4, t6      // tmp7 = t1-t2|t0-t3 = t12|t13
2595    sra         t4, t8, 16      // tmp4 = t11
2596    mult        $0, $0          // ac0  = 0
2597    dpa.w.ph    $ac0, t9, s1
2598    mult        $ac1, $0, $0    // ac1  = 0
2599    dpa.w.ph    $ac1, t7, a3    // ac1 += t4*98 + t5*98
2600    dpsx.w.ph   $ac1, t5, a3    // ac1 += t6*98 + t7*98
2601    mult        $ac2, $0, $0    // ac2  = 0
2602    dpa.w.ph    $ac2, t7, a2    // ac2 += t4*139 + t5*139
2603    mult        $ac3, $0, $0    // ac3  = 0
2604    dpa.w.ph    $ac3, t5, a1    // ac3 += t6*334 + t7*334
2605    precrq.ph.w t0, t5, t7      // t0 = t5|t6
2606    addq.ph     t2, t8, t4      // tmp2 = t10 + t11
2607    subq.ph     t3, t8, t4      // tmp3 = t10 - t11
2608    extr.w      t4, $ac0, 8
2609    mult        $0, $0          // ac0  = 0
2610    dpa.w.ph    $ac0, t0, s1    // ac0 += t5*181 + t6*181
2611    extr.w      t0, $ac1, 8     // t0 = z5
2612    extr.w      t1, $ac2, 8     // t1 = MULTIPLY(tmp10, 139)
2613    extr.w      t7, $ac3, 8     // t2 = MULTIPLY(tmp12, 334)
2614    extr.w      t8, $ac0, 8     // t8 = z3 = MULTIPLY(tmp11, 181)
2615    add         t6, t1, t0      // t6 = z2
2616    add         t7, t7, t0      // t7 = z4
2617    subq.ph     t0, t5, t8      // t0 = z13 = tmp7 - z3
2618    addq.ph     t8, t5, t8      // t9 = z11 = tmp7 + z3
2619    addq.ph     t1, t0, t6      // t1 = z13 + z2
2620    subq.ph     t6, t0, t6      // t6 = z13 - z2
2621    addq.ph     t0, t8, t7      // t0 = z11 + z4
2622    subq.ph     t7, t8, t7      // t7 = z11 - z4
2623    addq.ph     t5, t4, t9
2624    subq.ph     t4, t9, t4
2625    sh          t2, 0(v0)
2626    sh          t5, 4(v0)
2627    sh          t3, 8(v0)
2628    sh          t4, 12(v0)
2629    sh          t1, 10(v0)
2630    sh          t6, 6(v0)
2631    sh          t0, 2(v0)
2632    sh          t7, 14(v0)
2633    addiu       v0, 16
2634    bne         v1, v0, 0b
2635     nop
2636    move        v0, a0
2637    addiu       v1, v0, 16
2638
26391:
2640    lh          t0, 0(v0)       // 0
2641    lh          t1, 16(v0)      // 8
2642    lh          t2, 32(v0)      // 16
2643    lh          t3, 48(v0)      // 24
2644    lh          t4, 64(v0)      // 32
2645    lh          t5, 80(v0)      // 40
2646    lh          t6, 96(v0)      // 48
2647    lh          t7, 112(v0)     // 56
2648    add         t8, t0, t7      // t8 = tmp0
2649    sub         t7, t0, t7      // t7 = tmp7
2650    add         t0, t1, t6      // t0 = tmp1
2651    sub         t1, t1, t6      // t1 = tmp6
2652    add         t6, t2, t5      // t6 = tmp2
2653    sub         t5, t2, t5      // t5 = tmp5
2654    add         t2, t3, t4      // t2 = tmp3
2655    sub         t3, t3, t4      // t3 = tmp4
2656    add         t4, t8, t2      // t4 = tmp10 = tmp0 + tmp3
2657    sub         t8, t8, t2      // t8 = tmp13 = tmp0 - tmp3
2658    sub         s0, t0, t6      // s0 = tmp12 = tmp1 - tmp2
2659    ins         t8, s0, 16, 16  // t8 = tmp12|tmp13
2660    add         t2, t0, t6      // t2 = tmp11 = tmp1 + tmp2
2661    mult        $0, $0          // ac0  = 0
2662    dpa.w.ph    $ac0, t8, s1    // ac0 += t12*181 + t13*181
2663    add         s0, t4, t2      // t8 = tmp10+tmp11
2664    sub         t4, t4, t2      // t4 = tmp10-tmp11
2665    sh          s0, 0(v0)
2666    sh          t4, 64(v0)
2667    extr.w      t2, $ac0, 8     // z1 = MULTIPLY(tmp12+tmp13, FIX_0_707106781)
2668    addq.ph     t4, t8, t2      // t9 = tmp13 + z1
2669    subq.ph     t8, t8, t2      // t2 = tmp13 - z1
2670    sh          t4, 32(v0)
2671    sh          t8, 96(v0)
2672    add         t3, t3, t5      // t3 = tmp10 = tmp4 + tmp5
2673    add         t0, t5, t1      // t0 = tmp11 = tmp5 + tmp6
2674    add         t1, t1, t7      // t1 = tmp12 = tmp6 + tmp7
2675    andi        t4, a1, 0xffff
2676    mul         s0, t1, t4
2677    sra         s0, s0, 8       // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
2678    ins         t1, t3, 16, 16  // t1 = tmp10|tmp12
2679    mult        $0, $0          // ac0  = 0
2680    mulsa.w.ph  $ac0, t1, a3    // ac0 += t10*98 - t12*98
2681    extr.w      t8, $ac0, 8     // z5 = MULTIPLY(tmp10-tmp12, FIX_0_382683433)
2682    add         t2, t7, t8      // t2 = tmp7 + z5
2683    sub         t7, t7, t8      // t7 = tmp7 - z5
2684    andi        t4, a2, 0xffff
2685    mul         t8, t3, t4
2686    sra         t8, t8, 8       // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
2687    andi        t4, s1, 0xffff
2688    mul         t6, t0, t4
2689    sra         t6, t6, 8       // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
2690    add         t0, t6, t8      // t0 = z3 + z2
2691    sub         t1, t6, t8      // t1 = z3 - z2
2692    add         t3, t6, s0      // t3 = z3 + z4
2693    sub         t4, t6, s0      // t4 = z3 - z4
2694    sub         t5, t2, t1      // t5 = dataptr[5]
2695    sub         t6, t7, t0      // t6 = dataptr[3]
2696    add         t3, t2, t3      // t3 = dataptr[1]
2697    add         t4, t7, t4      // t4 = dataptr[7]
2698    sh          t5, 80(v0)
2699    sh          t6, 48(v0)
2700    sh          t3, 16(v0)
2701    sh          t4, 112(v0)
2702    addiu       v0, 2
2703    bne         v0, v1, 1b
2704     nop
2705
2706    RESTORE_REGS_FROM_STACK 8, s0, s1
2707
2708    j           ra
2709     nop
2710END(jsimd_fdct_ifast_dspr2)
2711
2712
2713/*****************************************************************************/
LEAF_DSPR2(jsimd_quantize_dspr2)
/*
 * Quantize one 8x8 block of integer DCT coefficients.
 *
 * a0 = coef_block  (out: 64 quantized JCOEF halfwords)
 * a1 = divisors    (divisor table; per coefficient this reads halfwords at
 *                   +0 (reciprocal), +128 (correction) and +384 (shift) --
 *                   presumably the reciprocal/correction/shift rows of the
 *                   C divisor table; verify against jquant layout)
 * a2 = workspace   (in: 64 halfword coefficients from the forward DCT)
 *
 * Two coefficients are handled per iteration.  Each value is folded to its
 * absolute value by multiplying with (sign*2 + 1) = +/-1, scaled by the
 * reciprocal after adding the correction, shifted right by (shift + 16),
 * and the sign is restored with a second +/-1 multiply.
 */
    .set at

    SAVE_REGS_ON_STACK 16, s0, s1, s2

    addiu       v0, a2, 124     // v0 = workspace_end (last coefficient pair)
    lh          t0, 0(a2)       // t0 = workspace[0]
    lh          t1, 0(a1)       // t1 = reciprocal (even coef)
    lh          t2, 128(a1)     // t2 = correction (even coef)
    sra         t3, t0, 15      // t3 = 0 or -1 (sign bit smeared)
    sll         t3, t3, 1
    addiu       t3, t3, 1       // t3 = +1 or -1
    mul         t0, t0, t3      // t0 = abs(workspace[0])
    lh          t4, 384(a1)     // t4 = shift (even coef)
    lh          t5, 130(a1)     // t5 = correction (odd coef)
    lh          t6, 2(a2)       // t6 = workspace[1]
    lh          t7, 2(a1)       // t7 = reciprocal (odd coef)
    lh          t8, 386(a1)     // t8 = shift (odd coef)

1:
    andi        t1, 0xffff      // treat reciprocal as unsigned 16-bit
    add         t9, t0, t2      // t9 = abs(value) + correction
    andi        t9, 0xffff
    mul         v1, t9, t1      // v1 = (abs(value) + corr) * reciprocal
    sra         s0, t6, 15      // sign of odd coef -> +/-1 in s0
    sll         s0, s0, 1
    addiu       s0, s0, 1
    addiu       t9, t4, 16      // total right shift = shift + 16
    srav        v1, v1, t9
    mul         v1, v1, t3      // restore sign of even coef
    mul         t6, t6, s0      // t6 = abs(workspace[1])
    andi        t7, 0xffff
    addiu       a2, a2, 4       // advance workspace by one pair
    addiu       a1, a1, 4       // advance divisor pointer
    add         s1, t6, t5      // abs(value) + correction (odd coef)
    andi        s1, 0xffff
    sh          v1, 0(a0)       // store quantized even coef

    mul         s2, s1, t7      // odd coef * reciprocal
    addiu       s1, t8, 16      // total right shift = shift + 16
    srav        s2, s2, s1
    mul         s2, s2, s0      // restore sign of odd coef
    lh          t0, 0(a2)       // preload next pair: workspace value,
    lh          t1, 0(a1)       //   reciprocal,
    sra         t3, t0, 15      //   sign -> +/-1,
    sll         t3, t3, 1
    addiu       t3, t3, 1
    mul         t0, t0, t3      //   abs(value),
    lh          t2, 128(a1)     //   correction,
    lh          t4, 384(a1)     //   shift,
    lh          t5, 130(a1)     //   odd-coef correction,
    lh          t8, 386(a1)     //   odd-coef shift,
    lh          t6, 2(a2)       //   odd workspace value,
    lh          t7, 2(a1)       //   odd reciprocal
    sh          s2, 2(a0)       // store quantized odd coef
    lh          t0, 0(a2)       // NOTE(review): t0/t3 are recomputed here,
    sra         t3, t0, 15      //   duplicating the loads just above --
    sll         t3, t3, 1       //   redundant but harmless; kept as-is
    addiu       t3, t3, 1
    mul         t0, t0, t3
    bne         a2, v0, 1b
     addiu      a0, a0, 4       // (delay slot) advance output pointer

    // Loop epilogue: the branch falls through with the last pair already
    // preloaded, so quantize and store that final pair here.
    andi        t1, 0xffff
    add         t9, t0, t2
    andi        t9, 0xffff
    mul         v1, t9, t1
    sra         s0, t6, 15
    sll         s0, s0, 1
    addiu       s0, s0, 1
    addiu       t9, t4, 16
    srav        v1, v1, t9
    mul         v1, v1, t3
    mul         t6, t6, s0
    andi        t7, 0xffff
    sh          v1, 0(a0)
    add         s1, t6, t5
    andi        s1, 0xffff
    mul         s2, s1, t7
    addiu       s1, t8, 16
    addiu       a2, a2, 4
    addiu       a1, a1, 4
    srav        s2, s2, s1
    mul         s2, s2, s0
    sh          s2, 2(a0)

    RESTORE_REGS_FROM_STACK 16, s0, s1, s2

    j           ra
     nop

END(jsimd_quantize_dspr2)
2811
2812
2813#ifndef __mips_soft_float
2814
2815/*****************************************************************************/
LEAF_DSPR2(jsimd_quantize_float_dspr2)
/*
 * Float quantization: 64 coefficients, 8 per loop iteration.
 *
 * a0 = coef_block  (out: 64 halfword coefficients)
 * a1 = divisors    (in: 64 single-precision reciprocal divisors)
 * a2 = workspace   (in: 64 single-precision DCT coefficients)
 *
 * Each output is trunc(value * divisor + 16384.5) - 16384: the +16384.5
 * bias keeps the intermediate positive and turns truncation toward zero
 * into round-to-nearest; the bias is removed again in integer arithmetic.
 */
    .set at

    li          t1, 0x46800100  // single-precision bit pattern of 16384.5
    mtc1        t1, f0
    li          t0, 63          // 64 coefficients, counted down 8 at a time
0:
    lwc1        f2, 0(a2)
    lwc1        f10, 0(a1)
    lwc1        f4, 4(a2)
    lwc1        f12, 4(a1)
    lwc1        f6, 8(a2)
    lwc1        f14, 8(a1)
    lwc1        f8, 12(a2)
    lwc1        f16, 12(a1)
    madd.s      f2, f0, f2, f10 // f2 = value * divisor + 16384.5
    madd.s      f4, f0, f4, f12
    madd.s      f6, f0, f6, f14
    madd.s      f8, f0, f8, f16
    lwc1        f10, 16(a1)     // preload divisors for the second group
    lwc1        f12, 20(a1)
    trunc.w.s   f2, f2          // truncate; biased value is non-negative
    trunc.w.s   f4, f4
    trunc.w.s   f6, f6
    trunc.w.s   f8, f8
    lwc1        f14, 24(a1)
    lwc1        f16, 28(a1)
    mfc1        t1, f2
    mfc1        t2, f4
    mfc1        t3, f6
    mfc1        t4, f8
    lwc1        f2, 16(a2)      // second group of four coefficients
    lwc1        f4, 20(a2)
    lwc1        f6, 24(a2)
    lwc1        f8, 28(a2)
    madd.s      f2, f0, f2, f10
    madd.s      f4, f0, f4, f12
    madd.s      f6, f0, f6, f14
    madd.s      f8, f0, f8, f16
    addiu       t1, t1, -16384  // remove the rounding bias
    addiu       t2, t2, -16384
    addiu       t3, t3, -16384
    addiu       t4, t4, -16384
    trunc.w.s   f2, f2
    trunc.w.s   f4, f4
    trunc.w.s   f6, f6
    trunc.w.s   f8, f8
    sh          t1, 0(a0)
    sh          t2, 2(a0)
    sh          t3, 4(a0)
    sh          t4, 6(a0)
    mfc1        t1, f2
    mfc1        t2, f4
    mfc1        t3, f6
    mfc1        t4, f8
    addiu       t0, t0, -8      // eight coefficients done this pass
    addiu       a2, a2, 32
    addiu       a1, a1, 32
    addiu       t1, t1, -16384
    addiu       t2, t2, -16384
    addiu       t3, t3, -16384
    addiu       t4, t4, -16384
    sh          t1, 8(a0)
    sh          t2, 10(a0)
    sh          t3, 12(a0)
    sh          t4, 14(a0)
    bgez        t0, 0b
     addiu      a0, a0, 16      // (delay slot) advance output pointer

    j           ra
     nop

END(jsimd_quantize_float_dspr2)
2894
2895#endif
2896
2897
2898/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_2x2_dspr2)
/*
 * Reduced-size (2x2) inverse DCT.
 *
 * a0 = compptr->dct_table  (halfword quantization multipliers)
 * a1 = coef_block          (halfword DCT coefficients)
 * a2 = output_buf          (array of row pointers)
 * a3 = output_col
 *
 * Pass 1 walks five odd-symmetric column groups, dequantizing and folding
 * them with the packed constant pairs in s0/s1 via $ac0, and stores the
 * intermediates in a 40-byte scratch area on the stack (v0 = sp).
 * Pass 2 combines the intermediates with the scalar constants s2..s5,
 * descales, saturates to 8 bits (shll_s.w/sra by 24) and level-shifts by
 * +128 before storing the 2x2 output pixels.
 */
    .set at

    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5

    addiu       sp, sp, -40     // 40-byte scratch workspace, v0 points at it
    move        v0, sp
    addiu       s2, zero, 29692 // pass-2 multiplier constants
    addiu       s3, zero, -10426
    addiu       s4, zero, 6967
    addiu       s5, zero, -5906
    lh          t0, 0(a1)       // t0 = inptr[DCTSIZE*0]
    lh          t5, 0(a0)       // t5 = quantptr[DCTSIZE*0]
    lh          t1, 48(a1)      // t1 = inptr[DCTSIZE*3]
    lh          t6, 48(a0)      // t6 = quantptr[DCTSIZE*3]
    mul         t4, t5, t0      // t4 = dequantized DC
    lh          t0, 16(a1)      // t0 = inptr[DCTSIZE*1]
    lh          t5, 16(a0)      // t5 = quantptr[DCTSIZE*1]
    mul         t6, t6, t1
    mul         t5, t5, t0
    lh          t2, 80(a1)      // t2 = inptr[DCTSIZE*5]
    lh          t7, 80(a0)      // t7 = quantptr[DCTSIZE*5]
    lh          t3, 112(a1)     // t3 = inptr[DCTSIZE*7]
    lh          t8, 112(a0)     // t8 = quantptr[DCTSIZE*7]
    mul         t7, t7, t2
    mult        zero, zero      // clear $ac0 for the dot products
    mul         t8, t8, t3
    li          s0, 0x73FCD746  // s0 = (29692 << 16) | (-10426 & 0xffff)
    li          s1, 0x1B37E8EE  // s1 = (6967 << 16) | (-5906 & 0xffff)
    ins         t6, t5, 16, 16  // t6 = t5|t6 (packed halfword pair)
    sll         t4, t4, 15
    dpa.w.ph    $ac0, t6, s0
    lh          t1, 2(a1)
    lh          t6, 2(a0)
    ins         t8, t7, 16, 16  // t8 = t7|t8
    dpa.w.ph    $ac0, t8, s1
    mflo        t0, $ac0
    mul         t5, t6, t1
    lh          t1, 18(a1)
    lh          t6, 18(a0)
    lh          t2, 50(a1)
    lh          t7, 50(a0)
    mul         t6, t6, t1
    subu        t8, t4, t0
    mul         t7, t7, t2
    addu        t0, t4, t0
    shra_r.w    t0, t0, 13      // rounding descale by 13
    lh          t1, 82(a1)
    lh          t2, 82(a0)
    lh          t3, 114(a1)
    lh          t4, 114(a0)
    shra_r.w    t8, t8, 13
    mul         t1, t1, t2
    mul         t3, t3, t4
    sw          t0, 0(v0)       // workspace column 0 (sum / difference)
    sw          t8, 20(v0)
    sll         t4, t5, 15
    ins         t7, t6, 16, 16
    mult        zero, zero
    dpa.w.ph    $ac0, t7, s0
    ins         t3, t1, 16, 16
    lh          t1, 6(a1)
    lh          t6, 6(a0)
    dpa.w.ph    $ac0, t3, s1
    mflo        t0, $ac0
    mul         t5, t6, t1
    lh          t1, 22(a1)
    lh          t6, 22(a0)
    lh          t2, 54(a1)
    lh          t7, 54(a0)
    mul         t6, t6, t1
    subu        t8, t4, t0
    mul         t7, t7, t2
    addu        t0, t4, t0
    shra_r.w    t0, t0, 13
    lh          t1, 86(a1)
    lh          t2, 86(a0)
    lh          t3, 118(a1)
    lh          t4, 118(a0)
    shra_r.w    t8, t8, 13
    mul         t1, t1, t2
    mul         t3, t3, t4
    sw          t0, 4(v0)       // workspace column 1
    sw          t8, 24(v0)
    sll         t4, t5, 15
    ins         t7, t6, 16, 16
    mult        zero, zero
    dpa.w.ph    $ac0, t7, s0
    ins         t3, t1, 16, 16
    lh          t1, 10(a1)
    lh          t6, 10(a0)
    dpa.w.ph    $ac0, t3, s1
    mflo        t0, $ac0
    mul         t5, t6, t1
    lh          t1, 26(a1)
    lh          t6, 26(a0)
    lh          t2, 58(a1)
    lh          t7, 58(a0)
    mul         t6, t6, t1
    subu        t8, t4, t0
    mul         t7, t7, t2
    addu        t0, t4, t0
    shra_r.w    t0, t0, 13
    lh          t1, 90(a1)
    lh          t2, 90(a0)
    lh          t3, 122(a1)
    lh          t4, 122(a0)
    shra_r.w    t8, t8, 13
    mul         t1, t1, t2
    mul         t3, t3, t4
    sw          t0, 8(v0)       // workspace column 2
    sw          t8, 28(v0)
    sll         t4, t5, 15
    ins         t7, t6, 16, 16
    mult        zero, zero
    dpa.w.ph    $ac0, t7, s0
    ins         t3, t1, 16, 16
    lh          t1, 14(a1)
    lh          t6, 14(a0)
    dpa.w.ph    $ac0, t3, s1
    mflo        t0, $ac0
    mul         t5, t6, t1
    lh          t1, 30(a1)
    lh          t6, 30(a0)
    lh          t2, 62(a1)
    lh          t7, 62(a0)
    mul         t6, t6, t1
    subu        t8, t4, t0
    mul         t7, t7, t2
    addu        t0, t4, t0
    shra_r.w    t0, t0, 13
    lh          t1, 94(a1)
    lh          t2, 94(a0)
    lh          t3, 126(a1)
    lh          t4, 126(a0)
    shra_r.w    t8, t8, 13
    mul         t1, t1, t2
    mul         t3, t3, t4
    sw          t0, 12(v0)      // workspace column 3
    sw          t8, 32(v0)
    sll         t4, t5, 15
    ins         t7, t6, 16, 16
    mult        zero, zero
    dpa.w.ph    $ac0, t7, s0
    ins         t3, t1, 16, 16
    dpa.w.ph    $ac0, t3, s1
    mflo        t0, $ac0
    // Pass 2: combine workspace values into the 2x2 output.
    lw          t9, 0(a2)       // first output row pointer
    lw          t3, 0(v0)
    lw          t7, 4(v0)
    lw          t1, 8(v0)
    addu        t9, t9, a3
    sll         t3, t3, 15
    subu        t8, t4, t0
    addu        t0, t4, t0
    shra_r.w    t0, t0, 13
    shra_r.w    t8, t8, 13
    sw          t0, 16(v0)      // workspace column 4
    sw          t8, 36(v0)
    lw          t5, 12(v0)
    lw          t6, 16(v0)
    mult        t7, s2          // row 0 odd part: dot with s2..s5
    madd        t1, s3
    madd        t5, s4
    madd        t6, s5
    lw          t5, 24(v0)
    lw          t7, 28(v0)
    mflo        t0, $ac0
    lw          t8, 32(v0)
    lw          t2, 36(v0)
    mult        $ac1, t5, s2    // row 1 odd part in $ac1
    madd        $ac1, t7, s3
    madd        $ac1, t8, s4
    madd        $ac1, t2, s5
    addu        t1, t3, t0
    subu        t6, t3, t0
    shra_r.w    t1, t1, 20      // final rounding descale
    shra_r.w    t6, t6, 20
    mflo        t4, $ac1
    shll_s.w    t1, t1, 24      // saturate to signed 8-bit range
    shll_s.w    t6, t6, 24
    sra         t1, t1, 24
    sra         t6, t6, 24
    addiu       t1, t1, 128     // level shift to unsigned sample
    addiu       t6, t6, 128
    lw          t0, 20(v0)
    sb          t1, 0(t9)
    sb          t6, 1(t9)
    sll         t0, t0, 15
    lw          t9, 4(a2)       // second output row pointer
    addu        t1, t0, t4
    subu        t6, t0, t4
    addu        t9, t9, a3
    shra_r.w    t1, t1, 20
    shra_r.w    t6, t6, 20
    shll_s.w    t1, t1, 24
    shll_s.w    t6, t6, 24
    sra         t1, t1, 24
    sra         t6, t6, 24
    addiu       t1, t1, 128
    addiu       t6, t6, 128
    sb          t1, 0(t9)
    sb          t6, 1(t9)
    addiu       sp, sp, 40      // release scratch workspace

    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5

    j           ra
     nop

END(jsimd_idct_2x2_dspr2)
3116
3117
3118/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_4x4_dspr2)
/*
 * Reduced-size (4x4) inverse DCT.
 *
 * a0     = compptr->dct_table  (halfword quantization multipliers)
 * a1     = coef_block          (halfword DCT coefficients)
 * a2     = output_buf          (array of row pointers)
 * a3     = output_col
 * 16(sp) = workspace[DCTSIZE*4]  // buffers data between passes
 *
 * Pass 1 (two loops below) dequantizes columns and writes 32-bit
 * intermediates into the caller-supplied workspace; the odd part is
 * computed with packed-halfword dot products against s0..s3 in $ac0/$ac1.
 * Pass 2 (four unrolled sections) transforms each workspace row,
 * saturates to 8 bits and stores 4 pixels per output row.
 */
    .set at

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw          v1, 48(sp)      // v1 = workspace (after SAVE_REGS adjust)
    move        t0, a1
    move        t1, v1
    li          t9, 4           // first loop: 4 iterations
    li          s0, 0x2e75f93e  // packed odd-part constant pairs
    li          s1, 0x21f9ba79
    li          s2, 0xecc2efb0
    li          s3, 0x52031ccd

0:
    lh          s6, 32(t0)      // inptr[DCTSIZE*2]
    lh          t6, 32(a0)      // quantptr[DCTSIZE*2]
    lh          s7, 96(t0)      // inptr[DCTSIZE*6]
    lh          t7, 96(a0)      // quantptr[DCTSIZE*6]
    mul         t6, s6, t6      // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
    lh          s4, 0(t0)       // inptr[DCTSIZE*0]
    mul         t7, s7, t7      // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
    lh          s5, 0(a0)       // quantptr[0]
    li          s6, 15137
    li          s7, 6270
    mul         t2, s4, s5      // tmp0 = (inptr[0] * quantptr[0])
    mul         t6, s6, t6      // z2 *= 15137 (FIX_1_847759065)
    lh          t5, 112(t0)     // inptr[DCTSIZE*7]
    mul         t7, s7, t7      // z3 *= 6270 (FIX_0_765366865)
    lh          s4, 112(a0)     // quantptr[DCTSIZE*7]
    lh          v0, 80(t0)      // inptr[DCTSIZE*5]
    lh          s5, 80(a0)      // quantptr[DCTSIZE*5]
    lh          s6, 48(a0)      // quantptr[DCTSIZE*3]
    sll         t2, t2, 14      // tmp0 <<= (CONST_BITS+1)
    lh          s7, 16(a0)      // quantptr[DCTSIZE*1]
    lh          t8, 16(t0)      // inptr[DCTSIZE*1]
    subu        t6, t6, t7      // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
    lh          t7, 48(t0)      // inptr[DCTSIZE*3]
    mul         t5, s4, t5      // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
    mul         v0, s5, v0      // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
    mul         t7, s6, t7      // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
    mul         t8, s7, t8      // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
    addu        t3, t2, t6      // tmp10 = tmp0 + z2
    subu        t4, t2, t6      // tmp12 = tmp0 - z2
    mult        $ac0, zero, zero
    mult        $ac1, zero, zero
    ins         t5, v0, 16, 16  // pack z2|z1
    ins         t7, t8, 16, 16  // pack z4|z3
    addiu       t9, t9, -1
    dpa.w.ph    $ac0, t5, s0
    dpa.w.ph    $ac0, t7, s1
    dpa.w.ph    $ac1, t5, s2
    dpa.w.ph    $ac1, t7, s3
    mflo        s4, $ac0
    mflo        s5, $ac1
    addiu       a0, a0, 2
    addiu       t1, t1, 4
    addiu       t0, t0, 2
    addu        t6, t4, s4
    subu        t5, t4, s4
    addu        s6, t3, s5
    subu        s7, t3, s5
    shra_r.w    t6, t6, 12      // DESCALE(tmp12 + temp1, 12)
    shra_r.w    t5, t5, 12      // DESCALE(tmp12 - temp1, 12)
    shra_r.w    s6, s6, 12      // DESCALE(tmp10 + temp2, 12)
    shra_r.w    s7, s7, 12      // DESCALE(tmp10 - temp2, 12)
    sw          t6, 28(t1)
    sw          t5, 60(t1)
    sw          s6, -4(t1)
    bgtz        t9, 0b
     sw         s7, 92(t1)      // (delay slot) executes on every iteration
    // Second pass-1 loop: the remaining three columns (odd halfword offsets).
    li          t9, 3
1:
    lh          s6, 34(t0)      // inptr[DCTSIZE*2]
    lh          t6, 34(a0)      // quantptr[DCTSIZE*2]
    lh          s7, 98(t0)      // inptr[DCTSIZE*6]
    lh          t7, 98(a0)      // quantptr[DCTSIZE*6]
    mul         t6, s6, t6      // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
    lh          s4, 2(t0)       // inptr[DCTSIZE*0]
    mul         t7, s7, t7      // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
    lh          s5, 2(a0)       // quantptr[DCTSIZE*0]
    li          s6, 15137
    li          s7, 6270
    mul         t2, s4, s5      // tmp0 = (inptr[0] * quantptr[0])
    mul         v0, s6, t6      // z2 *= 15137
    lh          t5, 114(t0)     // inptr[DCTSIZE*7]
    mul         t7, s7, t7      // z3 *= 6270
    lh          s4, 114(a0)     // quantptr[DCTSIZE*7]
    lh          s5, 82(a0)      // quantptr[DCTSIZE*5]
    lh          t6, 82(t0)      // inptr[DCTSIZE*5]
    sll         t2, t2, 14      // tmp0 <<= (CONST_BITS+1)
    lh          s6, 50(a0)      // quantptr[DCTSIZE*3]
    lh          t8, 18(t0)      // inptr[DCTSIZE*1]
    subu        v0, v0, t7      // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
    lh          t7, 50(t0)      // inptr[DCTSIZE*3]
    lh          s7, 18(a0)      // quantptr[DCTSIZE*1]
    mul         t5, s4, t5      // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
    mul         t6, s5, t6      // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
    mul         t7, s6, t7      // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
    mul         t8, s7, t8      // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
    addu        t3, t2, v0      // tmp10 = tmp0 + z2
    subu        t4, t2, v0      // tmp12 = tmp0 - z2
    mult        $ac0, zero, zero
    mult        $ac1, zero, zero
    ins         t5, t6, 16, 16
    ins         t7, t8, 16, 16
    dpa.w.ph    $ac0, t5, s0
    dpa.w.ph    $ac0, t7, s1
    dpa.w.ph    $ac1, t5, s2
    dpa.w.ph    $ac1, t7, s3
    mflo        t5, $ac0
    mflo        t6, $ac1
    addiu       t9, t9, -1
    addiu       t0, t0, 2
    addiu       a0, a0, 2
    addiu       t1, t1, 4
    addu        s5, t4, t5
    subu        s4, t4, t5
    addu        s6, t3, t6
    subu        s7, t3, t6
    shra_r.w    s5, s5, 12      // DESCALE(tmp12 + temp1, 12)
    shra_r.w    s4, s4, 12      // DESCALE(tmp12 - temp1, 12)
    shra_r.w    s6, s6, 12      // DESCALE(tmp10 + temp2, 12)
    shra_r.w    s7, s7, 12      // DESCALE(tmp10 - temp2, 12)
    sw          s5, 32(t1)
    sw          s4, 64(t1)
    sw          s6, 0(t1)
    bgtz        t9, 1b
     sw         s7, 96(t1)      // (delay slot)
    // Pass 2, row 0.
    move        t1, v1
    li          s4, 15137
    lw          s6, 8(t1)       // wsptr[2]
    li          s5, 6270
    lw          s7, 24(t1)      // wsptr[6]
    mul         s4, s4, s6      // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
    lw          t2, 0(t1)       // wsptr[0]
    mul         s5, s5, s7      // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
    lh          t5, 28(t1)      // wsptr[7]
    lh          t6, 20(t1)      // wsptr[5]
    lh          t7, 12(t1)      // wsptr[3]
    lh          t8, 4(t1)       // wsptr[1]
    ins         t5, t6, 16, 16
    ins         t7, t8, 16, 16
    mult        $ac0, zero, zero
    dpa.w.ph    $ac0, t5, s0
    dpa.w.ph    $ac0, t7, s1
    mult        $ac1, zero, zero
    dpa.w.ph    $ac1, t5, s2
    dpa.w.ph    $ac1, t7, s3
    sll         t2, t2, 14      // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
    mflo        s6, $ac0
    // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
    subu        s4, s4, s5
    addu        t3, t2, s4      // tmp10 = tmp0 + z2
    mflo        s7, $ac1
    subu        t4, t2, s4      // tmp12 = tmp0 - z2
    addu        t7, t4, s6
    subu        t8, t4, s6
    addu        t5, t3, s7
    subu        t6, t3, s7
    shra_r.w    t5, t5, 19      // DESCALE(tmp10 + temp2, 19)
    shra_r.w    t6, t6, 19      // DESCALE(tmp10 - temp2, 19)
    shra_r.w    t7, t7, 19      // DESCALE(tmp12 + temp1, 19)
    shra_r.w    t8, t8, 19      // DESCALE(tmp12 - temp1, 19)
    sll         s4, t9, 2       // NOTE(review): dead -- s4 is reloaded below
    lw          v0, 0(a2)       // output_buf[ctr]
    shll_s.w    t5, t5, 24      // saturate to signed 8-bit range
    shll_s.w    t6, t6, 24
    shll_s.w    t7, t7, 24
    shll_s.w    t8, t8, 24
    sra         t5, t5, 24
    sra         t6, t6, 24
    sra         t7, t7, 24
    sra         t8, t8, 24
    addu        v0, v0, a3      // outptr = output_buf[ctr] + output_col
    addiu       t5, t5, 128     // level shift to unsigned sample
    addiu       t6, t6, 128
    addiu       t7, t7, 128
    addiu       t8, t8, 128
    sb          t5, 0(v0)
    sb          t7, 1(v0)
    sb          t8, 2(v0)
    sb          t6, 3(v0)
    // Pass 2, row 1.
    li          s4, 15137
    lw          s6, 40(t1)      // wsptr[2]
    li          s5, 6270
    lw          s7, 56(t1)      // wsptr[6]
    mul         s4, s4, s6      // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
    lw          t2, 32(t1)      // wsptr[0]
    mul         s5, s5, s7      // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
    lh          t5, 60(t1)      // wsptr[7]
    lh          t6, 52(t1)      // wsptr[5]
    lh          t7, 44(t1)      // wsptr[3]
    lh          t8, 36(t1)      // wsptr[1]
    ins         t5, t6, 16, 16
    ins         t7, t8, 16, 16
    mult        $ac0, zero, zero
    dpa.w.ph    $ac0, t5, s0
    dpa.w.ph    $ac0, t7, s1
    mult        $ac1, zero, zero
    dpa.w.ph    $ac1, t5, s2
    dpa.w.ph    $ac1, t7, s3
    sll         t2, t2, 14      // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
    mflo        s6, $ac0
    // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
    subu        s4, s4, s5
    addu        t3, t2, s4      // tmp10 = tmp0 + z2
    mflo        s7, $ac1
    subu        t4, t2, s4      // tmp12 = tmp0 - z2
    addu        t7, t4, s6
    subu        t8, t4, s6
    addu        t5, t3, s7
    subu        t6, t3, s7
    shra_r.w    t5, t5, 19      // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1)
    shra_r.w    t6, t6, 19      // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1)
    shra_r.w    t7, t7, 19      // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1)
    shra_r.w    t8, t8, 19      // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1)
    sll         s4, t9, 2
    lw          v0, 4(a2)       // output_buf[ctr]
    shll_s.w    t5, t5, 24
    shll_s.w    t6, t6, 24
    shll_s.w    t7, t7, 24
    shll_s.w    t8, t8, 24
    sra         t5, t5, 24
    sra         t6, t6, 24
    sra         t7, t7, 24
    sra         t8, t8, 24
    addu        v0, v0, a3      // outptr = output_buf[ctr] + output_col
    addiu       t5, t5, 128
    addiu       t6, t6, 128
    addiu       t7, t7, 128
    addiu       t8, t8, 128
    sb          t5, 0(v0)
    sb          t7, 1(v0)
    sb          t8, 2(v0)
    sb          t6, 3(v0)
    // Pass 2, row 2.
    li          s4, 15137
    lw          s6, 72(t1)      // wsptr[2]
    li          s5, 6270
    lw          s7, 88(t1)      // wsptr[6]
    mul         s4, s4, s6      // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
    lw          t2, 64(t1)      // wsptr[0]
    mul         s5, s5, s7      // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
    lh          t5, 92(t1)      // wsptr[7]
    lh          t6, 84(t1)      // wsptr[5]
    lh          t7, 76(t1)      // wsptr[3]
    lh          t8, 68(t1)      // wsptr[1]
    ins         t5, t6, 16, 16
    ins         t7, t8, 16, 16
    mult        $ac0, zero, zero
    dpa.w.ph    $ac0, t5, s0
    dpa.w.ph    $ac0, t7, s1
    mult        $ac1, zero, zero
    dpa.w.ph    $ac1, t5, s2
    dpa.w.ph    $ac1, t7, s3
    sll         t2, t2, 14      // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
    mflo        s6, $ac0
    // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
    subu        s4, s4, s5
    addu        t3, t2, s4      // tmp10 = tmp0 + z2
    mflo        s7, $ac1
    subu        t4, t2, s4      // tmp12 = tmp0 - z2
    addu        t7, t4, s6
    subu        t8, t4, s6
    addu        t5, t3, s7
    subu        t6, t3, s7
    shra_r.w    t5, t5, 19      // DESCALE(tmp10 + temp2, 19)
    shra_r.w    t6, t6, 19      // DESCALE(tmp10 - temp2, 19)
    shra_r.w    t7, t7, 19      // DESCALE(tmp12 + temp1, 19)
    shra_r.w    t8, t8, 19      // DESCALE(tmp12 - temp1, 19)
    sll         s4, t9, 2
    lw          v0, 8(a2)       // output_buf[ctr]
    shll_s.w    t5, t5, 24
    shll_s.w    t6, t6, 24
    shll_s.w    t7, t7, 24
    shll_s.w    t8, t8, 24
    sra         t5, t5, 24
    sra         t6, t6, 24
    sra         t7, t7, 24
    sra         t8, t8, 24
    addu        v0, v0, a3      // outptr = output_buf[ctr] + output_col
    addiu       t5, t5, 128
    addiu       t6, t6, 128
    addiu       t7, t7, 128
    addiu       t8, t8, 128
    sb          t5, 0(v0)
    sb          t7, 1(v0)
    sb          t8, 2(v0)
    sb          t6, 3(v0)
    // Pass 2, row 3.
    li          s4, 15137
    lw          s6, 104(t1)     // wsptr[2]
    li          s5, 6270
    lw          s7, 120(t1)     // wsptr[6]
    mul         s4, s4, s6      // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
    lw          t2, 96(t1)      // wsptr[0]
    mul         s5, s5, s7      // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
    lh          t5, 124(t1)     // wsptr[7]
    lh          t6, 116(t1)     // wsptr[5]
    lh          t7, 108(t1)     // wsptr[3]
    lh          t8, 100(t1)     // wsptr[1]
    ins         t5, t6, 16, 16
    ins         t7, t8, 16, 16
    mult        $ac0, zero, zero
    dpa.w.ph    $ac0, t5, s0
    dpa.w.ph    $ac0, t7, s1
    mult        $ac1, zero, zero
    dpa.w.ph    $ac1, t5, s2
    dpa.w.ph    $ac1, t7, s3
    sll         t2, t2, 14      // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
    mflo        s6, $ac0
    // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
    subu        s4, s4, s5
    addu        t3, t2, s4      // tmp10 = tmp0 + z2;
    mflo        s7, $ac1
    subu        t4, t2, s4      // tmp12 = tmp0 - z2;
    addu        t7, t4, s6
    subu        t8, t4, s6
    addu        t5, t3, s7
    subu        t6, t3, s7
    shra_r.w    t5, t5, 19      // DESCALE(tmp10 + temp2, 19)
    shra_r.w    t6, t6, 19      // DESCALE(tmp10 - temp2, 19)
    shra_r.w    t7, t7, 19      // DESCALE(tmp12 + temp1, 19)
    shra_r.w    t8, t8, 19      // DESCALE(tmp12 - temp1, 19)
    sll         s4, t9, 2
    lw          v0, 12(a2)      // output_buf[ctr]
    shll_s.w    t5, t5, 24
    shll_s.w    t6, t6, 24
    shll_s.w    t7, t7, 24
    shll_s.w    t8, t8, 24
    sra         t5, t5, 24
    sra         t6, t6, 24
    sra         t7, t7, 24
    sra         t8, t8, 24
    addu        v0, v0, a3      // outptr = output_buf[ctr] + output_col
    addiu       t5, t5, 128
    addiu       t6, t6, 128
    addiu       t7, t7, 128
    addiu       t8, t8, 128
    sb          t5, 0(v0)
    sb          t7, 1(v0)
    sb          t8, 2(v0)
    sb          t6, 3(v0)

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_idct_4x4_dspr2)
3477
3478
3479/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_6x6_dspr2)
/*
 * 6x6 reduced-size inverse DCT (DSPr2).
 *
 * a0 = compptr->dct_table
 * a1 = coef_block
 * a2 = output_buf
 * a3 = output_col
 *
 * Pass 1 transforms the 6 columns of the dequantized coefficient block
 * into a 6x6 int workspace allocated on the stack (144 bytes, row stride
 * 24 bytes).  Pass 2 transforms the 6 workspace rows and stores clamped
 * 8-bit samples into the output rows.
 */
    .set at

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    addiu       sp, sp, -144    // allocate 6x6 int workspace
    move        v0, sp          // v0 = workspace column pointer
    addiu       v1, v0, 24      // v1 = loop end (6 columns * 4 bytes)
    addiu       t9, zero, 5793  // FIX(0.707106781)
    addiu       s0, zero, 10033 // FIX(1.224744871)
    addiu       s1, zero, 2998  // FIX(0.366025404)

    /* Pass 1: process columns from input, store into work array. */
1:
    lh          s2, 0(a0)       // q0 = quantptr[ 0]
    lh          s3, 32(a0)      // q1 = quantptr[16]
    lh          s4, 64(a0)      // q2 = quantptr[32]
    lh          t2, 64(a1)      // tmp2 = inptr[32]
    lh          t1, 32(a1)      // tmp1 = inptr[16]
    lh          t0, 0(a1)       // tmp0 = inptr[ 0]
    mul         t2, t2, s4      // tmp2 = tmp2 * q2
    mul         t1, t1, s3      // tmp1 = tmp1 * q1
    mul         t0, t0, s2      // tmp0 = tmp0 * q0
    lh          t6, 16(a1)      // z1 = inptr[ 8]
    lh          t8, 80(a1)      // z3 = inptr[40]
    lh          t7, 48(a1)      // z2 = inptr[24]
    lh          s2, 16(a0)      // q0 = quantptr[ 8]
    lh          s4, 80(a0)      // q2 = quantptr[40]
    lh          s3, 48(a0)      // q1 = quantptr[24]
    mul         t2, t2, t9      // tmp2 = tmp2 * 5793
    mul         t1, t1, s0      // tmp1 = tmp1 * 10033
    sll         t0, t0, 13      // tmp0 = tmp0 << 13
    mul         t6, t6, s2      // z1 = z1 * q0
    mul         t8, t8, s4      // z3 = z3 * q2
    mul         t7, t7, s3      // z2 = z2 * q1
    addu        t3, t0, t2      // tmp10 = tmp0 + tmp2
    sll         t2, t2, 1       // tmp2 = tmp2 << 1 (so tmp11 = tmp0 - 2*tmp2)
    subu        t4, t0, t2      // tmp11 = tmp0 - tmp2;
    subu        t5, t3, t1      // tmp12 = tmp10 - tmp1
    addu        t3, t3, t1      // tmp10 = tmp10 + tmp1
    addu        t1, t6, t8      // tmp1 = z1 + z3
    mul         t1, t1, s1      // tmp1 = tmp1 * 2998
    shra_r.w    t4, t4, 11      // tmp11 = (tmp11 + 1024) >> 11
    subu        t2, t6, t8      // tmp2 = z1 - z3
    subu        t2, t2, t7      // tmp2 = tmp2 - z2
    sll         t2, t2, 2       // tmp2 = tmp2 << 2
    addu        t0, t6, t7      // tmp0 = z1 + z2
    sll         t0, t0, 13      // tmp0 = tmp0 << 13
    subu        s2, t8, t7      // q0 = z3 - z2
    sll         s2, s2, 13      // q0 = q0 << 13
    addu        t0, t0, t1      // tmp0 = tmp0 + tmp1
    addu        t1, s2, t1      // tmp1 = q0 + tmp1
    addu        s2, t4, t2      // q0 = tmp11 + tmp2
    subu        s3, t4, t2      // q1 = tmp11 - tmp2
    addu        t6, t3, t0      // z1 = tmp10 + tmp0
    subu        t7, t3, t0      // z2 = tmp10 - tmp0
    addu        t4, t5, t1      // tmp11 = tmp12 + tmp1
    subu        t5, t5, t1      // tmp12 = tmp12 - tmp1
    shra_r.w    t6, t6, 11      // z1 = (z1 + 1024) >> 11
    shra_r.w    t7, t7, 11      // z2 = (z2 + 1024) >> 11
    shra_r.w    t4, t4, 11      // tmp11 = (tmp11 + 1024) >> 11
    shra_r.w    t5, t5, 11      // tmp12 = (tmp12 + 1024) >> 11
    sw          s2, 24(v0)      // workspace row 1 (row stride = 24 bytes)
    sw          s3, 96(v0)      // workspace row 4
    sw          t6, 0(v0)       // workspace row 0
    sw          t7, 120(v0)     // workspace row 5
    sw          t4, 48(v0)      // workspace row 2
    sw          t5, 72(v0)      // workspace row 3
    addiu       v0, v0, 4       // advance to next workspace column
    addiu       a1, a1, 2       // next input column
    bne         v0, v1, 1b
     addiu      a0, a0, 2       // (delay slot) next quant column

    /* Pass 2: process 6 rows from work array, store into output array. */
    move        v0, sp
    addiu       v1, v0, 144     // v1 = end of workspace (6 rows * 24 bytes)

2:
    lw          t0, 0(v0)       // wsptr[0]
    lw          t2, 16(v0)      // wsptr[4]
    lw          s5, 0(a2)       // outptr = output_buf[row]
    addiu       t0, t0, 16      // add rounding bias for final descale
    sll         t0, t0, 13
    mul         t3, t2, t9      // MULTIPLY(wsptr[4], FIX(0.707106781))
    lw          t6, 4(v0)       // wsptr[1]
    lw          t8, 20(v0)      // wsptr[5]
    lw          t7, 12(v0)      // wsptr[3]
    addu        s5, s5, a3      // outptr += output_col
    addu        s6, t6, t8
    mul         s6, s6, s1      // MULTIPLY(wsptr[1] + wsptr[5], FIX(0.366025404))
    addu        t1, t0, t3      // tmp10
    subu        t4, t0, t3
    subu        t4, t4, t3      // tmp11 = tmp0 - 2*tmp2
    lw          t3, 8(v0)       // wsptr[2]
    mul         t0, t3, s0      // MULTIPLY(wsptr[2], FIX(1.224744871))
    addu        s7, t6, t7
    sll         s7, s7, 13
    addu        s7, s6, s7
    subu        t2, t8, t7
    sll         t2, t2, 13
    addu        t2, s6, t2
    subu        s6, t6, t7
    subu        s6, s6, t8
    sll         s6, s6, 13
    addu        t3, t1, t0
    subu        t5, t1, t0
    addu        t6, t3, s7      // -> outptr[0]
    subu        t3, t3, s7      // -> outptr[5]
    addu        t7, t4, s6      // -> outptr[1]
    subu        t4, t4, s6      // -> outptr[4]
    addu        t8, t5, t2      // -> outptr[2]
    subu        t5, t5, t2      // -> outptr[3]
    /* Saturating << 6 followed by >> 24 descales with clamping; the
       subsequent +128 re-centers the sample before the byte store. */
    shll_s.w    t6, t6, 6
    shll_s.w    t3, t3, 6
    shll_s.w    t7, t7, 6
    shll_s.w    t4, t4, 6
    shll_s.w    t8, t8, 6
    shll_s.w    t5, t5, 6
    sra         t6, t6, 24
    addiu       t6, t6, 128
    sra         t3, t3, 24
    addiu       t3, t3, 128
    sb          t6, 0(s5)
    sra         t7, t7, 24
    addiu       t7, t7, 128
    sb          t3, 5(s5)
    sra         t4, t4, 24
    addiu       t4, t4, 128
    sb          t7, 1(s5)
    sra         t8, t8, 24
    addiu       t8, t8, 128
    sb          t4, 4(s5)
    addiu       v0, v0, 24      // advance to next workspace row
    sra         t5, t5, 24
    addiu       t5, t5, 128
    sb          t8, 2(s5)
    addiu       a2, a2,  4      // next output row pointer
    bne         v0, v1, 2b
     sb         t5, 3(s5)       // (delay slot) store outptr[3]

    addiu       sp, sp, 144     // release workspace

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop

END(jsimd_idct_6x6_dspr2)
3633
3634
3635/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2)
/*
 * 12x12 inverse DCT, pass 1 (DSPr2): process the 8 coefficient columns
 * and store 12 intermediate rows into the int workspace.
 *
 * a0 = compptr->dct_table
 * a1 = coef_block
 * a2 = workspace (int[12*8]; row stride 32 bytes, one column written
 *      per iteration)
 */
    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    li          a3, 8           // column counter

1:
    // odd part
    lh          t0, 48(a1)
    lh          t1, 48(a0)
    lh          t2, 16(a1)
    lh          t3, 16(a0)
    lh          t4, 80(a1)
    lh          t5, 80(a0)
    lh          t6, 112(a1)
    lh          t7, 112(a0)
    mul         t0, t0, t1      // z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3])
    mul         t1, t2, t3      // z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1])
    mul         t2, t4, t5      // z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5])
    mul         t3, t6, t7      // z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7])
    li          t4, 10703       // FIX(1.306562965)
    li          t5, 4433        // FIX_0_541196100
    li          t6, 7053        // FIX(0.860918669)
    mul         t4, t0, t4      // tmp11
    mul         t5, t0, t5      // -tmp14
    addu        t7, t1, t2      // tmp10
    addu        t8, t7, t3      // tmp10 + z4
    mul         t6, t6, t8      // tmp15
    li          t8, 2139        // FIX(0.261052384)
    mul         t8, t7, t8      // MULTIPLY(tmp10, FIX(0.261052384))
    li          t7, 2295        // FIX(0.280143716)
    mul         t7, t1, t7      // MULTIPLY(z1, FIX(0.280143716))
    addu        t9, t2, t3      // z3 + z4
    li          s0, 8565        // FIX(1.045510580)
    mul         t9, t9, s0      // -tmp13
    li          s0, 12112       // FIX(1.478575242)
    mul         s0, t2, s0      // MULTIPLY(z3, FIX(1.478575242))
    li          s1, 12998       // FIX(1.586706681)
    mul         s1, t3, s1      // MULTIPLY(z4, FIX(1.586706681))
    li          s2, 5540        // FIX(0.676326758)
    mul         s2, t1, s2      // MULTIPLY(z1, FIX(0.676326758))
    li          s3, 16244       // FIX(1.982889723)
    mul         s3, t3, s3      // MULTIPLY(z4, FIX(1.982889723))
    subu        t1, t1, t3      // z1-=z4
    subu        t0, t0, t2      // z2-=z3
    addu        t2, t0, t1      // z1+z2
    li          t3, 4433        // FIX_0_541196100
    mul         t2, t2, t3      // z3
    li          t3, 6270        // FIX_0_765366865
    mul         t1, t1, t3      // MULTIPLY(z1, FIX_0_765366865)
    li          t3, 15137       // FIX_1_847759065
    mul         t0, t0, t3      // MULTIPLY(z2, FIX_1_847759065)
    addu        t8, t6, t8      // tmp12
    addu        t3, t8, t4      // tmp12 + tmp11
    addu        t3, t3, t7      // tmp10
    subu        t8, t8, t9      // tmp12 + tmp13
    addu        s0, t5, s0
    subu        t8, t8, s0      // tmp12
    subu        t9, t6, t9
    subu        s1, s1, t4
    addu        t9, t9, s1      // tmp13
    subu        t6, t6, t5
    subu        t6, t6, s2
    subu        t6, t6, s3      // tmp15
    // even part start
    lh          t4, 64(a1)
    lh          t5, 64(a0)
    lh          t7, 32(a1)
    lh          s0, 32(a0)
    lh          s1, 0(a1)
    lh          s2, 0(a0)
    lh          s3, 96(a1)
    lh          v0, 96(a0)
    mul         t4, t4, t5      // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4])
    mul         t5, t7, s0      // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2])
    mul         t7, s1, s2      // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0])
    mul         s0, s3, v0      // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6])
    // odd part end
    addu        t1, t2, t1      // tmp11
    subu        t0, t2, t0      // tmp14
    // update counter and pointers
    addiu       a3, a3, -1
    addiu       a0, a0, 2
    addiu       a1, a1, 2
    // even part rest
    li          s1, 10033       // FIX(1.224744871)
    li          s2, 11190       // FIX(1.366025404)
    mul         t4, t4, s1      // z4
    mul         s1, t5, s2      // z4
    sll         t5, t5, 13      // z1
    sll         t7, t7, 13
    addiu       t7, t7, 1024    // z3 (plus rounding bias for the >> 11 below)
    sll         s0, s0, 13      // z2
    addu        s2, t7, t4      // tmp10
    subu        t4, t7, t4      // tmp11
    subu        s3, t5, s0      // tmp12
    addu        t2, t7, s3      // tmp21
    subu        s3, t7, s3      // tmp24
    addu        t7, s1, s0      // tmp12
    addu        v0, s2, t7      // tmp20
    subu        s2, s2, t7      // tmp25
    subu        s1, s1, t5      // z4 - z1
    subu        s1, s1, s0      // tmp12
    addu        s0, t4, s1      // tmp22
    subu        t4, t4, s1      // tmp23
    // final output stage: butterflies, then descale by >> 11 and store
    // one workspace column (row stride = 32 bytes)
    addu        t5, v0, t3
    subu        v0, v0, t3
    addu        t3, t2, t1
    subu        t2, t2, t1
    addu        t1, s0, t8
    subu        s0, s0, t8
    addu        t8, t4, t9
    subu        t4, t4, t9
    addu        t9, s3, t0
    subu        s3, s3, t0
    addu        t0, s2, t6
    subu        s2, s2, t6
    sra         t5, t5, 11
    sra         t3, t3, 11
    sra         t1, t1, 11
    sra         t8, t8, 11
    sra         t9, t9, 11
    sra         t0, t0, 11
    sra         s2, s2, 11
    sra         s3, s3, 11
    sra         t4, t4, 11
    sra         s0, s0, 11
    sra         t2, t2, 11
    sra         v0, v0, 11
    sw          t5, 0(a2)       // workspace row 0
    sw          t3, 32(a2)      // workspace row 1
    sw          t1, 64(a2)      // workspace row 2
    sw          t8, 96(a2)      // workspace row 3
    sw          t9, 128(a2)     // workspace row 4
    sw          t0, 160(a2)     // workspace row 5
    sw          s2, 192(a2)     // workspace row 6
    sw          s3, 224(a2)     // workspace row 7
    sw          t4, 256(a2)     // workspace row 8
    sw          s0, 288(a2)     // workspace row 9
    sw          t2, 320(a2)     // workspace row 10
    sw          v0, 352(a2)     // workspace row 11
    bgtz        a3, 1b
     addiu      a2, a2, 4       // (delay slot) next workspace column

    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j           ra
     nop

END(jsimd_idct_12x12_pass1_dspr2)
3791
3792
3793/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2)
/*
 * 12x12 inverse DCT, pass 2 (DSPr2): process the 12 workspace rows and
 * store 12 clamped 8-bit samples per output row.
 *
 * a0 = workspace (int rows, stride 32 bytes, produced by pass 1)
 * a1 = output (array of row pointers; each pointer is used as-is, so it
 *      is presumably already offset by output_col -- set up by caller)
 */
    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    li          a3, 12          // row counter

1:
    // Odd part
    lw          t0, 12(a0)
    lw          t1, 4(a0)
    lw          t2, 20(a0)
    lw          t3, 28(a0)
    li          t4, 10703       // FIX(1.306562965)
    li          t5, 4433        // FIX_0_541196100
    mul         t4, t0, t4      // tmp11
    mul         t5, t0, t5      // -tmp14
    addu        t6, t1, t2      // tmp10
    li          t7, 2139        // FIX(0.261052384)
    mul         t7, t6, t7      // MULTIPLY(tmp10, FIX(0.261052384))
    addu        t6, t6, t3      // tmp10 + z4
    li          t8, 7053        // FIX(0.860918669)
    mul         t6, t6, t8      // tmp15
    li          t8, 2295        // FIX(0.280143716)
    mul         t8, t1, t8      // MULTIPLY(z1, FIX(0.280143716))
    addu        t9, t2, t3      // z3 + z4
    li          s0, 8565        // FIX(1.045510580)
    mul         t9, t9, s0      // -tmp13
    li          s0, 12112       // FIX(1.478575242)
    mul         s0, t2, s0      // MULTIPLY(z3, FIX(1.478575242))
    li          s1, 12998       // FIX(1.586706681)
    mul         s1, t3, s1      // MULTIPLY(z4, FIX(1.586706681))
    li          s2, 5540        // FIX(0.676326758)
    mul         s2, t1, s2      // MULTIPLY(z1, FIX(0.676326758))
    li          s3, 16244       // FIX(1.982889723)
    mul         s3, t3, s3      // MULTIPLY(z4, FIX(1.982889723))
    subu        t1, t1, t3      // z1 -= z4
    subu        t0, t0, t2      // z2 -= z3
    addu        t2, t1, t0      // z1 + z2
    li          t3, 4433        // FIX_0_541196100
    mul         t2, t2, t3      // z3
    li          t3, 6270        // FIX_0_765366865
    mul         t1, t1, t3      // MULTIPLY(z1, FIX_0_765366865)
    li          t3, 15137       // FIX_1_847759065
    mul         t0, t0, t3      // MULTIPLY(z2, FIX_1_847759065)
    addu        t3, t6, t7      // tmp12
    addu        t7, t3, t4
    addu        t7, t7, t8      // tmp10
    subu        t3, t3, t9
    subu        t3, t3, t5
    subu        t3, t3, s0      // tmp12
    subu        t9, t6, t9
    subu        t9, t9, t4
    addu        t9, t9, s1      // tmp13
    subu        t6, t6, t5
    subu        t6, t6, s2
    subu        t6, t6, s3      // tmp15
    addu        t1, t2, t1      // tmp11
    subu        t0, t2, t0      // tmp14
    // even part
    lw          t2, 16(a0)      // z4
    lw          t4, 8(a0)       // z1
    lw          t5, 0(a0)       // z3
    lw          t8, 24(a0)      // z2
    li          s0, 10033       // FIX(1.224744871)
    li          s1, 11190       // FIX(1.366025404)
    mul         t2, t2, s0      // z4
    mul         s0, t4, s1      // z4
    addiu       t5, t5, 0x10    // add rounding bias for final descale
    sll         t5, t5, 13      // z3
    sll         t4, t4, 13      // z1
    sll         t8, t8, 13      // z2
    subu        s1, t4, t8      // tmp12
    addu        s2, t5, t2      // tmp10
    subu        t2, t5, t2      // tmp11
    addu        s3, t5, s1      // tmp21
    subu        s1, t5, s1      // tmp24
    addu        t5, s0, t8      // tmp12
    addu        v0, s2, t5      // tmp20
    subu        t5, s2, t5      // tmp25
    subu        t4, s0, t4
    subu        t4, t4, t8      // tmp12
    addu        t8, t2, t4      // tmp22
    subu        t2, t2, t4      // tmp23
    // increment counter and pointers
    addiu       a3, a3, -1
    addiu       a0, a0, 32      // next workspace row (stride = 32 bytes)
    // Final stage: butterflies producing the 12 output values
    addu        t4, v0, t7
    subu        v0, v0, t7
    addu        t7, s3, t1
    subu        s3, s3, t1
    addu        t1, t8, t3
    subu        t8, t8, t3
    addu        t3, t2, t9
    subu        t2, t2, t9
    addu        t9, s1, t0
    subu        s1, s1, t0
    addu        t0, t5, t6
    subu        t5, t5, t6
    /* << 4 then saturating << 2 then >> 24 descales each value with
       clamping; the +0x80 re-centers the sample before the byte store. */
    sll         t4, t4, 4
    sll         t7, t7, 4
    sll         t1, t1, 4
    sll         t3, t3, 4
    sll         t9, t9, 4
    sll         t0, t0, 4
    sll         t5, t5, 4
    sll         s1, s1, 4
    sll         t2, t2, 4
    sll         t8, t8, 4
    sll         s3, s3, 4
    sll         v0, v0, 4
    shll_s.w    t4, t4, 2
    shll_s.w    t7, t7, 2
    shll_s.w    t1, t1, 2
    shll_s.w    t3, t3, 2
    shll_s.w    t9, t9, 2
    shll_s.w    t0, t0, 2
    shll_s.w    t5, t5, 2
    shll_s.w    s1, s1, 2
    shll_s.w    t2, t2, 2
    shll_s.w    t8, t8, 2
    shll_s.w    s3, s3, 2
    shll_s.w    v0, v0, 2
    srl         t4, t4, 24
    srl         t7, t7, 24
    srl         t1, t1, 24
    srl         t3, t3, 24
    srl         t9, t9, 24
    srl         t0, t0, 24
    srl         t5, t5, 24
    srl         s1, s1, 24
    srl         t2, t2, 24
    srl         t8, t8, 24
    srl         s3, s3, 24
    srl         v0, v0, 24
    lw          t6, 0(a1)       // t6 = output row pointer
    addiu       t4, t4, 0x80
    addiu       t7, t7, 0x80
    addiu       t1, t1, 0x80
    addiu       t3, t3, 0x80
    addiu       t9, t9, 0x80
    addiu       t0, t0, 0x80
    addiu       t5, t5, 0x80
    addiu       s1, s1, 0x80
    addiu       t2, t2, 0x80
    addiu       t8, t8, 0x80
    addiu       s3, s3, 0x80
    addiu       v0, v0, 0x80
    sb          t4, 0(t6)
    sb          t7, 1(t6)
    sb          t1, 2(t6)
    sb          t3, 3(t6)
    sb          t9, 4(t6)
    sb          t0, 5(t6)
    sb          t5, 6(t6)
    sb          s1, 7(t6)
    sb          t2, 8(t6)
    sb          t8, 9(t6)
    sb          s3, 10(t6)
    sb          v0, 11(t6)
    bgtz        a3, 1b
     addiu      a1, a1, 4       // (delay slot) next output row pointer

    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    jr          ra
     nop

END(jsimd_idct_12x12_pass2_dspr2)
3966
3967
3968/*****************************************************************************/
LEAF_DSPR2(jsimd_convsamp_dspr2)
/*
 * Convert an 8x8 block of unsigned samples to centered 16-bit DCT
 * coefficients (sample - 128) in the workspace.
 *
 * a0 = sample_data (array of 8 row pointers)
 * a1 = start_col
 * a2 = workspace (16-bit elements, 8 rows x 16 bytes)
 *
 * t7 = 0xff80ff80 packs two -128 halfwords, so each addu.ph subtracts
 * the center value from two samples at once.  preceu.ph.qbr/qbl expand
 * byte pairs to unsigned halfwords.  ulw/usw handle the accesses
 * without an alignment requirement.  Fully unrolled; each group below
 * processes one row while the next row's bytes are loaded early.
 */
    lw            t0, 0(a0)     // row 0 pointer
    li            t7, 0xff80ff80
    addu          t0, t0, a1    // += start_col
    ulw           t1, 0(t0)
    ulw           t2, 4(t0)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    lw            t0, 4(a0)     // row 1 pointer
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu          t0, t0, a1
    addu.ph       t3, t3, t7    // subtract 128 from each halfword
    addu.ph       t4, t4, t7
    ulw           t1, 0(t0)
    ulw           t2, 4(t0)
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    usw           t3, 0(a2)
    usw           t4, 4(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw           t5, 8(a2)
    usw           t6, 12(a2)

    lw            t0, 8(a0)     // row 2 pointer
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu          t0, t0, a1
    addu.ph       t3, t3, t7
    addu.ph       t4, t4, t7
    ulw           t1, 0(t0)
    ulw           t2, 4(t0)
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    usw           t3, 16(a2)
    usw           t4, 20(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw           t5, 24(a2)
    usw           t6, 28(a2)

    lw            t0, 12(a0)    // row 3 pointer
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu          t0, t0, a1
    addu.ph       t3, t3, t7
    addu.ph       t4, t4, t7
    ulw           t1, 0(t0)
    ulw           t2, 4(t0)
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    usw           t3, 32(a2)
    usw           t4, 36(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw           t5, 40(a2)
    usw           t6, 44(a2)

    lw            t0, 16(a0)    // row 4 pointer
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu          t0, t0, a1
    addu.ph       t3, t3, t7
    addu.ph       t4, t4, t7
    ulw           t1, 0(t0)
    ulw           t2, 4(t0)
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    usw           t3, 48(a2)
    usw           t4, 52(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw           t5, 56(a2)
    usw           t6, 60(a2)

    lw            t0, 20(a0)    // row 5 pointer
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu          t0, t0, a1
    addu.ph       t3, t3, t7
    addu.ph       t4, t4, t7
    ulw           t1, 0(t0)
    ulw           t2, 4(t0)
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    usw           t3, 64(a2)
    usw           t4, 68(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw           t5, 72(a2)
    usw           t6, 76(a2)

    lw            t0, 24(a0)    // row 6 pointer
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu          t0, t0, a1
    addu.ph       t3, t3, t7
    addu.ph       t4, t4, t7
    ulw           t1, 0(t0)
    ulw           t2, 4(t0)
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    usw           t3, 80(a2)
    usw           t4, 84(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw           t5, 88(a2)
    usw           t6, 92(a2)

    lw            t0, 28(a0)    // row 7 pointer
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu          t0, t0, a1
    addu.ph       t3, t3, t7
    addu.ph       t4, t4, t7
    ulw           t1, 0(t0)
    ulw           t2, 4(t0)
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    usw           t3, 96(a2)
    usw           t4, 100(a2)
    preceu.ph.qbr t3, t1
    preceu.ph.qbl t4, t1
    usw           t5, 104(a2)
    usw           t6, 108(a2)
    preceu.ph.qbr t5, t2
    preceu.ph.qbl t6, t2
    addu.ph       t3, t3, t7
    addu.ph       t4, t4, t7
    addu.ph       t5, t5, t7
    addu.ph       t6, t6, t7
    usw           t3, 112(a2)
    usw           t4, 116(a2)
    usw           t5, 120(a2)
    usw           t6, 124(a2)

    j             ra
     nop

END(jsimd_convsamp_dspr2)
4115
4116
4117#ifndef __mips_soft_float
4118
4119/*****************************************************************************/
LEAF_DSPR2(jsimd_convsamp_float_dspr2)
/*
 * Convert an 8x8 block of unsigned samples to centered single-precision
 * floats (sample - 128.0) in the FDCT workspace.
 *
 * a0 = sample_data (array of 8 row pointers)
 * a1 = start_col
 * a2 = workspace (floats, 8 rows x 32 bytes)
 *
 * Fully unrolled: for each row, load 8 bytes, subtract 128 in integer
 * registers, move to FP registers (mtc1) and convert (cvt.s.w), then
 * store 8 floats.  The next row pointer is loaded early, between the
 * converts and the stores, to hide its load latency.
 */
    .set at

    lw          t0, 0(a0)
    addu        t0, t0, a1      // row 0: += start_col
    lbu         t1, 0(t0)
    lbu         t2, 1(t0)
    lbu         t3, 2(t0)
    lbu         t4, 3(t0)
    lbu         t5, 4(t0)
    lbu         t6, 5(t0)
    lbu         t7, 6(t0)
    lbu         t8, 7(t0)
    addiu       t1, t1, -128
    addiu       t2, t2, -128
    addiu       t3, t3, -128
    addiu       t4, t4, -128
    addiu       t5, t5, -128
    addiu       t6, t6, -128
    addiu       t7, t7, -128
    addiu       t8, t8, -128
    mtc1        t1, f2
    mtc1        t2, f4
    mtc1        t3, f6
    mtc1        t4, f8
    mtc1        t5, f10
    mtc1        t6, f12
    mtc1        t7, f14
    mtc1        t8, f16
    cvt.s.w     f2, f2
    cvt.s.w     f4, f4
    cvt.s.w     f6, f6
    cvt.s.w     f8, f8
    cvt.s.w     f10, f10
    cvt.s.w     f12, f12
    cvt.s.w     f14, f14
    cvt.s.w     f16, f16
    lw          t0, 4(a0)       // prefetch row 1 pointer
    swc1        f2, 0(a2)
    swc1        f4, 4(a2)
    swc1        f6, 8(a2)
    addu        t0, t0, a1
    swc1        f8, 12(a2)
    swc1        f10, 16(a2)
    swc1        f12, 20(a2)
    swc1        f14, 24(a2)
    swc1        f16, 28(a2)
    // row 1
    lbu         t1, 0(t0)
    lbu         t2, 1(t0)
    lbu         t3, 2(t0)
    lbu         t4, 3(t0)
    lbu         t5, 4(t0)
    lbu         t6, 5(t0)
    lbu         t7, 6(t0)
    lbu         t8, 7(t0)
    addiu       t1, t1, -128
    addiu       t2, t2, -128
    addiu       t3, t3, -128
    addiu       t4, t4, -128
    addiu       t5, t5, -128
    addiu       t6, t6, -128
    addiu       t7, t7, -128
    addiu       t8, t8, -128
    mtc1        t1, f2
    mtc1        t2, f4
    mtc1        t3, f6
    mtc1        t4, f8
    mtc1        t5, f10
    mtc1        t6, f12
    mtc1        t7, f14
    mtc1        t8, f16
    cvt.s.w     f2, f2
    cvt.s.w     f4, f4
    cvt.s.w     f6, f6
    cvt.s.w     f8, f8
    cvt.s.w     f10, f10
    cvt.s.w     f12, f12
    cvt.s.w     f14, f14
    cvt.s.w     f16, f16
    lw          t0, 8(a0)       // prefetch row 2 pointer
    swc1        f2, 32(a2)
    swc1        f4, 36(a2)
    swc1        f6, 40(a2)
    addu        t0, t0, a1
    swc1        f8, 44(a2)
    swc1        f10, 48(a2)
    swc1        f12, 52(a2)
    swc1        f14, 56(a2)
    swc1        f16, 60(a2)
    // row 2
    lbu         t1, 0(t0)
    lbu         t2, 1(t0)
    lbu         t3, 2(t0)
    lbu         t4, 3(t0)
    lbu         t5, 4(t0)
    lbu         t6, 5(t0)
    lbu         t7, 6(t0)
    lbu         t8, 7(t0)
    addiu       t1, t1, -128
    addiu       t2, t2, -128
    addiu       t3, t3, -128
    addiu       t4, t4, -128
    addiu       t5, t5, -128
    addiu       t6, t6, -128
    addiu       t7, t7, -128
    addiu       t8, t8, -128
    mtc1        t1, f2
    mtc1        t2, f4
    mtc1        t3, f6
    mtc1        t4, f8
    mtc1        t5, f10
    mtc1        t6, f12
    mtc1        t7, f14
    mtc1        t8, f16
    cvt.s.w     f2, f2
    cvt.s.w     f4, f4
    cvt.s.w     f6, f6
    cvt.s.w     f8, f8
    cvt.s.w     f10, f10
    cvt.s.w     f12, f12
    cvt.s.w     f14, f14
    cvt.s.w     f16, f16
    lw          t0, 12(a0)      // prefetch row 3 pointer
    swc1        f2, 64(a2)
    swc1        f4, 68(a2)
    swc1        f6, 72(a2)
    addu        t0, t0, a1
    swc1        f8, 76(a2)
    swc1        f10, 80(a2)
    swc1        f12, 84(a2)
    swc1        f14, 88(a2)
    swc1        f16, 92(a2)
    // row 3
    lbu         t1, 0(t0)
    lbu         t2, 1(t0)
    lbu         t3, 2(t0)
    lbu         t4, 3(t0)
    lbu         t5, 4(t0)
    lbu         t6, 5(t0)
    lbu         t7, 6(t0)
    lbu         t8, 7(t0)
    addiu       t1, t1, -128
    addiu       t2, t2, -128
    addiu       t3, t3, -128
    addiu       t4, t4, -128
    addiu       t5, t5, -128
    addiu       t6, t6, -128
    addiu       t7, t7, -128
    addiu       t8, t8, -128
    mtc1        t1, f2
    mtc1        t2, f4
    mtc1        t3, f6
    mtc1        t4, f8
    mtc1        t5, f10
    mtc1        t6, f12
    mtc1        t7, f14
    mtc1        t8, f16
    cvt.s.w     f2, f2
    cvt.s.w     f4, f4
    cvt.s.w     f6, f6
    cvt.s.w     f8, f8
    cvt.s.w     f10, f10
    cvt.s.w     f12, f12
    cvt.s.w     f14, f14
    cvt.s.w     f16, f16
    lw          t0, 16(a0)      // prefetch row 4 pointer
    swc1        f2, 96(a2)
    swc1        f4, 100(a2)
    swc1        f6, 104(a2)
    addu        t0, t0, a1
    swc1        f8, 108(a2)
    swc1        f10, 112(a2)
    swc1        f12, 116(a2)
    swc1        f14, 120(a2)
    swc1        f16, 124(a2)
    // row 4
    lbu         t1, 0(t0)
    lbu         t2, 1(t0)
    lbu         t3, 2(t0)
    lbu         t4, 3(t0)
    lbu         t5, 4(t0)
    lbu         t6, 5(t0)
    lbu         t7, 6(t0)
    lbu         t8, 7(t0)
    addiu       t1, t1, -128
    addiu       t2, t2, -128
    addiu       t3, t3, -128
    addiu       t4, t4, -128
    addiu       t5, t5, -128
    addiu       t6, t6, -128
    addiu       t7, t7, -128
    addiu       t8, t8, -128
    mtc1        t1, f2
    mtc1        t2, f4
    mtc1        t3, f6
    mtc1        t4, f8
    mtc1        t5, f10
    mtc1        t6, f12
    mtc1        t7, f14
    mtc1        t8, f16
    cvt.s.w     f2, f2
    cvt.s.w     f4, f4
    cvt.s.w     f6, f6
    cvt.s.w     f8, f8
    cvt.s.w     f10, f10
    cvt.s.w     f12, f12
    cvt.s.w     f14, f14
    cvt.s.w     f16, f16
    lw          t0, 20(a0)      // prefetch row 5 pointer
    swc1        f2, 128(a2)
    swc1        f4, 132(a2)
    swc1        f6, 136(a2)
    addu        t0, t0, a1
    swc1        f8, 140(a2)
    swc1        f10, 144(a2)
    swc1        f12, 148(a2)
    swc1        f14, 152(a2)
    swc1        f16, 156(a2)
    // row 5
    lbu         t1, 0(t0)
    lbu         t2, 1(t0)
    lbu         t3, 2(t0)
    lbu         t4, 3(t0)
    lbu         t5, 4(t0)
    lbu         t6, 5(t0)
    lbu         t7, 6(t0)
    lbu         t8, 7(t0)
    addiu       t1, t1, -128
    addiu       t2, t2, -128
    addiu       t3, t3, -128
    addiu       t4, t4, -128
    addiu       t5, t5, -128
    addiu       t6, t6, -128
    addiu       t7, t7, -128
    addiu       t8, t8, -128
    mtc1        t1, f2
    mtc1        t2, f4
    mtc1        t3, f6
    mtc1        t4, f8
    mtc1        t5, f10
    mtc1        t6, f12
    mtc1        t7, f14
    mtc1        t8, f16
    cvt.s.w     f2, f2
    cvt.s.w     f4, f4
    cvt.s.w     f6, f6
    cvt.s.w     f8, f8
    cvt.s.w     f10, f10
    cvt.s.w     f12, f12
    cvt.s.w     f14, f14
    cvt.s.w     f16, f16
    lw          t0, 24(a0)      // prefetch row 6 pointer
    swc1        f2, 160(a2)
    swc1        f4, 164(a2)
    swc1        f6, 168(a2)
    addu        t0, t0, a1
    swc1        f8, 172(a2)
    swc1        f10, 176(a2)
    swc1        f12, 180(a2)
    swc1        f14, 184(a2)
    swc1        f16, 188(a2)
    // row 6
    lbu         t1, 0(t0)
    lbu         t2, 1(t0)
    lbu         t3, 2(t0)
    lbu         t4, 3(t0)
    lbu         t5, 4(t0)
    lbu         t6, 5(t0)
    lbu         t7, 6(t0)
    lbu         t8, 7(t0)
    addiu       t1, t1, -128
    addiu       t2, t2, -128
    addiu       t3, t3, -128
    addiu       t4, t4, -128
    addiu       t5, t5, -128
    addiu       t6, t6, -128
    addiu       t7, t7, -128
    addiu       t8, t8, -128
    mtc1        t1, f2
    mtc1        t2, f4
    mtc1        t3, f6
    mtc1        t4, f8
    mtc1        t5, f10
    mtc1        t6, f12
    mtc1        t7, f14
    mtc1        t8, f16
    cvt.s.w     f2, f2
    cvt.s.w     f4, f4
    cvt.s.w     f6, f6
    cvt.s.w     f8, f8
    cvt.s.w     f10, f10
    cvt.s.w     f12, f12
    cvt.s.w     f14, f14
    cvt.s.w     f16, f16
    lw          t0, 28(a0)      // prefetch row 7 pointer
    swc1        f2, 192(a2)
    swc1        f4, 196(a2)
    swc1        f6, 200(a2)
    addu        t0, t0, a1
    swc1        f8, 204(a2)
    swc1        f10, 208(a2)
    swc1        f12, 212(a2)
    swc1        f14, 216(a2)
    swc1        f16, 220(a2)
    // row 7
    lbu         t1, 0(t0)
    lbu         t2, 1(t0)
    lbu         t3, 2(t0)
    lbu         t4, 3(t0)
    lbu         t5, 4(t0)
    lbu         t6, 5(t0)
    lbu         t7, 6(t0)
    lbu         t8, 7(t0)
    addiu       t1, t1, -128
    addiu       t2, t2, -128
    addiu       t3, t3, -128
    addiu       t4, t4, -128
    addiu       t5, t5, -128
    addiu       t6, t6, -128
    addiu       t7, t7, -128
    addiu       t8, t8, -128
    mtc1        t1, f2
    mtc1        t2, f4
    mtc1        t3, f6
    mtc1        t4, f8
    mtc1        t5, f10
    mtc1        t6, f12
    mtc1        t7, f14
    mtc1        t8, f16
    cvt.s.w     f2, f2
    cvt.s.w     f4, f4
    cvt.s.w     f6, f6
    cvt.s.w     f8, f8
    cvt.s.w     f10, f10
    cvt.s.w     f12, f12
    cvt.s.w     f14, f14
    cvt.s.w     f16, f16
    swc1        f2, 224(a2)
    swc1        f4, 228(a2)
    swc1        f6, 232(a2)
    swc1        f8, 236(a2)
    swc1        f10, 240(a2)
    swc1        f12, 244(a2)
    swc1        f14, 248(a2)
    swc1        f16, 252(a2)

    j           ra
     nop

END(jsimd_convsamp_float_dspr2)
4476
4477#endif
4478
4479/*****************************************************************************/
4480