• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2014 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // MIPS version of speed-critical encoding functions.
11 //
12 // Author(s): Darko Laus (darko.laus@imgtec.com)
13 //            Mirko Raus (mirko.raus@imgtec.com)
14 
15 #include "src/dsp/dsp.h"
16 
17 #if defined(WEBP_USE_MIPS_DSP_R2)
18 
19 #include "src/dsp/mips_macro.h"
20 #include "src/enc/cost_enc.h"
21 #include "src/enc/vp8i_enc.h"
22 
23 static const int kC1 = WEBP_TRANSFORM_AC3_C1;
24 static const int kC2 = WEBP_TRANSFORM_AC3_C2;
25 
// Emits four add/subtract pairs on paired halfwords (addq.ph / subq.ph):
//   O0 = I0 + I1,  O1 = I0 - I1
//   O2 = I2 + I3,  O3 = I2 - I3
//   O4 = I4 + I5,  O5 = I4 - I5
//   O6 = I6 + I7,  O7 = I6 - I7
// O - output
// I - input (macro doesn't change it)
// The instructions are emitted in exactly the order listed above, so an output
// may name the same register as an input only if every instruction reading
// that input comes earlier in the sequence.
#define ADD_SUB_HALVES_X4(O0, O1, O2, O3, O4, O5, O6, O7,                      \
                          I0, I1, I2, I3, I4, I5, I6, I7)                      \
  "addq.ph          %[" #O0 "],   %[" #I0 "],  %[" #I1 "]     \n\t"            \
  "subq.ph          %[" #O1 "],   %[" #I0 "],  %[" #I1 "]     \n\t"            \
  "addq.ph          %[" #O2 "],   %[" #I2 "],  %[" #I3 "]     \n\t"            \
  "subq.ph          %[" #O3 "],   %[" #I2 "],  %[" #I3 "]     \n\t"            \
  "addq.ph          %[" #O4 "],   %[" #I4 "],  %[" #I5 "]     \n\t"            \
  "subq.ph          %[" #O5 "],   %[" #I4 "],  %[" #I5 "]     \n\t"            \
  "addq.ph          %[" #O6 "],   %[" #I6 "],  %[" #I7 "]     \n\t"            \
  "subq.ph          %[" #O7 "],   %[" #I6 "],  %[" #I7 "]     \n\t"
38 
// Applies absq_s.ph to each of the eight registers in place, i.e. every
// operand is both the source and the destination of its instruction.
// IO - input/output
#define ABS_X8(IO0, IO1, IO2, IO3, IO4, IO5, IO6, IO7)                         \
  "absq_s.ph        %[" #IO0 "],   %[" #IO0 "]                \n\t"            \
  "absq_s.ph        %[" #IO1 "],   %[" #IO1 "]                \n\t"            \
  "absq_s.ph        %[" #IO2 "],   %[" #IO2 "]                \n\t"            \
  "absq_s.ph        %[" #IO3 "],   %[" #IO3 "]                \n\t"            \
  "absq_s.ph        %[" #IO4 "],   %[" #IO4 "]                \n\t"            \
  "absq_s.ph        %[" #IO5 "],   %[" #IO5 "]                \n\t"            \
  "absq_s.ph        %[" #IO6 "],   %[" #IO6 "]                \n\t"            \
  "absq_s.ph        %[" #IO7 "],   %[" #IO7 "]                \n\t"
49 
// Accumulates eight paired-halfword dot products into $ac0 and returns the
// low word of the accumulator in O0.
//
// dpa.w.ph $ac0 temp0 ,temp1
//  $ac += temp0[31..16] * temp1[31..16] + temp0[15..0] * temp1[15..0]
// dpax.w.ph $ac0 temp0 ,temp1
//  $ac += temp0[31..16] * temp1[15..0] + temp0[15..0] * temp1[31..16]
//
// The accumulator is first cleared with "mult $ac0, $zero, $zero". Operands
// are then paired as:
//   dpa (I2, I0),  dpax(I5, I6),  dpa (I8, I9),   dpax(I11, I4),
//   dpa (I12, I7), dpax(I13, I1), dpa (I14, I3),  dpax(I15, I10)
// $ac0 is modified, so users must list "hi" and "lo" as clobbers.
// O - output
// I - input (macro doesn't change it)
#define MUL_HALF(O0, I0, I1, I2, I3, I4, I5, I6, I7,                           \
                 I8, I9, I10, I11, I12, I13, I14, I15)                         \
    "mult            $ac0,      $zero,     $zero              \n\t"            \
    "dpa.w.ph        $ac0,      %[" #I2 "],  %[" #I0 "]       \n\t"            \
    "dpax.w.ph       $ac0,      %[" #I5 "],  %[" #I6 "]       \n\t"            \
    "dpa.w.ph        $ac0,      %[" #I8 "],  %[" #I9 "]       \n\t"            \
    "dpax.w.ph       $ac0,      %[" #I11 "], %[" #I4 "]       \n\t"            \
    "dpa.w.ph        $ac0,      %[" #I12 "], %[" #I7 "]       \n\t"            \
    "dpax.w.ph       $ac0,      %[" #I13 "], %[" #I1 "]       \n\t"            \
    "dpa.w.ph        $ac0,      %[" #I14 "], %[" #I3 "]       \n\t"            \
    "dpax.w.ph       $ac0,      %[" #I15 "], %[" #I10 "]      \n\t"            \
    "mflo            %[" #O0 "],  $ac0                        \n\t"
68 
// Output-operand list for an asm statement that uses temp11..temp17 as
// early-clobber ("=&r") outputs, appended to whatever
// OUTPUT_EARLY_CLOBBER_REGS_10() (defined outside this file) expands to.
// The variables temp11..temp17 must be declared by the enclosing function.
#define OUTPUT_EARLY_CLOBBER_REGS_17()                                         \
  OUTPUT_EARLY_CLOBBER_REGS_10(),                                              \
  [temp11]"=&r"(temp11), [temp12]"=&r"(temp12), [temp13]"=&r"(temp13),         \
  [temp14]"=&r"(temp14), [temp15]"=&r"(temp15), [temp16]"=&r"(temp16),         \
  [temp17]"=&r"(temp17)
74 
// macro for one horizontal pass in FTransform
// temp0..temp15 holds tmp[0]..tmp[15]
// A - row index: one word is loaded from each of the two source pointers at
//     byte offset BPS * A (the pointers themselves are read from args[0] and
//     args[1])
// TEMP0..TEMP3 - registers for corresponding tmp elements
//
// The four bytes of each loaded word are widened to halfwords and subtracted
// (first buffer minus second). From the resulting differences the macro
// leaves, with 's'/'t' being the two sum terms and 'p'/'q' the two difference
// terms held in temp19 and TEMP3 respectively:
//   TEMP0 = (s + t) << 3
//   TEMP2 = (s - t) << 3
//   TEMP1 = (p * c5352 + q * c2217 + 1812) >> 9
//   TEMP3 = (p * c2217 - q * c5352 +  937) >> 9
// Requires the asm operands args, c2217, c5352 and uses temp16..temp19 as
// scratch registers.
#define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3)                         \
  "lw              %[" #TEMP0 "],   0(%[args])                          \n\t"  \
  "lw              %[" #TEMP1 "],   4(%[args])                          \n\t"  \
  "lw              %[" #TEMP2 "],   " XSTR(BPS) "*" #A "(%[" #TEMP0 "]) \n\t"  \
  "lw              %[" #TEMP3 "],   " XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t"  \
  "preceu.ph.qbl   %[" #TEMP0 "],   %[" #TEMP2 "]                       \n\t"  \
  "preceu.ph.qbl   %[" #TEMP1 "],   %[" #TEMP3 "]                       \n\t"  \
  "preceu.ph.qbr   %[" #TEMP2 "],   %[" #TEMP2 "]                       \n\t"  \
  "preceu.ph.qbr   %[" #TEMP3 "],   %[" #TEMP3 "]                       \n\t"  \
  "subq.ph         %[" #TEMP0 "],   %[" #TEMP0 "],   %[" #TEMP1 "]      \n\t"  \
  "subq.ph         %[" #TEMP2 "],   %[" #TEMP2 "],   %[" #TEMP3 "]      \n\t"  \
  "rotr            %[" #TEMP0 "],   %[" #TEMP0 "],   16                 \n\t"  \
  "addq.ph         %[" #TEMP1 "],   %[" #TEMP2 "],   %[" #TEMP0 "]      \n\t"  \
  "subq.ph         %[" #TEMP3 "],   %[" #TEMP2 "],   %[" #TEMP0 "]      \n\t"  \
  "seh             %[" #TEMP0 "],   %[" #TEMP1 "]                       \n\t"  \
  "sra             %[temp16],     %[" #TEMP1 "],   16                   \n\t"  \
  "seh             %[temp19],     %[" #TEMP3 "]                         \n\t"  \
  "sra             %[" #TEMP3 "],   %[" #TEMP3 "],   16                 \n\t"  \
  "subu            %[" #TEMP2 "],   %[" #TEMP0 "],   %[temp16]          \n\t"  \
  "addu            %[" #TEMP0 "],   %[" #TEMP0 "],   %[temp16]          \n\t"  \
  "mul             %[temp17],     %[temp19],     %[c2217]               \n\t"  \
  "mul             %[temp18],     %[" #TEMP3 "],   %[c5352]             \n\t"  \
  "mul             %[" #TEMP1 "],   %[temp19],     %[c5352]             \n\t"  \
  "mul             %[temp16],     %[" #TEMP3 "],   %[c2217]             \n\t"  \
  "sll             %[" #TEMP2 "],   %[" #TEMP2 "],   3                  \n\t"  \
  "sll             %[" #TEMP0 "],   %[" #TEMP0 "],   3                  \n\t"  \
  "subu            %[" #TEMP3 "],   %[temp17],     %[temp18]            \n\t"  \
  "addu            %[" #TEMP1 "],   %[temp16],     %[" #TEMP1 "]        \n\t"  \
  "addiu           %[" #TEMP3 "],   %[" #TEMP3 "],   937                \n\t"  \
  "addiu           %[" #TEMP1 "],   %[" #TEMP1 "],   1812               \n\t"  \
  "sra             %[" #TEMP3 "],   %[" #TEMP3 "],   9                  \n\t"  \
  "sra             %[" #TEMP1 "],   %[" #TEMP1 "],   9                  \n\t"
111 
// macro for one vertical pass in FTransform
// temp0..temp15 holds tmp[0]..tmp[15]
// A..D - offsets in bytes to store to out buffer (base pointer in temp20).
//        Note the store mapping: TEMP0 -> A, TEMP12 -> B, TEMP4 -> C,
//        TEMP8 -> D.
// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
//
// With a0 = TEMP0 + TEMP12, a1 = TEMP4 + TEMP8, a2 = TEMP4 - TEMP8 and
// a3 = TEMP0 - TEMP12 (held in temp16, temp17, temp18, temp19), the values
// stored as halfwords are:
//   A: (a0 + a1 + 7) >> 4
//   C: (a0 - a1 + 7) >> 4
//   B: ((a2 * c2217 + a3 * c5352 + 12000) >> 16) + (a3 != 0)
//   D: (a3 * c2217 - a2 * c5352 + 51000) >> 16
// The 51000 rounding term is added in two steps (30000 + 21000), presumably
// because it does not fit a single signed 16-bit immediate.
// All four TEMP registers and temp16..temp19 are overwritten.
#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)                 \
  "addu            %[temp16],     %[" #TEMP0 "],   %[" #TEMP12 "]   \n\t"      \
  "subu            %[temp19],     %[" #TEMP0 "],   %[" #TEMP12 "]   \n\t"      \
  "addu            %[temp17],     %[" #TEMP4 "],   %[" #TEMP8 "]    \n\t"      \
  "subu            %[temp18],     %[" #TEMP4 "],   %[" #TEMP8 "]    \n\t"      \
  "mul             %[" #TEMP8 "],   %[temp19],     %[c2217]         \n\t"      \
  "mul             %[" #TEMP12 "],  %[temp18],     %[c2217]         \n\t"      \
  "mul             %[" #TEMP4 "],   %[temp19],     %[c5352]         \n\t"      \
  "mul             %[temp18],     %[temp18],     %[c5352]           \n\t"      \
  "addiu           %[temp16],     %[temp16],     7                  \n\t"      \
  "addu            %[" #TEMP0 "],   %[temp16],     %[temp17]        \n\t"      \
  "sra             %[" #TEMP0 "],   %[" #TEMP0 "],   4              \n\t"      \
  "addu            %[" #TEMP12 "],  %[" #TEMP12 "],  %[" #TEMP4 "]  \n\t"      \
  "subu            %[" #TEMP4 "],   %[temp16],     %[temp17]        \n\t"      \
  "sra             %[" #TEMP4 "],   %[" #TEMP4 "],   4              \n\t"      \
  "addiu           %[" #TEMP8 "],   %[" #TEMP8 "],   30000          \n\t"      \
  "addiu           %[" #TEMP12 "],  %[" #TEMP12 "],  12000          \n\t"      \
  "addiu           %[" #TEMP8 "],   %[" #TEMP8 "],   21000          \n\t"      \
  "subu            %[" #TEMP8 "],   %[" #TEMP8 "],   %[temp18]      \n\t"      \
  "sra             %[" #TEMP12 "],  %[" #TEMP12 "],  16             \n\t"      \
  "sra             %[" #TEMP8 "],   %[" #TEMP8 "],   16             \n\t"      \
  "addiu           %[temp16],     %[" #TEMP12 "],  1                \n\t"      \
  "movn            %[" #TEMP12 "],  %[temp16],     %[temp19]        \n\t"      \
  "sh              %[" #TEMP0 "],   " #A "(%[temp20])               \n\t"      \
  "sh              %[" #TEMP4 "],   " #C "(%[temp20])               \n\t"      \
  "sh              %[" #TEMP8 "],   " #D "(%[temp20])               \n\t"      \
  "sh              %[" #TEMP12 "],  " #B "(%[temp20])               \n\t"
143 
// Forward transform of one 4x4 block, written as a single inline-asm
// statement.
//   src, ref - the two input pixel buffers; one 32-bit word is read from each
//              at rows 0..3 (row stride BPS) with aligned 'lw' loads
//   out      - receives sixteen 16-bit values at byte offsets 0..30
// The three pointers are passed to the assembly through the 'args' array so
// that only one register is needed to reach them. Four HORIZONTAL_PASS
// expansions fill temp0..temp15, then four VERTICAL_PASS expansions consume
// them column by column and store the results through temp20 (= out).
static void FTransform_MIPSdspR2(const uint8_t* WEBP_RESTRICT src,
                                 const uint8_t* WEBP_RESTRICT ref,
                                 int16_t* WEBP_RESTRICT out) {
  const int c2217 = 2217;
  const int c5352 = 5352;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
  int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
  int temp17, temp18, temp19, temp20;
  const int* const args[3] =
      { (const int*)src, (const int*)ref, (const int*)out };

  __asm__ volatile (
    HORIZONTAL_PASS(0, temp0,  temp1,  temp2,  temp3)
    HORIZONTAL_PASS(1, temp4,  temp5,  temp6,  temp7)
    HORIZONTAL_PASS(2, temp8,  temp9,  temp10, temp11)
    HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15)
    "lw            %[temp20],     8(%[args])                  \n\t"
    VERTICAL_PASS(0,  8, 16, 24, temp0, temp4, temp8,  temp12)
    VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9,  temp13)
    VERTICAL_PASS(4, 12, 20, 28, temp2, temp6, temp10, temp14)
    VERTICAL_PASS(6, 14, 22, 30, temp3, temp7, temp11, temp15)
    OUTPUT_EARLY_CLOBBER_REGS_18(),
      [temp0]"=&r"(temp0), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
    : [args]"r"(args), [c2217]"r"(c2217), [c5352]"r"(c5352)
    : "memory", "hi", "lo"
  );
}

#undef VERTICAL_PASS
#undef HORIZONTAL_PASS
174 
ITransformOne(const uint8_t * WEBP_RESTRICT ref,const int16_t * WEBP_RESTRICT in,uint8_t * WEBP_RESTRICT dst)175 static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref,
176                                       const int16_t* WEBP_RESTRICT in,
177                                       uint8_t* WEBP_RESTRICT dst) {
178   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
179   int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
180 
181   __asm__ volatile (
182     "ulw              %[temp1],   0(%[in])                 \n\t"
183     "ulw              %[temp2],   16(%[in])                \n\t"
184     LOAD_IN_X2(temp5, temp6, 24, 26)
185     ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
186     LOAD_IN_X2(temp1, temp2, 8, 10)
187     MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
188                   temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
189                   temp13, temp11, temp14, temp12)
190     INSERT_HALF_X2(temp8, temp7, temp10, temp9)
191     "ulw              %[temp17],  4(%[in])                 \n\t"
192     "ulw              %[temp18],  20(%[in])                \n\t"
193     ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
194     ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
195     ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
196     LOAD_IN_X2(temp17, temp18, 12, 14)
197     LOAD_IN_X2(temp9, temp10, 28, 30)
198     MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
199                   temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
200                   temp15, temp4, temp16, temp17)
201     INSERT_HALF_X2(temp11, temp12, temp13, temp14)
202     ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
203     ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
204 
205     // horizontal
206     SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
207     INSERT_HALF_X2(temp1, temp6, temp5, temp2)
208     SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
209     "repl.ph          %[temp2],   0x4                      \n\t"
210     INSERT_HALF_X2(temp3, temp8, temp17, temp4)
211     "addq.ph          %[temp1],   %[temp1],  %[temp2]      \n\t"
212     "addq.ph          %[temp6],   %[temp6],  %[temp2]      \n\t"
213     ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
214     ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
215     MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
216                   temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
217                   temp6, temp17, temp8, temp18)
218     MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
219                   temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
220                   temp18, temp12, temp17, temp16)
221     INSERT_HALF_X2(temp1, temp3, temp9, temp13)
222     INSERT_HALF_X2(temp6, temp8, temp11, temp15)
223     SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
224                    temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
225                    temp6)
226     PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
227                           temp16, temp11, temp10, temp15, temp14)
228     LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, ref,
229                         0, 0, 0, 0,
230                         0, 1, 2, 3,
231                         BPS)
232     CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
233                             temp11, temp10, temp11, temp14, temp15)
234     STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
235                      temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
236                      dst, 0, 1, 2, 3, BPS)
237 
238     OUTPUT_EARLY_CLOBBER_REGS_18()
239     : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2), [ref]"r"(ref)
240     : "memory", "hi", "lo"
241   );
242 }
243 
// Inverse-transforms one 4x4 block and, when 'do_two' is non-zero, a second
// one located 4 pixels to the right (ref + 4, dst + 4) whose coefficients
// start 16 entries further into 'in'.
static void ITransform_MIPSdspR2(const uint8_t* WEBP_RESTRICT ref,
                                 const int16_t* WEBP_RESTRICT in,
                                 uint8_t* WEBP_RESTRICT dst, int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
    ITransformOne(ref + 4, in + 16, dst + 4);
  }
}
252 
// Weighted distortion between two 4x4 blocks.
//   a, b - the two pixel blocks, read at rows 0..3 with stride BPS
//   w    - sixteen 16-bit weights, loaded as eight words
// The same instruction sequence is run twice: first on 'a' (weighted sum left
// in temp17), then on 'b' (weighted sum left in temp3). Each run applies four
// rounds of paired-halfword add/subtract butterflies, takes absolute values
// (ABS_X8) and accumulates them against the weights (MUL_HALF).
// Returns |sum_b - sum_a| >> 5.
static int Disto4x4_MIPSdspR2(const uint8_t* WEBP_RESTRICT const a,
                              const uint8_t* WEBP_RESTRICT const b,
                              const uint16_t* WEBP_RESTRICT const w) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;

  __asm__ volatile (
    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, a,
                        0, 0, 0, 0,
                        0, 1, 2, 3,
                        BPS)
    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp9,temp10, temp11,
                            temp12, temp1, temp2, temp3, temp4)
    ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
                      temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12)
    PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5,
                          temp7, temp2, temp4, temp6, temp8)
    ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10,
                      temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12)
    ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12,
                      temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10)
    ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2,
                      temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12)
    ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2)
    LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w,
                        0, 4, 8, 12,
                        0, 0, 0, 0,
                        0)
    LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w,
                        0, 4, 8, 12,
                        1, 1, 1, 1,
                        16)
    MUL_HALF(temp17, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
             temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16)
    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, b,
                        0, 0, 0, 0,
                        0, 1, 2, 3,
                        BPS)
    CONVERT_2_BYTES_TO_HALF(temp5,temp6, temp7, temp8, temp9,temp10, temp11,
                            temp12, temp1, temp2, temp3, temp4)
    ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
                      temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12)
    PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5,
                          temp7, temp2, temp4, temp6, temp8)
    ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10,
                      temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12)
    ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12,
                      temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10)
    ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2,
                      temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12)
    ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2)
    LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w,
                        0, 4, 8, 12,
                        0, 0, 0, 0,
                        0)
    LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w,
                        0, 4, 8, 12,
                        1, 1, 1, 1,
                        16)
    MUL_HALF(temp3, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
             temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16)
    OUTPUT_EARLY_CLOBBER_REGS_17()
    : [a]"r"(a), [b]"r"(b), [w]"r"(w)
    : "memory", "hi", "lo"
  );
  return abs(temp3 - temp17) >> 5;
}
320 
// Distortion of a 16x16 area: the sum of Disto4x4_MIPSdspR2() over its
// sixteen 4x4 sub-blocks. 'y' advances in units of four rows (4 * BPS bytes)
// and 'x' in units of four pixels; the same weight table 'w' is used for every
// sub-block.
static int Disto16x16_MIPSdspR2(const uint8_t* WEBP_RESTRICT const a,
                                const uint8_t* WEBP_RESTRICT const b,
                                const uint16_t* WEBP_RESTRICT const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
      D += Disto4x4_MIPSdspR2(a + x + y, b + x + y, w);
    }
  }
  return D;
}
333 
//------------------------------------------------------------------------------
// Intra predictions

// Stores the word in %[value] across row J of the block at %[dst] using
// unaligned word stores: bytes 0..7 of the row always, and bytes 8..15 as
// well when SIZE is 16. The row starts at byte offset J * BPS. The SIZE test
// is an assembler-level ".if", so SIZE must stringify to something the
// assembler can evaluate.
#define FILL_PART(J, SIZE)                                            \
    "usw        %[value],  0+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
    "usw        %[value],  4+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
  ".if " #SIZE " == 16                                     \n\t"      \
    "usw        %[value],  8+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
    "usw        %[value], 12+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
  ".endif                                                  \n\t"
344 
// Fills a SIZE x SIZE block (SIZE is 8 or 16) at DST with the byte VALUE.
// VALUE is copied into a local, its low byte is replicated into all four
// bytes of the register (replv.qb), and FILL_PART then stores that word over
// rows 0..7, plus rows 8..15 when SIZE is 16. Rows are BPS bytes apart.
// Expands to a single statement (do { ... } while (0)).
#define FILL_8_OR_16(DST, VALUE, SIZE) do {                         \
  int value = (VALUE);                                              \
  __asm__ volatile (                                                \
    "replv.qb   %[value],  %[value]                      \n\t"      \
    FILL_PART( 0, SIZE)                                             \
    FILL_PART( 1, SIZE)                                             \
    FILL_PART( 2, SIZE)                                             \
    FILL_PART( 3, SIZE)                                             \
    FILL_PART( 4, SIZE)                                             \
    FILL_PART( 5, SIZE)                                             \
    FILL_PART( 6, SIZE)                                             \
    FILL_PART( 7, SIZE)                                             \
  ".if " #SIZE " == 16                                   \n\t"      \
    FILL_PART( 8, 16)                                               \
    FILL_PART( 9, 16)                                               \
    FILL_PART(10, 16)                                               \
    FILL_PART(11, 16)                                               \
    FILL_PART(12, 16)                                               \
    FILL_PART(13, 16)                                               \
    FILL_PART(14, 16)                                               \
    FILL_PART(15, 16)                                               \
  ".endif                                                \n\t"      \
    : [value]"+&r"(value)                                           \
    : [dst]"r"((DST))                                               \
    : "memory"                                                      \
  );                                                                \
} while (0)
372 
373 #define VERTICAL_PRED(DST, TOP, SIZE)                                          \
374 static WEBP_INLINE void VerticalPred##SIZE(                                    \
375     uint8_t* WEBP_RESTRICT (DST), const uint8_t* WEBP_RESTRICT (TOP)) {        \
376   int j;                                                                       \
377   if ((TOP)) {                                                                 \
378     for (j = 0; j < (SIZE); ++j) memcpy((DST) + j * BPS, (TOP), (SIZE));       \
379   } else {                                                                     \
380     FILL_8_OR_16((DST), 127, (SIZE));                                          \
381   }                                                                            \
382 }
383 
384 VERTICAL_PRED(dst, top, 8)
385 VERTICAL_PRED(dst, top, 16)
386 
387 #undef VERTICAL_PRED
388 
389 #define HORIZONTAL_PRED(DST, LEFT, SIZE)                                       \
390 static WEBP_INLINE void HorizontalPred##SIZE(                                  \
391     uint8_t* WEBP_RESTRICT (DST), const uint8_t* WEBP_RESTRICT (LEFT)) {       \
392   if (LEFT) {                                                                  \
393     int j;                                                                     \
394     for (j = 0; j < (SIZE); ++j) {                                             \
395       memset((DST) + j * BPS, (LEFT)[j], (SIZE));                              \
396     }                                                                          \
397   } else {                                                                     \
398     FILL_8_OR_16((DST), 129, (SIZE));                                          \
399   }                                                                            \
400 }
401 
402 HORIZONTAL_PRED(dst, left, 8)
403 HORIZONTAL_PRED(dst, left, 16)
404 
405 #undef HORIZONTAL_PRED
406 
// Processes the eight bytes held in temp0 and temp1: each byte is widened to
// a halfword, the per-halfword bias in leftY_1 is added, and the results are
// packed back into bytes in temp0 / temp1. The saturating left shift by 7
// followed by the saturating unsigned pack (precrqu_s.qb.ph) is what limits
// each result to the byte range. temp2 and temp3 are scratch registers.
#define CLIPPING()                                                             \
  "preceu.ph.qbl   %[temp2],   %[temp0]                  \n\t"                 \
  "preceu.ph.qbr   %[temp0],   %[temp0]                  \n\t"                 \
  "preceu.ph.qbl   %[temp3],   %[temp1]                  \n\t"                 \
  "preceu.ph.qbr   %[temp1],   %[temp1]                  \n\t"                 \
  "addu.ph         %[temp2],   %[temp2],   %[leftY_1]    \n\t"                 \
  "addu.ph         %[temp0],   %[temp0],   %[leftY_1]    \n\t"                 \
  "addu.ph         %[temp3],   %[temp3],   %[leftY_1]    \n\t"                 \
  "addu.ph         %[temp1],   %[temp1],   %[leftY_1]    \n\t"                 \
  "shll_s.ph       %[temp2],   %[temp2],   7             \n\t"                 \
  "shll_s.ph       %[temp0],   %[temp0],   7             \n\t"                 \
  "shll_s.ph       %[temp3],   %[temp3],   7             \n\t"                 \
  "shll_s.ph       %[temp1],   %[temp1],   7             \n\t"                 \
  "precrqu_s.qb.ph %[temp0],   %[temp2],   %[temp0]      \n\t"                 \
  "precrqu_s.qb.ph %[temp1],   %[temp3],   %[temp1]      \n\t"
422 
// Writes one row of SIZE (8 or 16) pixels to DST, each computed as
// TOP[i] + LEFT[y] - LEFT[-1] and packed back to a byte by CLIPPING().
// Not self-contained: it reads the row index 'y' and the packed value
// 'left_1' (LEFT[-1] in both halfwords) from the enclosing scope, which
// CLIP_TO_DST provides. leftY_1 ends up holding LEFT[y] - LEFT[-1] in both
// halfwords. TOP and DST are accessed with unaligned word loads/stores.
#define CLIP_8B_TO_DST(DST, LEFT, TOP, SIZE) do {                              \
  int leftY_1 = ((int)(LEFT)[y] << 16) + (LEFT)[y];                            \
  int temp0, temp1, temp2, temp3;                                              \
  __asm__ volatile (                                                           \
    "replv.ph        %[leftY_1], %[leftY_1]              \n\t"                 \
    "ulw             %[temp0],   0(%[top])               \n\t"                 \
    "ulw             %[temp1],   4(%[top])               \n\t"                 \
    "subu.ph         %[leftY_1], %[leftY_1], %[left_1]   \n\t"                 \
    CLIPPING()                                                                 \
    "usw             %[temp0],   0(%[dst])               \n\t"                 \
    "usw             %[temp1],   4(%[dst])               \n\t"                 \
  ".if " #SIZE " == 16                                   \n\t"                 \
    "ulw             %[temp0],   8(%[top])               \n\t"                 \
    "ulw             %[temp1],   12(%[top])              \n\t"                 \
    CLIPPING()                                                                 \
    "usw             %[temp0],   8(%[dst])               \n\t"                 \
    "usw             %[temp1],   12(%[dst])              \n\t"                 \
  ".endif                                                \n\t"                 \
    : [leftY_1]"+&r"(leftY_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),       \
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3)                                 \
    : [left_1]"r"(left_1), [top]"r"((TOP)), [dst]"r"((DST))                    \
    : "memory"                                                                 \
  );                                                                           \
} while (0)
447 
// Runs CLIP_8B_TO_DST for rows 0..SIZE-1, declaring the 'y' and 'left_1'
// variables that macro expects. left_1 holds LEFT[-1] in both halfwords.
// Note that DST itself is advanced by BPS after every row, so the argument
// must be a modifiable lvalue and is left pointing SIZE rows further on.
#define CLIP_TO_DST(DST, LEFT, TOP, SIZE) do {                                 \
  int y;                                                                       \
  const int left_1 = ((int)(LEFT)[-1] << 16) + (LEFT)[-1];                     \
  for (y = 0; y < (SIZE); ++y) {                                               \
    CLIP_8B_TO_DST((DST), (LEFT), (TOP), (SIZE));                              \
    (DST) += BPS;                                                              \
  }                                                                            \
} while (0)
456 
457 #define TRUE_MOTION(DST, LEFT, TOP, SIZE)                                      \
458 static WEBP_INLINE void TrueMotion##SIZE(uint8_t* WEBP_RESTRICT (DST),         \
459                                          const uint8_t* WEBP_RESTRICT (LEFT),  \
460                                          const uint8_t* WEBP_RESTRICT (TOP)) { \
461   if ((LEFT) != NULL) {                                                        \
462     if ((TOP) != NULL) {                                                       \
463       CLIP_TO_DST((DST), (LEFT), (TOP), (SIZE));                               \
464     } else {                                                                   \
465       HorizontalPred##SIZE((DST), (LEFT));                                     \
466     }                                                                          \
467   } else {                                                                     \
468     /* true motion without left samples (hence: with default 129 value)    */  \
469     /* is equivalent to VE prediction where you just copy the top samples. */  \
470     /* Note that if top samples are not available, the default value is    */  \
471     /* then 129, and not 127 as in the VerticalPred case.                  */  \
472     if ((TOP) != NULL) {                                                       \
473       VerticalPred##SIZE((DST), (TOP));                                        \
474     } else {                                                                   \
475       FILL_8_OR_16((DST), 129, (SIZE));                                        \
476     }                                                                          \
477   }                                                                            \
478 }
479 
480 TRUE_MOTION(dst, left, top, 8)
481 TRUE_MOTION(dst, left, top, 16)
482 
483 #undef TRUE_MOTION
484 #undef CLIP_TO_DST
485 #undef CLIP_8B_TO_DST
486 #undef CLIPPING
487 
// Computes the DC value for a 16-sample-wide block from the 'top' and 'left'
// sample rows and passes it to FILL_8_OR_16(dst, DC, 16).
//   - top and left both non-NULL:
//       DC = (sum(top[0..15]) + sum(left[0..15]) + 16) >> 5
//   - exactly one of them non-NULL: the available sum is counted twice before
//     the same rounded shift.
//   - both NULL: DC = 0x80.
// NOTE(review): LOAD_WITH_OFFSET_X4 comes from mips_macro.h and presumably
// loads the four 32-bit words at byte offsets 0, 4, 8 and 12 of its pointer
// argument; FILL_8_OR_16 is defined earlier in this file and presumably
// writes the value to every pixel of the block -- confirm both.
DCMode16(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT left,const uint8_t * WEBP_RESTRICT top)488 static WEBP_INLINE void DCMode16(uint8_t* WEBP_RESTRICT dst,
489                                  const uint8_t* WEBP_RESTRICT left,
490                                  const uint8_t* WEBP_RESTRICT top) {
491   int DC, DC1;
492   int temp0, temp1, temp2, temp3;
493 
494   __asm__ volatile(
    // No top row: go to label 2 (left-only / no-samples handling).
495     "beqz        %[top],   2f                  \n\t"
496     LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, top,
497                         0, 4, 8, 12,
498                         0, 0, 0, 0,
499                         0)
    // raddu.w.qb adds up the four unsigned bytes of a register.
500     "raddu.w.qb  %[temp0], %[temp0]            \n\t"
501     "raddu.w.qb  %[temp1], %[temp1]            \n\t"
502     "raddu.w.qb  %[temp2], %[temp2]            \n\t"
503     "raddu.w.qb  %[temp3], %[temp3]            \n\t"
504     "addu        %[temp0], %[temp0], %[temp1]  \n\t"
505     "addu        %[temp2], %[temp2], %[temp3]  \n\t"
506     "addu        %[DC],    %[temp0], %[temp2]  \n\t"
    // Default the second term to the top sum; it is replaced by the left sum
    // below only when 'left' is available.
507     "move        %[DC1],   %[DC]               \n\t"
508     "beqz        %[left],  1f                  \n\t"
509     LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, left,
510                         0, 4, 8, 12,
511                         0, 0, 0, 0,
512                         0)
513     "raddu.w.qb  %[temp0], %[temp0]            \n\t"
514     "raddu.w.qb  %[temp1], %[temp1]            \n\t"
515     "raddu.w.qb  %[temp2], %[temp2]            \n\t"
516     "raddu.w.qb  %[temp3], %[temp3]            \n\t"
517     "addu        %[temp0], %[temp0], %[temp1]  \n\t"
518     "addu        %[temp2], %[temp2], %[temp3]  \n\t"
519     "addu        %[DC1],   %[temp0], %[temp2]  \n\t"
520   "1:                                          \n\t"
521     "addu        %[DC],   %[DC],     %[DC1]    \n\t"
522     "j           3f                            \n\t"
    // Label 2: top is NULL. Use twice the left sum, or 0x80 if left is NULL too.
523   "2:                                          \n\t"
524     "beqz        %[left],  4f                  \n\t"
525     LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, left,
526                         0, 4, 8, 12,
527                         0, 0, 0, 0,
528                         0)
529     "raddu.w.qb  %[temp0], %[temp0]            \n\t"
530     "raddu.w.qb  %[temp1], %[temp1]            \n\t"
531     "raddu.w.qb  %[temp2], %[temp2]            \n\t"
532     "raddu.w.qb  %[temp3], %[temp3]            \n\t"
533     "addu        %[temp0], %[temp0], %[temp1]  \n\t"
534     "addu        %[temp2], %[temp2], %[temp3]  \n\t"
535     "addu        %[DC],    %[temp0], %[temp2]  \n\t"
536     "addu        %[DC],    %[DC],    %[DC]     \n\t"
    // Label 3: rounded arithmetic shift, i.e. (DC + 16) >> 5.
537   "3:                                          \n\t"
538     "shra_r.w    %[DC],    %[DC],    5         \n\t"
539     "j           5f                            \n\t"
    // Label 4: neither row is available.
540   "4:                                          \n\t"
541     "li          %[DC],    0x80                \n\t"
542   "5:                                          \n\t"
543     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [DC]"=&r"(DC),
544       [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [DC1]"=&r"(DC1)
545     : [left]"r"(left), [top]"r"(top)
546     : "memory"
547   );
548 
549   FILL_8_OR_16(dst, DC, 16);
550 }
551 
// Computes the DC value for an 8-sample-wide block from the 'top' and 'left'
// sample rows and passes it to FILL_8_OR_16(dst, DC, 8).
//   - top and left both non-NULL:
//       DC = (sum(top[0..7]) + sum(left[0..7]) + 8) >> 4
//   - exactly one of them non-NULL: the available sum is counted twice before
//     the same rounded shift.
//   - both NULL: DC = 0x80.
// NOTE(review): FILL_8_OR_16 is defined earlier in this file and presumably
// writes the value to every pixel of the block -- confirm.
DCMode8(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT left,const uint8_t * WEBP_RESTRICT top)552 static WEBP_INLINE void DCMode8(uint8_t* WEBP_RESTRICT dst,
553                                 const uint8_t* WEBP_RESTRICT left,
554                                 const uint8_t* WEBP_RESTRICT top) {
555   int DC, DC1;
556   int temp0, temp1, temp2, temp3;
557 
558   __asm__ volatile(
    // No top row: go to label 2 (left-only / no-samples handling).
559     "beqz        %[top],   2f                  \n\t"
    // Unaligned loads of top[0..3] and top[4..7]; raddu.w.qb then adds up the
    // four unsigned bytes of each word.
560     "ulw         %[temp0], 0(%[top])           \n\t"
561     "ulw         %[temp1], 4(%[top])           \n\t"
562     "raddu.w.qb  %[temp0], %[temp0]            \n\t"
563     "raddu.w.qb  %[temp1], %[temp1]            \n\t"
564     "addu        %[DC],    %[temp0], %[temp1]  \n\t"
    // Default the second term to the top sum; replaced by the left sum below
    // only when 'left' is available.
565     "move        %[DC1],   %[DC]               \n\t"
566     "beqz        %[left],  1f                  \n\t"
567     "ulw         %[temp2], 0(%[left])          \n\t"
568     "ulw         %[temp3], 4(%[left])          \n\t"
569     "raddu.w.qb  %[temp2], %[temp2]            \n\t"
570     "raddu.w.qb  %[temp3], %[temp3]            \n\t"
571     "addu        %[DC1],   %[temp2], %[temp3]  \n\t"
572   "1:                                          \n\t"
573     "addu        %[DC],    %[DC],    %[DC1]    \n\t"
574     "j           3f                            \n\t"
    // Label 2: top is NULL. Use twice the left sum, or 0x80 if left is NULL too.
575   "2:                                          \n\t"
576     "beqz        %[left],  4f                  \n\t"
577     "ulw         %[temp2], 0(%[left])          \n\t"
578     "ulw         %[temp3], 4(%[left])          \n\t"
579     "raddu.w.qb  %[temp2], %[temp2]            \n\t"
580     "raddu.w.qb  %[temp3], %[temp3]            \n\t"
581     "addu        %[DC],    %[temp2], %[temp3]  \n\t"
582     "addu        %[DC],    %[DC],    %[DC]     \n\t"
    // Label 3: rounded arithmetic shift, i.e. (DC + 8) >> 4.
583   "3:                                          \n\t"
584     "shra_r.w    %[DC], %[DC], 4               \n\t"
585     "j           5f                            \n\t"
    // Label 4: neither row is available.
586   "4:                                          \n\t"
587     "li          %[DC], 0x80                   \n\t"
588   "5:                                          \n\t"
589     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [DC]"=&r"(DC),
590       [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [DC1]"=&r"(DC1)
591     : [left]"r"(left), [top]"r"(top)
592     : "memory"
593   );
594 
595   FILL_8_OR_16(dst, DC, 8);
596 }
597 
// 4x4 DC prediction: averages the four top samples top[0..3] and the four
// left samples top[-5..-2], i.e. dc = (sum of the 8 bytes + 4) >> 3, and
// writes that byte to all 4x4 positions of 'dst' (rows are BPS bytes apart).
// Unlike DCMode8/DCMode16 there is no NULL handling: 'top' is dereferenced
// unconditionally.
DC4(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT top)598 static void DC4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
599   int temp0, temp1;
600   __asm__ volatile(
601     "ulw          %[temp0],   0(%[top])               \n\t"
602     "ulw          %[temp1],   -5(%[top])              \n\t"
    // raddu.w.qb adds up the four unsigned bytes of each loaded word.
603     "raddu.w.qb   %[temp0],   %[temp0]                \n\t"
604     "raddu.w.qb   %[temp1],   %[temp1]                \n\t"
605     "addu         %[temp0],   %[temp0],    %[temp1]   \n\t"
    // Round to nearest: (sum + 4) >> 3.
606     "addiu        %[temp0],   %[temp0],    4          \n\t"
607     "srl          %[temp0],   %[temp0],    3          \n\t"
    // Replicate the low byte into all four byte lanes, then store one word
    // per row.
608     "replv.qb     %[temp0],   %[temp0]                \n\t"
609     "usw          %[temp0],   0*" XSTR(BPS) "(%[dst]) \n\t"
610     "usw          %[temp0],   1*" XSTR(BPS) "(%[dst]) \n\t"
611     "usw          %[temp0],   2*" XSTR(BPS) "(%[dst]) \n\t"
612     "usw          %[temp0],   3*" XSTR(BPS) "(%[dst]) \n\t"
613     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
614     : [top]"r"(top), [dst]"r"(dst)
615     : "memory"
616   );
617 }
618 
// 4x4 prediction built from top[0..3], the left word at top[-5..-2] and the
// top-left sample top[-1]. For every row the code forms, in 16-bit lanes,
//   top[x] + (left_sample_for_row - top[-1])
// and narrows the four results to bytes before storing one word per row
// (rows are BPS bytes apart).
// NOTE(review): the 'shll_s.ph ..., 7' + 'precrqu_s.qb.ph' pair appears to
// implement the clamp to [0, 255]; which left byte feeds which row depends on
// the byte order of the 'ulw' load -- confirm both against the C reference.
TM4(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT top)619 static void TM4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
620   int a10, a32, temp0, temp1, temp2, temp3, temp4, temp5;
  // Mask keeping the low byte of each 16-bit lane.
621   const int c35 = 0xff00ff;
622   __asm__ volatile (
623     "lbu              %[temp1],  0(%[top])                     \n\t"
624     "lbu              %[a10],    1(%[top])                     \n\t"
625     "lbu              %[temp2],  2(%[top])                     \n\t"
626     "lbu              %[a32],    3(%[top])                     \n\t"
627     "ulw              %[temp0],  -5(%[top])                    \n\t"
628     "lbu              %[temp4],  -1(%[top])                    \n\t"
    // a10 = (top[1] << 16) | top[0], a32 = (top[3] << 16) | top[2].
629     "append           %[a10],    %[temp1],   16                \n\t"
630     "append           %[a32],    %[temp2],   16                \n\t"
    // Top-left sample in both 16-bit lanes.
631     "replv.ph         %[temp4],  %[temp4]                      \n\t"
    // Split the left word into its odd bytes (temp1) and even bytes (temp0),
    // one per 16-bit lane, then subtract the top-left sample from each.
632     "shrl.ph          %[temp1],  %[temp0],   8                 \n\t"
633     "and              %[temp0],  %[temp0],   %[c35]            \n\t"
634     "subu.ph          %[temp1],  %[temp1],   %[temp4]          \n\t"
635     "subu.ph          %[temp0],  %[temp0],   %[temp4]          \n\t"
    // Broadcast each of the four (left - top_left) deltas to both lanes.
636     "srl              %[temp2],  %[temp1],   16                \n\t"
637     "srl              %[temp3],  %[temp0],   16                \n\t"
638     "replv.ph         %[temp2],  %[temp2]                      \n\t"
639     "replv.ph         %[temp3],  %[temp3]                      \n\t"
640     "replv.ph         %[temp4],  %[temp1]                      \n\t"
641     "replv.ph         %[temp5],  %[temp0]                      \n\t"
    // First two rows: add the top samples, then narrow to bytes.
642     "addu.ph          %[temp0],  %[temp3],   %[a10]            \n\t"
643     "addu.ph          %[temp1],  %[temp3],   %[a32]            \n\t"
644     "addu.ph          %[temp3],  %[temp2],   %[a10]            \n\t"
645     "addu.ph          %[temp2],  %[temp2],   %[a32]            \n\t"
646     "shll_s.ph        %[temp0],  %[temp0],   7                 \n\t"
647     "shll_s.ph        %[temp1],  %[temp1],   7                 \n\t"
648     "shll_s.ph        %[temp3],  %[temp3],   7                 \n\t"
649     "shll_s.ph        %[temp2],  %[temp2],   7                 \n\t"
650     "precrqu_s.qb.ph  %[temp0],  %[temp1],   %[temp0]          \n\t"
651     "precrqu_s.qb.ph  %[temp1],  %[temp2],   %[temp3]          \n\t"
    // Last two rows, same scheme.
652     "addu.ph          %[temp2],  %[temp5],   %[a10]            \n\t"
653     "addu.ph          %[temp3],  %[temp5],   %[a32]            \n\t"
654     "addu.ph          %[temp5],  %[temp4],   %[a10]            \n\t"
655     "addu.ph          %[temp4],  %[temp4],   %[a32]            \n\t"
656     "shll_s.ph        %[temp2],  %[temp2],   7                 \n\t"
657     "shll_s.ph        %[temp3],  %[temp3],   7                 \n\t"
658     "shll_s.ph        %[temp4],  %[temp4],   7                 \n\t"
659     "shll_s.ph        %[temp5],  %[temp5],   7                 \n\t"
660     "precrqu_s.qb.ph  %[temp2],  %[temp3],   %[temp2]          \n\t"
661     "precrqu_s.qb.ph  %[temp3],  %[temp4],   %[temp5]          \n\t"
662     "usw              %[temp1],  0*" XSTR(BPS) "(%[dst])       \n\t"
663     "usw              %[temp0],  1*" XSTR(BPS) "(%[dst])       \n\t"
664     "usw              %[temp3],  2*" XSTR(BPS) "(%[dst])       \n\t"
665     "usw              %[temp2],  3*" XSTR(BPS) "(%[dst])       \n\t"
666     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
667       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
668       [a10]"=&r"(a10), [a32]"=&r"(a32)
669     : [c35]"r"(c35), [top]"r"(top), [dst]"r"(dst)
670     : "memory"
671   );
672 }
673 
// 4x4 prediction from the top row only: reads top[-1..4], applies a rounded
// 3-tap filter (a + 2*b + c + 2) >> 2 to produce four bytes, and stores that
// same 4-byte row to all four rows of 'dst' (rows are BPS bytes apart).
// NOTE(review): assuming a little-endian load, output column x is
// (top[x-1] + 2*top[x] + top[x+1] + 2) >> 2 -- confirm for the target.
VE4(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT top)674 static void VE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
675   int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
676   __asm__ volatile(
677     "ulw             %[temp0],   -1(%[top])              \n\t"
678     "ulh             %[temp1],   3(%[top])               \n\t"
    // Zero-extend the loaded bytes into 16-bit lanes.
679     "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
680     "preceu.ph.qbl   %[temp3],   %[temp0]                \n\t"
681     "preceu.ph.qbr   %[temp4],   %[temp1]                \n\t"
    // packrl.ph builds the "centre" samples (shifted by one lane); they are
    // doubled and added to both neighbours.
682     "packrl.ph       %[temp5],   %[temp3],    %[temp2]   \n\t"
683     "packrl.ph       %[temp6],   %[temp4],    %[temp3]   \n\t"
684     "shll.ph         %[temp5],   %[temp5],    1          \n\t"
685     "shll.ph         %[temp6],   %[temp6],    1          \n\t"
686     "addq.ph         %[temp2],   %[temp5],    %[temp2]   \n\t"
687     "addq.ph         %[temp6],   %[temp6],    %[temp4]   \n\t"
688     "addq.ph         %[temp2],   %[temp2],    %[temp3]   \n\t"
689     "addq.ph         %[temp6],   %[temp6],    %[temp3]   \n\t"
    // Rounded shift by 2, then pack the low byte of each lane into one word.
690     "shra_r.ph       %[temp2],   %[temp2],    2          \n\t"
691     "shra_r.ph       %[temp6],   %[temp6],    2          \n\t"
692     "precr.qb.ph     %[temp4],   %[temp6],    %[temp2]   \n\t"
693     "usw             %[temp4],   0*" XSTR(BPS) "(%[dst]) \n\t"
694     "usw             %[temp4],   1*" XSTR(BPS) "(%[dst]) \n\t"
695     "usw             %[temp4],   2*" XSTR(BPS) "(%[dst]) \n\t"
696     "usw             %[temp4],   3*" XSTR(BPS) "(%[dst]) \n\t"
697     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
698       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
699       [temp6]"=&r"(temp6)
700     : [top]"r"(top), [dst]"r"(dst)
701     : "memory"
702   );
703 }
704 
// 4x4 prediction from the left column and top-left sample only: reads
// top[-5..-1], applies a rounded 3-tap filter (a + 2*b + c + 2) >> 2 to get
// one value per row, and fills each row of 'dst' with its value replicated
// four times (rows are BPS bytes apart).
// NOTE(review): assuming a little-endian load, rows 0..3 get the filtered
// values centred on top[-2], top[-3], top[-4], top[-5] respectively, with
// top[-5] repeated as the missing neighbour for the last row -- confirm.
HE4(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT top)705 static void HE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
706   int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
707   __asm__ volatile(
708     "ulw             %[temp0],   -4(%[top])              \n\t"
709     "lbu             %[temp1],   -5(%[top])              \n\t"
    // Zero-extend the four loaded bytes into 16-bit lanes; temp4 holds
    // top[-5] in both lanes.
710     "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
711     "preceu.ph.qbl   %[temp3],   %[temp0]                \n\t"
712     "replv.ph        %[temp4],   %[temp1]                \n\t"
    // Centre taps (shifted by one lane), doubled before being summed with
    // both neighbours.
713     "packrl.ph       %[temp5],   %[temp3],    %[temp2]   \n\t"
714     "packrl.ph       %[temp6],   %[temp2],    %[temp4]   \n\t"
715     "shll.ph         %[temp5],   %[temp5],    1          \n\t"
716     "shll.ph         %[temp6],   %[temp6],    1          \n\t"
717     "addq.ph         %[temp3],   %[temp3],    %[temp5]   \n\t"
718     "addq.ph         %[temp3],   %[temp3],    %[temp2]   \n\t"
719     "addq.ph         %[temp2],   %[temp2],    %[temp6]   \n\t"
720     "addq.ph         %[temp2],   %[temp2],    %[temp4]   \n\t"
721     "shra_r.ph       %[temp3],   %[temp3],    2          \n\t"
722     "shra_r.ph       %[temp2],   %[temp2],    2          \n\t"
    // Replicate each of the four results across a whole word: the low lanes
    // first, then (after shifting down) the high lanes.
723     "replv.qb        %[temp0],   %[temp3]                \n\t"
724     "replv.qb        %[temp1],   %[temp2]                \n\t"
725     "srl             %[temp3],   %[temp3],    16         \n\t"
726     "srl             %[temp2],   %[temp2],    16         \n\t"
727     "replv.qb        %[temp3],   %[temp3]                \n\t"
728     "replv.qb        %[temp2],   %[temp2]                \n\t"
729     "usw             %[temp3],   0*" XSTR(BPS) "(%[dst]) \n\t"
730     "usw             %[temp0],   1*" XSTR(BPS) "(%[dst]) \n\t"
731     "usw             %[temp2],   2*" XSTR(BPS) "(%[dst]) \n\t"
732     "usw             %[temp1],   3*" XSTR(BPS) "(%[dst]) \n\t"
733     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
734       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
735       [temp6]"=&r"(temp6)
736     : [top]"r"(top), [dst]"r"(dst)
737     : "memory"
738   );
739 }
740 
// 4x4 prediction that reads top[-5..3] (left samples, top-left and top row)
// and produces rounded 3-tap filtered samples (a + 2*b + c + 2) >> 2.
// Rows 3 and 1 are packed directly from the filtered lanes; rows 2 and 0 are
// then derived from them by shifting the row word right by one byte and
// inserting one new filtered sample in the top byte ('prepend ..., 8').
// Rows are stored BPS bytes apart.
// NOTE(review): the name suggests the "down-right" diagonal mode; the exact
// sample-to-pixel mapping depends on load byte order -- confirm against the
// C reference.
RD4(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT top)741 static void RD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
742   int temp0, temp1, temp2, temp3, temp4, temp5;
743   int temp6, temp7, temp8, temp9, temp10, temp11;
744   __asm__ volatile(
745     "ulw             %[temp0],    -5(%[top])               \n\t"
746     "ulw             %[temp1],    -1(%[top])               \n\t"
    // Zero-extend the eight loaded bytes into 16-bit lanes.
747     "preceu.ph.qbl   %[temp2],    %[temp0]                 \n\t"
748     "preceu.ph.qbr   %[temp3],    %[temp0]                 \n\t"
749     "preceu.ph.qbr   %[temp4],    %[temp1]                 \n\t"
750     "preceu.ph.qbl   %[temp5],    %[temp1]                 \n\t"
    // Centre taps: neighbouring lane pairs shifted by one sample.
751     "packrl.ph       %[temp6],    %[temp2],    %[temp3]    \n\t"
752     "packrl.ph       %[temp7],    %[temp4],    %[temp2]    \n\t"
753     "packrl.ph       %[temp8],    %[temp5],    %[temp4]    \n\t"
    // temp9/temp10/temp11 = (a + 2*b + c), rounded and shifted right by 2.
754     "shll.ph         %[temp6],    %[temp6],    1           \n\t"
755     "addq.ph         %[temp9],    %[temp2],    %[temp6]    \n\t"
756     "shll.ph         %[temp7],    %[temp7],    1           \n\t"
757     "addq.ph         %[temp9],    %[temp9],    %[temp3]    \n\t"
758     "shll.ph         %[temp8],    %[temp8],    1           \n\t"
759     "shra_r.ph       %[temp9],    %[temp9],    2           \n\t"
760     "addq.ph         %[temp10],   %[temp4],    %[temp7]    \n\t"
761     "addq.ph         %[temp11],   %[temp5],    %[temp8]    \n\t"
762     "addq.ph         %[temp10],   %[temp10],   %[temp2]    \n\t"
763     "addq.ph         %[temp11],   %[temp11],   %[temp4]    \n\t"
764     "shra_r.ph       %[temp10],   %[temp10],   2           \n\t"
765     "shra_r.ph       %[temp11],   %[temp11],   2           \n\t"
    // Scalar tail: temp0 = (top[3] + 2*top[2] + top[1] + 2) >> 2.
766     "lbu             %[temp0],    3(%[top])                \n\t"
767     "lbu             %[temp1],    2(%[top])                \n\t"
768     "lbu             %[temp2],    1(%[top])                \n\t"
769     "sll             %[temp1],    %[temp1],    1           \n\t"
770     "addu            %[temp0],    %[temp0],    %[temp1]    \n\t"
771     "addu            %[temp0],    %[temp0],    %[temp2]    \n\t"
772     "precr.qb.ph     %[temp9],    %[temp10],   %[temp9]    \n\t"
773     "shra_r.w        %[temp0],    %[temp0],    2           \n\t"
774     "precr.qb.ph     %[temp10],   %[temp11],   %[temp10]   \n\t"
775     "usw             %[temp9],    3*" XSTR(BPS) "(%[dst])  \n\t"
776     "usw             %[temp10],   1*" XSTR(BPS) "(%[dst])  \n\t"
    // Derive rows 2 and 0 from rows 3 and 1: drop the lowest byte and bring
    // in one new sample as the highest byte.
777     "prepend         %[temp9],    %[temp11],   8           \n\t"
778     "prepend         %[temp10],   %[temp0],    8           \n\t"
779     "usw             %[temp9],    2*" XSTR(BPS) "(%[dst])  \n\t"
780     "usw             %[temp10],   0*" XSTR(BPS) "(%[dst])  \n\t"
781     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
782       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
783       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
784       [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11)
785     : [top]"r"(top), [dst]"r"(dst)
786     : "memory"
787   );
788 }
789 
// 4x4 prediction that reads top[-4..3] and mixes two kinds of filtered
// samples: rounded 2-tap averages (addqh_r.ph, i.e. (a + b + 1) >> 1) and
// rounded 3-tap values ((a + 2*b + c + 2) >> 2, via shll/addu/shra_r).
// Rows 0 and 1 are packed directly; rows 2 and 3 are rows 0 and 1 shifted
// left by one byte with one extra filtered sample appended as the low byte.
// Rows are stored BPS bytes apart.
// NOTE(review): the name suggests the "vertical-right" mode; the exact
// sample-to-pixel mapping depends on load byte order -- confirm against the
// C reference.
VR4(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT top)790 static void VR4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
791   int temp0, temp1, temp2, temp3, temp4;
792   int temp5, temp6, temp7, temp8, temp9;
793   __asm__ volatile (
794     "ulw              %[temp0],   -4(%[top])              \n\t"
795     "ulw              %[temp1],   0(%[top])               \n\t"
    // Zero-extend bytes to 16-bit lanes; the 'a' variants pick alternating
    // bytes (even/odd) instead of adjacent ones.
796     "preceu.ph.qbl    %[temp2],   %[temp0]                \n\t"
797     "preceu.ph.qbr    %[temp0],   %[temp0]                \n\t"
798     "preceu.ph.qbla   %[temp3],   %[temp1]                \n\t"
799     "preceu.ph.qbra   %[temp1],   %[temp1]                \n\t"
800     "packrl.ph        %[temp7],   %[temp3],    %[temp2]   \n\t"
    // 2-tap rounded averages.
801     "addqh_r.ph       %[temp4],   %[temp1],    %[temp3]   \n\t"
802     "move             %[temp6],   %[temp1]                \n\t"
803     "append           %[temp1],   %[temp2],    16         \n\t"
804     "shll.ph          %[temp9],   %[temp6],    1          \n\t"
805     "addqh_r.ph       %[temp5],   %[temp7],    %[temp6]   \n\t"
    // 3-tap sums: neighbours plus twice the centre, then rounded >> 2.
806     "shll.ph          %[temp8],   %[temp7],    1          \n\t"
807     "addu.ph          %[temp3],   %[temp7],    %[temp3]   \n\t"
808     "addu.ph          %[temp1],   %[temp1],    %[temp6]   \n\t"
809     "packrl.ph        %[temp7],   %[temp2],    %[temp0]   \n\t"
810     "addu.ph          %[temp6],   %[temp0],    %[temp2]   \n\t"
811     "addu.ph          %[temp3],   %[temp3],    %[temp9]   \n\t"
812     "addu.ph          %[temp1],   %[temp1],    %[temp8]   \n\t"
813     "shll.ph          %[temp7],   %[temp7],    1          \n\t"
814     "shra_r.ph        %[temp3],   %[temp3],    2          \n\t"
815     "shra_r.ph        %[temp1],   %[temp1],    2          \n\t"
816     "addu.ph          %[temp6],   %[temp6],    %[temp7]   \n\t"
817     "shra_r.ph        %[temp6],   %[temp6],    2          \n\t"
    // Interleave the lanes and narrow to bytes to form rows 0 and 1.
818     "precrq.ph.w      %[temp8],   %[temp4],    %[temp5]   \n\t"
819     "append           %[temp4],   %[temp5],    16         \n\t"
820     "precrq.ph.w      %[temp2],   %[temp3],    %[temp1]   \n\t"
821     "append           %[temp3],   %[temp1],    16         \n\t"
822     "precr.qb.ph      %[temp8],   %[temp8],    %[temp4]   \n\t"
823     "precr.qb.ph      %[temp3],   %[temp2],    %[temp3]   \n\t"
824     "usw              %[temp8],   0*" XSTR(BPS) "(%[dst]) \n\t"
825     "usw              %[temp3],   1*" XSTR(BPS) "(%[dst]) \n\t"
    // Rows 3 and 2: shift rows 1 and 0 left by a byte and append the low
    // byte of temp6 (low lane for row 3, high lane for row 2).
826     "append           %[temp3],   %[temp6],    8          \n\t"
827     "srl              %[temp6],   %[temp6],    16         \n\t"
828     "append           %[temp8],   %[temp6],    8          \n\t"
829     "usw              %[temp3],   3*" XSTR(BPS) "(%[dst]) \n\t"
830     "usw              %[temp8],   2*" XSTR(BPS) "(%[dst]) \n\t"
831     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
832       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
833       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
834       [temp9]"=&r"(temp9)
835     : [top]"r"(top), [dst]"r"(dst)
836     : "memory"
837   );
838 }
839 
// 4x4 prediction that reads only the top and top-right samples top[0..7] and
// produces rounded 3-tap filtered samples (a + 2*b + c + 2) >> 2.
// Rows 0 and 2 are packed directly from the filtered lanes; rows 1 and 3 are
// derived from them by shifting the row word right by one byte and inserting
// one new filtered sample in the top byte ('prepend ..., 8').
// Rows are stored BPS bytes apart.
// NOTE(review): the name suggests the "down-left" diagonal mode; on a
// little-endian load the scalar tail evaluates to
// (top[6] + 3*top[7] + 2) >> 2 -- confirm against the C reference.
LD4(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT top)840 static void LD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
841   int temp0, temp1, temp2, temp3, temp4, temp5;
842   int temp6, temp7, temp8, temp9, temp10, temp11;
843   __asm__ volatile(
844     "ulw             %[temp0],    0(%[top])               \n\t"
845     "ulw             %[temp1],    4(%[top])               \n\t"
    // Zero-extend the eight loaded bytes into 16-bit lanes.
846     "preceu.ph.qbl   %[temp2],    %[temp0]                \n\t"
847     "preceu.ph.qbr   %[temp3],    %[temp0]                \n\t"
848     "preceu.ph.qbr   %[temp4],    %[temp1]                \n\t"
849     "preceu.ph.qbl   %[temp5],    %[temp1]                \n\t"
    // Centre taps: neighbouring lane pairs shifted by one sample.
850     "packrl.ph       %[temp6],    %[temp2],    %[temp3]   \n\t"
851     "packrl.ph       %[temp7],    %[temp4],    %[temp2]   \n\t"
852     "packrl.ph       %[temp8],    %[temp5],    %[temp4]   \n\t"
    // temp9/temp10/temp11 = (a + 2*b + c), rounded and shifted right by 2.
853     "shll.ph         %[temp6],    %[temp6],    1          \n\t"
854     "addq.ph         %[temp9],    %[temp2],    %[temp6]   \n\t"
855     "shll.ph         %[temp7],    %[temp7],    1          \n\t"
856     "addq.ph         %[temp9],    %[temp9],    %[temp3]   \n\t"
857     "shll.ph         %[temp8],    %[temp8],    1          \n\t"
858     "shra_r.ph       %[temp9],    %[temp9],    2          \n\t"
859     "addq.ph         %[temp10],   %[temp4],    %[temp7]   \n\t"
860     "addq.ph         %[temp11],   %[temp5],    %[temp8]   \n\t"
861     "addq.ph         %[temp10],   %[temp10],   %[temp2]   \n\t"
862     "addq.ph         %[temp11],   %[temp11],   %[temp4]   \n\t"
863     "shra_r.ph       %[temp10],   %[temp10],   2          \n\t"
864     "shra_r.ph       %[temp11],   %[temp11],   2          \n\t"
    // Scalar tail: twice the highest byte of the second word plus the sum of
    // its two highest bytes, rounded and shifted right by 2.
865     "srl             %[temp1],    %[temp1],    24         \n\t"
866     "sll             %[temp1],    %[temp1],    1          \n\t"
867     "raddu.w.qb      %[temp5],    %[temp5]                \n\t"
868     "precr.qb.ph     %[temp9],    %[temp10],   %[temp9]   \n\t"
869     "precr.qb.ph     %[temp10],   %[temp11],   %[temp10]  \n\t"
870     "addu            %[temp1],    %[temp1],    %[temp5]   \n\t"
871     "shra_r.w        %[temp1],    %[temp1],    2          \n\t"
872     "usw             %[temp9],    0*" XSTR(BPS) "(%[dst]) \n\t"
873     "usw             %[temp10],   2*" XSTR(BPS) "(%[dst]) \n\t"
    // Derive rows 1 and 3 from rows 0 and 2: drop the lowest byte and bring
    // in one new sample as the highest byte.
874     "prepend         %[temp9],    %[temp11],   8          \n\t"
875     "prepend         %[temp10],   %[temp1],    8          \n\t"
876     "usw             %[temp9],    1*" XSTR(BPS) "(%[dst]) \n\t"
877     "usw             %[temp10],   3*" XSTR(BPS) "(%[dst]) \n\t"
878     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
879       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
880       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
881       [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11)
882     : [top]"r"(top), [dst]"r"(dst)
883     : "memory"
884   );
885 }
886 
// 4x4 prediction that reads only the top and top-right samples top[0..7] and
// mixes rounded 2-tap averages (addqh_r.ph, i.e. (a + b + 1) >> 1) with
// rounded 3-tap values ((a + 2*b + c + 2) >> 2, via shll/addu/shra_r).
// Rows 0 and 1 are packed directly; rows 2 and 3 are rows 0 and 1 shifted
// right by one byte with one extra filtered sample inserted as the top byte.
// Rows are stored BPS bytes apart.
// NOTE(review): the name suggests the "vertical-left" mode; the exact
// sample-to-pixel mapping depends on load byte order -- confirm against the
// C reference.
VL4(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT top)887 static void VL4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
888   int temp0, temp1, temp2, temp3, temp4;
889   int temp5, temp6, temp7, temp8, temp9;
890   __asm__ volatile (
891     "ulw              %[temp0],   0(%[top])               \n\t"
892     "ulw              %[temp1],   4(%[top])               \n\t"
    // Zero-extend bytes to 16-bit lanes; the 'a' variants pick alternating
    // bytes (even/odd) instead of adjacent ones.
893     "preceu.ph.qbla   %[temp2],   %[temp0]                \n\t"
894     "preceu.ph.qbra   %[temp0],   %[temp0]                \n\t"
895     "preceu.ph.qbl    %[temp3],   %[temp1]                \n\t"
896     "preceu.ph.qbr    %[temp1],   %[temp1]                \n\t"
    // 2-tap rounded averages.
897     "addqh_r.ph       %[temp4],   %[temp0],    %[temp2]   \n\t"
898     "packrl.ph        %[temp7],   %[temp1],    %[temp0]   \n\t"
899     "precrq.ph.w      %[temp6],   %[temp1],    %[temp2]   \n\t"
900     "shll.ph          %[temp9],   %[temp2],    1          \n\t"
901     "addqh_r.ph       %[temp5],   %[temp7],    %[temp2]   \n\t"
    // 3-tap sums: neighbours plus twice the centre, then rounded >> 2.
902     "shll.ph          %[temp8],   %[temp7],    1          \n\t"
903     "addu.ph          %[temp2],   %[temp2],    %[temp6]   \n\t"
904     "addu.ph          %[temp0],   %[temp0],    %[temp7]   \n\t"
905     "packrl.ph        %[temp7],   %[temp3],    %[temp1]   \n\t"
906     "addu.ph          %[temp6],   %[temp1],    %[temp3]   \n\t"
907     "addu.ph          %[temp2],   %[temp2],    %[temp8]   \n\t"
908     "addu.ph          %[temp0],   %[temp0],    %[temp9]   \n\t"
909     "shll.ph          %[temp7],   %[temp7],    1          \n\t"
910     "shra_r.ph        %[temp2],   %[temp2],    2          \n\t"
911     "shra_r.ph        %[temp0],   %[temp0],    2          \n\t"
912     "addu.ph          %[temp6],   %[temp6],    %[temp7]   \n\t"
913     "shra_r.ph        %[temp6],   %[temp6],    2          \n\t"
    // Interleave the lanes and narrow to bytes to form rows 0 and 1.
914     "precrq.ph.w      %[temp8],   %[temp5],    %[temp4]   \n\t"
915     "append           %[temp5],   %[temp4],    16         \n\t"
916     "precrq.ph.w      %[temp3],   %[temp2],    %[temp0]   \n\t"
917     "append           %[temp2],   %[temp0],    16         \n\t"
918     "precr.qb.ph      %[temp8],   %[temp8],    %[temp5]   \n\t"
919     "precr.qb.ph      %[temp3],   %[temp3],    %[temp2]   \n\t"
920     "usw              %[temp8],   0*" XSTR(BPS) "(%[dst]) \n\t"
    // Rows 2 and 3: shift rows 0 and 1 right by a byte and insert the low
    // byte of temp6 on top (low lane for row 2, high lane for row 3).
921     "prepend          %[temp8],   %[temp6],    8          \n\t"
922     "usw              %[temp3],   1*" XSTR(BPS) "(%[dst]) \n\t"
923     "srl              %[temp6],   %[temp6],    16         \n\t"
924     "prepend          %[temp3],   %[temp6],    8          \n\t"
925     "usw              %[temp8],   2*" XSTR(BPS) "(%[dst]) \n\t"
926     "usw              %[temp3],   3*" XSTR(BPS) "(%[dst]) \n\t"
927     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
928       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
929       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
930       [temp9]"=&r"(temp9)
931     : [top]"r"(top), [dst]"r"(dst)
932     : "memory"
933   );
934 }
935 
// 4x4 prediction that reads top[-5..2] (left samples, top-left and the first
// three top samples) and mixes rounded 2-tap averages (addqh_r.ph, i.e.
// (a + b + 1) >> 1) with rounded 3-tap values ((a + 2*b + c + 2) >> 2).
// The filtering instructions up to the last 'shra_r.ph' are the same sequence
// VL4 uses, applied to different input bytes; only the final packing of the
// lanes into the four row words differs. Rows are stored BPS bytes apart.
// NOTE(review): the name suggests the "horizontal-down" mode; the exact
// sample-to-pixel mapping depends on load byte order -- confirm against the
// C reference.
HD4(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT top)936 static void HD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
937   int temp0, temp1, temp2, temp3, temp4;
938   int temp5, temp6, temp7, temp8, temp9;
939   __asm__ volatile (
940     "ulw              %[temp0],   -5(%[top])              \n\t"
941     "ulw              %[temp1],   -1(%[top])              \n\t"
    // Zero-extend bytes to 16-bit lanes; the 'a' variants pick alternating
    // bytes (even/odd) instead of adjacent ones.
942     "preceu.ph.qbla   %[temp2],   %[temp0]                \n\t"
943     "preceu.ph.qbra   %[temp0],   %[temp0]                \n\t"
944     "preceu.ph.qbl    %[temp3],   %[temp1]                \n\t"
945     "preceu.ph.qbr    %[temp1],   %[temp1]                \n\t"
    // 2-tap rounded averages.
946     "addqh_r.ph       %[temp4],   %[temp0],    %[temp2]   \n\t"
947     "packrl.ph        %[temp7],   %[temp1],    %[temp0]   \n\t"
948     "precrq.ph.w      %[temp6],   %[temp1],    %[temp2]   \n\t"
949     "shll.ph          %[temp9],   %[temp2],    1          \n\t"
950     "addqh_r.ph       %[temp5],   %[temp7],    %[temp2]   \n\t"
    // 3-tap sums: neighbours plus twice the centre, then rounded >> 2.
951     "shll.ph          %[temp8],   %[temp7],    1          \n\t"
952     "addu.ph          %[temp2],   %[temp2],    %[temp6]   \n\t"
953     "addu.ph          %[temp0],   %[temp0],    %[temp7]   \n\t"
954     "packrl.ph        %[temp7],   %[temp3],    %[temp1]   \n\t"
955     "addu.ph          %[temp6],   %[temp1],    %[temp3]   \n\t"
956     "addu.ph          %[temp2],   %[temp2],    %[temp8]   \n\t"
957     "addu.ph          %[temp0],   %[temp0],    %[temp9]   \n\t"
958     "shll.ph          %[temp7],   %[temp7],    1          \n\t"
959     "shra_r.ph        %[temp2],   %[temp2],    2          \n\t"
960     "shra_r.ph        %[temp0],   %[temp0],    2          \n\t"
961     "addu.ph          %[temp6],   %[temp6],    %[temp7]   \n\t"
962     "shra_r.ph        %[temp6],   %[temp6],    2          \n\t"
    // Rows 0 and 1: combine the high lanes of the filtered vectors.
963     "precrq.ph.w      %[temp1],   %[temp2],    %[temp5]   \n\t"
964     "precrq.ph.w      %[temp3],   %[temp0],    %[temp4]   \n\t"
965     "precr.qb.ph      %[temp7],   %[temp6],    %[temp1]   \n\t"
966     "precr.qb.ph      %[temp6],   %[temp1],    %[temp3]   \n\t"
967     "usw              %[temp7],   0*" XSTR(BPS) "(%[dst]) \n\t"
968     "usw              %[temp6],   1*" XSTR(BPS) "(%[dst]) \n\t"
    // Rows 2 and 3: combine the low lanes (gathered with 'append ..., 16').
969     "append           %[temp2],   %[temp5],    16         \n\t"
970     "append           %[temp0],   %[temp4],    16         \n\t"
971     "precr.qb.ph      %[temp5],   %[temp3],    %[temp2]   \n\t"
972     "precr.qb.ph      %[temp4],   %[temp2],    %[temp0]   \n\t"
973     "usw              %[temp5],   2*" XSTR(BPS) "(%[dst]) \n\t"
974     "usw              %[temp4],   3*" XSTR(BPS) "(%[dst]) \n\t"
975     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
976       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
977       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
978       [temp9]"=&r"(temp9)
979     : [top]"r"(top), [dst]"r"(dst)
980     : "memory"
981   );
982 }
983 
// 4x4 prediction that reads only the four left samples top[-5..-2] and mixes
// rounded 2-tap averages (addqh_r.ph, i.e. (a + b + 1) >> 1) with rounded
// 3-tap values ((a + 2*b + c + 2) >> 2). Row 3 is a single sample (the low
// byte of the loaded word) replicated four times; rows 0..2 are packed from
// the filtered lanes, with the tail of rows 1 and 2 filled from that same
// replicated sample. Rows are stored BPS bytes apart, in the order 0, 3, 2, 1.
// NOTE(review): the name suggests the "horizontal-up" mode; on a
// little-endian load the replicated sample is top[-5] -- confirm against the
// C reference.
HU4(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT top)984 static void HU4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
985   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
986   __asm__ volatile (
987     "ulw             %[temp0],   -5(%[top])              \n\t"
    // Zero-extend the four bytes into 16-bit lanes and build the one-sample
    // shifted pair used as neighbour/centre.
988     "preceu.ph.qbl   %[temp1],   %[temp0]                \n\t"
989     "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
990     "packrl.ph       %[temp3],   %[temp1],    %[temp2]   \n\t"
    // temp7 = low byte of the loaded word in all four byte lanes (row 3).
991     "replv.qb        %[temp7],   %[temp2]                \n\t"
    // 2-tap rounded averages.
992     "addqh_r.ph      %[temp4],   %[temp1],    %[temp3]   \n\t"
993     "addqh_r.ph      %[temp5],   %[temp3],    %[temp2]   \n\t"
    // 3-tap sums, then rounded >> 2.
994     "shll.ph         %[temp6],   %[temp3],    1          \n\t"
995     "addu.ph         %[temp3],   %[temp2],    %[temp3]   \n\t"
996     "addu.ph         %[temp6],   %[temp1],    %[temp6]   \n\t"
997     "shll.ph         %[temp0],   %[temp2],    1          \n\t"
998     "addu.ph         %[temp6],   %[temp6],    %[temp2]   \n\t"
999     "addu.ph         %[temp0],   %[temp3],    %[temp0]   \n\t"
1000     "shra_r.ph       %[temp6],   %[temp6],    2          \n\t"
1001     "shra_r.ph       %[temp0],   %[temp0],    2          \n\t"
    // Pack the lanes into row words; row 1 is assembled last from halves of
    // rows 2 and 0.
1002     "packrl.ph       %[temp3],   %[temp6],    %[temp5]   \n\t"
1003     "precrq.ph.w     %[temp2],   %[temp6],    %[temp4]   \n\t"
1004     "append          %[temp0],   %[temp5],    16         \n\t"
1005     "precr.qb.ph     %[temp3],   %[temp3],    %[temp2]   \n\t"
1006     "usw             %[temp3],   0*" XSTR(BPS) "(%[dst]) \n\t"
1007     "precr.qb.ph     %[temp1],   %[temp7],    %[temp0]   \n\t"
1008     "usw             %[temp7],   3*" XSTR(BPS) "(%[dst]) \n\t"
1009     "packrl.ph       %[temp2],   %[temp1],    %[temp3]   \n\t"
1010     "usw             %[temp1],   2*" XSTR(BPS) "(%[dst]) \n\t"
1011     "usw             %[temp2],   1*" XSTR(BPS) "(%[dst]) \n\t"
1012     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
1013       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
1014       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
1015     : [top]"r"(top), [dst]"r"(dst)
1016     : "memory"
1017   );
1018 }
1019 
1020 //------------------------------------------------------------------------------
1021 // Chroma 8x8 prediction (paragraph 12.2)
1022 
// Writes the four 8x8 chroma predictions (DC, vertical, horizontal and
// TrueMotion) for the U block, then for the V block.
//   dst:  output buffer; each mode lands at its C8xx8 offset, and the V
//         predictions sit 8 bytes after the matching U ones.
//   left: left samples, or NULL when unavailable; the V samples start
//         16 bytes after the U ones.
//   top:  top samples, or NULL when unavailable; the V samples start
//         8 bytes after the U ones.
static void IntraChromaPreds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
                                       const uint8_t* WEBP_RESTRICT left,
                                       const uint8_t* WEBP_RESTRICT top) {
  // V-plane views of the three buffers; a NULL input stays NULL.
  uint8_t* const v_dst = dst + 8;
  const uint8_t* const v_top = (top != NULL) ? top + 8 : NULL;
  const uint8_t* const v_left = (left != NULL) ? left + 16 : NULL;

  // U block
  DCMode8(dst + C8DC8, left, top);
  VerticalPred8(dst + C8VE8, top);
  HorizontalPred8(dst + C8HE8, left);
  TrueMotion8(dst + C8TM8, left, top);

  // V block
  DCMode8(v_dst + C8DC8, v_left, v_top);
  VerticalPred8(v_dst + C8VE8, v_top);
  HorizontalPred8(v_dst + C8HE8, v_left);
  TrueMotion8(v_dst + C8TM8, v_left, v_top);
}
1040 
1041 //------------------------------------------------------------------------------
1042 // luma 16x16 prediction (paragraph 12.3)
1043 
// Writes the four 16x16 luma predictions into 'dst', each at its own
// I16xx16 offset: DC, vertical, horizontal and TrueMotion, in that order.
// 'left' and 'top' are forwarded unchanged to the per-mode helpers.
static void Intra16Preds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
                                   const uint8_t* WEBP_RESTRICT left,
                                   const uint8_t* WEBP_RESTRICT top) {
  uint8_t* const dc_dst = dst + I16DC16;
  uint8_t* const ve_dst = dst + I16VE16;
  uint8_t* const he_dst = dst + I16HE16;
  uint8_t* const tm_dst = dst + I16TM16;

  DCMode16(dc_dst, left, top);
  VerticalPred16(ve_dst, top);
  HorizontalPred16(he_dst, left);
  TrueMotion16(tm_dst, left, top);
}
1052 
1053 // Left samples are top[-5 .. -2], top_left is top[-1], top are
1054 // located at top[0..3], and top right is top[4..7]
Intra4Preds_MIPSdspR2(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT top)1055 static void Intra4Preds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
1056                                   const uint8_t* WEBP_RESTRICT top) {
1057   DC4(I4DC4 + dst, top);
1058   TM4(I4TM4 + dst, top);
1059   VE4(I4VE4 + dst, top);
1060   HE4(I4HE4 + dst, top);
1061   RD4(I4RD4 + dst, top);
1062   VR4(I4VR4 + dst, top);
1063   LD4(I4LD4 + dst, top);
1064   VL4(I4VL4 + dst, top);
1065   HD4(I4HD4 + dst, top);
1066   HU4(I4HU4 + dst, top);
1067 }
1068 
1069 //------------------------------------------------------------------------------
1070 // Metric
1071 
1072 #if !defined(WORK_AROUND_GCC)
1073 
// Accumulates into $ac0 the squared differences of four adjacent bytes.
// OFS - byte offset of the 32-bit word to load (with 'lw') from both a and b.
// The right and left byte pairs of each word are widened to halfwords
// (preceu.ph.qbr / preceu.ph.qbl), subtracted lane by lane, and the dot
// product of each difference vector with itself is added to $ac0.
// Uses temp0..temp3 as scratch; $ac0 must have been cleared by the caller.
#define GET_SSE_INNER(OFS)                                                     \
  "lw               %[temp0],    " #OFS "(%[a])                  \n\t"         \
  "lw               %[temp1],    " #OFS "(%[b])                  \n\t"         \
  "preceu.ph.qbr    %[temp2],    %[temp0]                      \n\t"           \
  "preceu.ph.qbl    %[temp0],    %[temp0]                      \n\t"           \
  "preceu.ph.qbr    %[temp3],    %[temp1]                      \n\t"           \
  "preceu.ph.qbl    %[temp1],    %[temp1]                      \n\t"           \
  "subq.ph          %[temp2],    %[temp2],    %[temp3]         \n\t"           \
  "subq.ph          %[temp0],    %[temp0],    %[temp1]         \n\t"           \
  "dpa.w.ph         $ac0,        %[temp2],    %[temp2]         \n\t"           \
  "dpa.w.ph         $ac0,        %[temp0],    %[temp0]         \n\t"

// Four GET_SSE_INNER steps in a row (16 bytes in total).
// OFS0..OFS3 - byte offsets of the four words to process.
// Going through this wrapper also macro-expands the arguments before
// GET_SSE_INNER stringifies them, so an offset written with a macro constant
// reaches the assembler with the constant already replaced by its value.
#define GET_SSE(OFS0, OFS1, OFS2, OFS3)   \
  GET_SSE_INNER(OFS0)                     \
  GET_SSE_INNER(OFS1)                     \
  GET_SSE_INNER(OFS2)                     \
  GET_SSE_INNER(OFS3)
1091 
// Returns the sum of squared differences between the 16x16 byte blocks at 'a'
// and 'b'. Rows are BPS bytes apart and each row is read as four 32-bit words
// (with 'lw'). The sum is accumulated in $ac0 and read back from its low word.
static int SSE16x16_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
                              const uint8_t* WEBP_RESTRICT b) {
  int sse;             // final accumulator value
  int t0, t1, t2, t3;  // scratch registers for GET_SSE
  __asm__ volatile (
    // Clear the accumulator ($ac0 = 0 * 0).
    "mult   $zero,    $zero                            \n\t"
    // One GET_SSE per row: 16 rows of 16 bytes.
    GET_SSE( 0 * BPS, 4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS)
    GET_SSE( 1 * BPS, 4 +  1 * BPS, 8 +  1 * BPS, 12 +  1 * BPS)
    GET_SSE( 2 * BPS, 4 +  2 * BPS, 8 +  2 * BPS, 12 +  2 * BPS)
    GET_SSE( 3 * BPS, 4 +  3 * BPS, 8 +  3 * BPS, 12 +  3 * BPS)
    GET_SSE( 4 * BPS, 4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS)
    GET_SSE( 5 * BPS, 4 +  5 * BPS, 8 +  5 * BPS, 12 +  5 * BPS)
    GET_SSE( 6 * BPS, 4 +  6 * BPS, 8 +  6 * BPS, 12 +  6 * BPS)
    GET_SSE( 7 * BPS, 4 +  7 * BPS, 8 +  7 * BPS, 12 +  7 * BPS)
    GET_SSE( 8 * BPS, 4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS)
    GET_SSE( 9 * BPS, 4 +  9 * BPS, 8 +  9 * BPS, 12 +  9 * BPS)
    GET_SSE(10 * BPS, 4 + 10 * BPS, 8 + 10 * BPS, 12 + 10 * BPS)
    GET_SSE(11 * BPS, 4 + 11 * BPS, 8 + 11 * BPS, 12 + 11 * BPS)
    GET_SSE(12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS)
    GET_SSE(13 * BPS, 4 + 13 * BPS, 8 + 13 * BPS, 12 + 13 * BPS)
    GET_SSE(14 * BPS, 4 + 14 * BPS, 8 + 14 * BPS, 12 + 14 * BPS)
    GET_SSE(15 * BPS, 4 + 15 * BPS, 8 + 15 * BPS, 12 + 15 * BPS)
    // Fetch the low 32 bits of the accumulator.
    "mflo   %[count]                                   \n\t"
    : [temp0]"=&r"(t0), [temp1]"=&r"(t1), [temp2]"=&r"(t2),
      [temp3]"=&r"(t3), [count]"=&r"(sse)
    : [a]"r"(a), [b]"r"(b)
    : "memory", "hi", "lo"
  );
  return sse;
}
1122 
// Returns the sum of squared differences between the 16-wide, 8-high byte
// blocks at 'a' and 'b'. Rows are BPS bytes apart and each row is read as four
// 32-bit words (with 'lw'). The sum is accumulated in $ac0.
static int SSE16x8_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
                             const uint8_t* WEBP_RESTRICT b) {
  int sse;             // final accumulator value
  int t0, t1, t2, t3;  // scratch registers for GET_SSE
  __asm__ volatile (
    // Clear the accumulator ($ac0 = 0 * 0).
    "mult   $zero,    $zero                            \n\t"
    // One GET_SSE per row: 8 rows of 16 bytes.
    GET_SSE( 0 * BPS, 4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS)
    GET_SSE( 1 * BPS, 4 +  1 * BPS, 8 +  1 * BPS, 12 +  1 * BPS)
    GET_SSE( 2 * BPS, 4 +  2 * BPS, 8 +  2 * BPS, 12 +  2 * BPS)
    GET_SSE( 3 * BPS, 4 +  3 * BPS, 8 +  3 * BPS, 12 +  3 * BPS)
    GET_SSE( 4 * BPS, 4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS)
    GET_SSE( 5 * BPS, 4 +  5 * BPS, 8 +  5 * BPS, 12 +  5 * BPS)
    GET_SSE( 6 * BPS, 4 +  6 * BPS, 8 +  6 * BPS, 12 +  6 * BPS)
    GET_SSE( 7 * BPS, 4 +  7 * BPS, 8 +  7 * BPS, 12 +  7 * BPS)
    // Fetch the low 32 bits of the accumulator.
    "mflo   %[count]                                   \n\t"
    : [temp0]"=&r"(t0), [temp1]"=&r"(t1), [temp2]"=&r"(t2),
      [temp3]"=&r"(t3), [count]"=&r"(sse)
    : [a]"r"(a), [b]"r"(b)
    : "memory", "hi", "lo"
  );
  return sse;
}
1145 
// Returns the sum of squared differences between the 8x8 byte blocks at 'a'
// and 'b'. Rows are BPS bytes apart and each row is read as two 32-bit words
// (with 'lw'), so every GET_SSE below covers two consecutive rows.
static int SSE8x8_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
                            const uint8_t* WEBP_RESTRICT b) {
  int sse;             // final accumulator value
  int t0, t1, t2, t3;  // scratch registers for GET_SSE
  __asm__ volatile (
    // Clear the accumulator ($ac0 = 0 * 0).
    "mult   $zero,    $zero                            \n\t"
    GET_SSE(0 * BPS, 4 + 0 * BPS, 1 * BPS, 4 + 1 * BPS)   // rows 0 and 1
    GET_SSE(2 * BPS, 4 + 2 * BPS, 3 * BPS, 4 + 3 * BPS)   // rows 2 and 3
    GET_SSE(4 * BPS, 4 + 4 * BPS, 5 * BPS, 4 + 5 * BPS)   // rows 4 and 5
    GET_SSE(6 * BPS, 4 + 6 * BPS, 7 * BPS, 4 + 7 * BPS)   // rows 6 and 7
    // Fetch the low 32 bits of the accumulator.
    "mflo   %[count]                                   \n\t"
    : [temp0]"=&r"(t0), [temp1]"=&r"(t1), [temp2]"=&r"(t2),
      [temp3]"=&r"(t3), [count]"=&r"(sse)
    : [a]"r"(a), [b]"r"(b)
    : "memory", "hi", "lo"
  );
  return sse;
}
1164 
// Returns the sum of squared differences between the 4x4 byte blocks at 'a'
// and 'b'. Rows are BPS bytes apart and each row is a single 32-bit word
// (read with 'lw'), so one GET_SSE covers the whole block.
static int SSE4x4_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
                            const uint8_t* WEBP_RESTRICT b) {
  int sse;             // final accumulator value
  int t0, t1, t2, t3;  // scratch registers for GET_SSE
  __asm__ volatile (
    // Clear the accumulator ($ac0 = 0 * 0).
    "mult   $zero,    $zero                            \n\t"
    GET_SSE(0 * BPS, 1 * BPS, 2 * BPS, 3 * BPS)   // rows 0..3
    // Fetch the low 32 bits of the accumulator.
    "mflo   %[count]                                   \n\t"
    : [temp0]"=&r"(t0), [temp1]"=&r"(t1), [temp2]"=&r"(t2),
      [temp3]"=&r"(t3), [count]"=&r"(sse)
    : [a]"r"(a), [b]"r"(b)
    : "memory", "hi", "lo"
  );
  return sse;
}
1180 
1181 #undef GET_SSE
1182 #undef GET_SSE_INNER
1183 
1184 #endif  // !WORK_AROUND_GCC
1185 
1186 #undef FILL_8_OR_16
1187 #undef FILL_PART
1188 #undef OUTPUT_EARLY_CLOBBER_REGS_17
1189 #undef MUL_HALF
1190 #undef ABS_X8
1191 #undef ADD_SUB_HALVES_X4
1192 
1193 //------------------------------------------------------------------------------
1194 // Quantization
1195 //
1196 
// Macro for one pass through the for loop in QuantizeBlock, processing two
// adjacent input coefficients (in[j] and in[j + 1]) at a time.
// The QUANTDIV macro is inlined.
// J  - byte offset of the pair in the 16-bit arrays (in, sharpen, iq, q),
//      i.e. j * 2
// K  - byte offset of the pair in the 32-bit arrays (zthresh, bias),
//      i.e. j * 4
// N  - byte offset in 'out' receiving the level of in[j]
// N1 - byte offset in 'out' receiving the level of in[j + 1]. The two output
//      positions are unrelated in general, so N1 is not necessarily N + 2
//      (see the invocations in QuantizeBlock).
//
// Both values are compared against their zthresh at once and one of four
// paths is taken:
//   fall-through : only the value in the low halfword passes; the other one
//                  is zeroed in 'in' and 'out'
//   label 2      : only the value in the high halfword passes; the other one
//                  is zeroed in 'in' and 'out'
//   label 1      : both values pass and are quantized as a pair
//   label 0      : neither passes; both are zeroed
// A passing value is quantized as (coeff * iq + bias) >> 17, clamped to
// max_level, given back its sign, OR-ed into 'ret' and stored to 'out';
// level * q is stored back to 'in'.
// In the paired path the clamp value must come from max_level1, which holds
// max_level in both halfwords: pick.ph selects per halfword, and the upper
// halfword of max_level is zero.
// NOTE(review): the halfword/lane handling looks like it assumes a
// little-endian memory layout -- confirm before enabling on big-endian.
#define QUANTIZE_ONE(J, K, N, N1)                                         \
  "ulw         %[temp1],     " #J "(%[ppin])                 \n\t"        \
  "ulw         %[temp2],     " #J "(%[ppsharpen])            \n\t"        \
  "lhu         %[temp3],     " #K "(%[ppzthresh])            \n\t"        \
  "lhu         %[temp6],     " #K "+4(%[ppzthresh])          \n\t"        \
  "absq_s.ph   %[temp4],     %[temp1]                        \n\t"        \
  "ins         %[temp3],     %[temp6],         16,       16  \n\t"        \
  "addu.ph     %[coeff],     %[temp4],         %[temp2]      \n\t"        \
  "shra.ph     %[sign],      %[temp1],         15            \n\t"        \
  "li          %[level],     0x10001                         \n\t"        \
  "cmp.lt.ph   %[temp3],     %[coeff]                        \n\t"        \
  "lhu         %[temp1],     " #J "(%[ppiq])                 \n\t"        \
  "pick.ph     %[temp5],     %[level],         $0            \n\t"        \
  "lw          %[temp2],     " #K "(%[ppbias])               \n\t"        \
  "beqz        %[temp5],     0f                              \n\t"        \
  "lhu         %[temp3],     " #J "(%[ppq])                  \n\t"        \
  "beq         %[temp5],     %[level],         1f            \n\t"        \
  "andi        %[temp5],     %[temp5],         0x1           \n\t"        \
  "andi        %[temp4],     %[coeff],         0xffff        \n\t"        \
  "beqz        %[temp5],     2f                              \n\t"        \
  "mul         %[level],     %[temp4],         %[temp1]      \n\t"        \
  "sh          $0,           " #J "+2(%[ppin])               \n\t"        \
  "sh          $0,           " #N1 "(%[pout])                \n\t"        \
  "addu        %[level],     %[level],         %[temp2]      \n\t"        \
  "sra         %[level],     %[level],         17            \n\t"        \
  "slt         %[temp4],     %[max_level],     %[level]      \n\t"        \
  "movn        %[level],     %[max_level],     %[temp4]      \n\t"        \
  "andi        %[temp6],     %[sign],          0xffff        \n\t"        \
  "xor         %[level],     %[level],         %[temp6]      \n\t"        \
  "subu        %[level],     %[level],         %[temp6]      \n\t"        \
  "mul         %[temp5],     %[level],         %[temp3]      \n\t"        \
  "or          %[ret],       %[ret],           %[level]      \n\t"        \
  "sh          %[level],     " #N "(%[pout])                 \n\t"        \
  "sh          %[temp5],     " #J "(%[ppin])                 \n\t"        \
  "j           3f                                            \n\t"        \
"2:                                                          \n\t"        \
  "lhu         %[temp1],     " #J "+2(%[ppiq])               \n\t"        \
  "srl         %[temp5],     %[coeff],         16            \n\t"        \
  "mul         %[level],     %[temp5],         %[temp1]      \n\t"        \
  "lw          %[temp2],     " #K "+4(%[ppbias])             \n\t"        \
  "lhu         %[temp3],     " #J "+2(%[ppq])                \n\t"        \
  "addu        %[level],     %[level],         %[temp2]      \n\t"        \
  "sra         %[level],     %[level],         17            \n\t"        \
  "srl         %[temp6],     %[sign],          16            \n\t"        \
  "slt         %[temp4],     %[max_level],     %[level]      \n\t"        \
  "movn        %[level],     %[max_level],     %[temp4]      \n\t"        \
  "xor         %[level],     %[level],         %[temp6]      \n\t"        \
  "subu        %[level],     %[level],         %[temp6]      \n\t"        \
  "mul         %[temp5],     %[level],         %[temp3]      \n\t"        \
  "sh          $0,           " #J "(%[ppin])                 \n\t"        \
  "sh          $0,           " #N "(%[pout])                 \n\t"        \
  "or          %[ret],       %[ret],           %[level]      \n\t"        \
  "sh          %[temp5],     " #J "+2(%[ppin])               \n\t"        \
  "sh          %[level],     " #N1 "(%[pout])                \n\t"        \
  "j           3f                                            \n\t"        \
"1:                                                          \n\t"        \
  "lhu         %[temp1],     " #J "(%[ppiq])                 \n\t"        \
  "lw          %[temp2],     " #K "(%[ppbias])               \n\t"        \
  "ulw         %[temp3],     " #J "(%[ppq])                  \n\t"        \
  "andi        %[temp5],     %[coeff],         0xffff        \n\t"        \
  "srl         %[temp0],     %[coeff],         16            \n\t"        \
  "lhu         %[temp6],     " #J "+2(%[ppiq])               \n\t"        \
  "lw          %[coeff],     " #K "+4(%[ppbias])             \n\t"        \
  "mul         %[level],     %[temp5],         %[temp1]      \n\t"        \
  "mul         %[temp4],     %[temp0],         %[temp6]      \n\t"        \
  "addu        %[level],     %[level],         %[temp2]      \n\t"        \
  "addu        %[temp4],     %[temp4],         %[coeff]      \n\t"        \
  "precrq.ph.w %[level],     %[temp4],         %[level]      \n\t"        \
  "shra.ph     %[level],     %[level],         1             \n\t"        \
  "cmp.lt.ph   %[max_level1],%[level]                        \n\t"        \
  "pick.ph     %[level],     %[max_level1],    %[level]      \n\t"        \
  "xor         %[level],     %[level],         %[sign]       \n\t"        \
  "subu.ph     %[level],     %[level],         %[sign]       \n\t"        \
  "mul.ph      %[temp3],     %[level],         %[temp3]      \n\t"        \
  "or          %[ret],       %[ret],           %[level]      \n\t"        \
  "sh          %[level],     " #N "(%[pout])                 \n\t"        \
  "srl         %[level],     %[level],         16            \n\t"        \
  "sh          %[level],     " #N1 "(%[pout])                \n\t"        \
  "usw         %[temp3],     " #J "(%[ppin])                 \n\t"        \
  "j           3f                                            \n\t"        \
"0:                                                          \n\t"        \
  "sh          $0,           " #N "(%[pout])                 \n\t"        \
  "sh          $0,           " #N1 "(%[pout])                \n\t"        \
  "usw         $0,           " #J "(%[ppin])                 \n\t"        \
"3:                                                          \n\t"
1288 
// Quantizes the 16 coefficients of 'in' with the matrix 'mtx'.
// The quantized levels are written to 'out' (at the byte offsets passed as N
// and N1 to QUANTIZE_ONE) and 'in' is overwritten with level * q, or zero for
// coefficients that do not pass the zthresh test.
// Returns 1 if at least one level is non-zero, 0 otherwise.
static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
                                   const VP8Matrix* WEBP_RESTRICT const mtx) {
  // Scratch registers for QUANTIZE_ONE.
  int t0, t1, t2, t3, t4, t5, t6;
  int sign_bits, abs_coeff, lvl;
  // Clamp value for one level, and the same value in both 16-bit halves.
  const int level_cap = MAX_LEVEL;
  const int level_cap_x2 = (level_cap << 16) | level_cap;
  // OR of all the levels produced so far.
  int any_level = 0;

  // Base addresses handed to the asm block.
  int16_t* const coeffs          = in;
  int16_t* const levels          = out;
  const uint16_t* const sharpen  = mtx->sharpen_;
  const uint32_t* const zthresh  = mtx->zthresh_;
  const uint16_t* const q        = mtx->q_;
  const uint16_t* const iq       = mtx->iq_;
  const uint32_t* const bias     = mtx->bias_;

  __asm__ volatile (
    // Arguments: J (offset in 16-bit arrays), K (offset in 32-bit arrays),
    // N and N1 (offsets in 'out' for the first and second value).
    QUANTIZE_ONE( 0,  0,  0,  2)
    QUANTIZE_ONE( 4,  8, 10, 12)
    QUANTIZE_ONE( 8, 16,  4,  8)
    QUANTIZE_ONE(12, 24, 14, 24)
    QUANTIZE_ONE(16, 32,  6, 16)
    QUANTIZE_ONE(20, 40, 22, 26)
    QUANTIZE_ONE(24, 48, 18, 20)
    QUANTIZE_ONE(28, 56, 28, 30)

    : [temp0]"=&r"(t0), [temp1]"=&r"(t1),
      [temp2]"=&r"(t2), [temp3]"=&r"(t3),
      [temp4]"=&r"(t4), [temp5]"=&r"(t5),
      [sign]"=&r"(sign_bits), [coeff]"=&r"(abs_coeff),
      [level]"=&r"(lvl), [temp6]"=&r"(t6), [ret]"+&r"(any_level)
    : [ppin]"r"(coeffs), [pout]"r"(levels), [max_level1]"r"(level_cap_x2),
      [ppiq]"r"(iq), [max_level]"r"(level_cap),
      [ppbias]"r"(bias), [ppzthresh]"r"(zthresh),
      [ppsharpen]"r"(sharpen), [ppq]"r"(q)
    : "memory", "hi", "lo"
  );

  return (any_level != 0);
}
1329 
// Quantizes two consecutive 16-coefficient blocks with the same matrix.
// Bit 0 of the result is the non-zero flag of the first block, bit 1 the
// non-zero flag of the second one.
static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
                                     const VP8Matrix* WEBP_RESTRICT const mtx) {
  const int nz_first = QuantizeBlock_MIPSdspR2(in, out, mtx);
  const int nz_second = QuantizeBlock_MIPSdspR2(in + 16, out + 16, mtx);
  return nz_first | (nz_second << 1);
}
1337 
1338 #undef QUANTIZE_ONE
1339 
// Macro for one horizontal pass in FTransformWHT.
// I0, I1, I2, I3 - byte offsets of the four 16-bit values to load from 'in'
// O0, O1         - names of the registers receiving the four results, two
//                  halfwords per register
// With v0..v3 the values loaded from I0..I3, the registers end up holding
//   O0: low = (v0 + v2) + (v1 + v3),  high = (v0 - v2) + (v1 - v3)
//   O1: low = (v0 - v2) - (v1 - v3),  high = (v0 + v2) - (v1 + v3)
// temp8 and temp9 are used as scratch.
#define HORIZONTAL_PASS_WHT(I0, I1, I2, I3, O0, O1)                            \
  "lh              %[" #O0 "],  " #I0 "(%[in])            \n\t"                \
  "lh              %[" #O1 "],  " #I1 "(%[in])            \n\t"                \
  "lh              %[temp8],     " #I2 "(%[in])              \n\t"             \
  "lh              %[temp9],     " #I3 "(%[in])              \n\t"             \
  "ins             %[" #O1 "],  %[" #O0 "],  16,  16  \n\t"                    \
  "ins             %[temp9],     %[temp8],     16,  16      \n\t"              \
  "subq.ph         %[temp8],     %[" #O1 "],  %[temp9]   \n\t"                 \
  "addq.ph         %[temp9],     %[" #O1 "],  %[temp9]   \n\t"                 \
  "precrq.ph.w     %[" #O0 "],  %[temp8],     %[temp9]   \n\t"                 \
  "append          %[temp8],     %[temp9],     16           \n\t"              \
  "subq.ph         %[" #O1 "],  %[" #O0 "],  %[temp8] \n\t"                    \
  "addq.ph         %[" #O0 "],  %[" #O0 "],  %[temp8] \n\t"                    \
  "rotr            %[" #O1 "],  %[" #O1 "],  16       \n\t"
1358 
// Macro for one vertical pass in FTransformWHT. Every register holds two
// halfwords, so one invocation handles two columns at once.
// O0, O1, O2, O3 - byte offsets of the four words to store to 'out'
// R0, R1, R2, R3 - names of the registers holding the inputs; they are
//                  overwritten with the results
// Per halfword, with a0 = R0 + R2, a1 = R1 + R3, a2 = R1 - R3, a3 = R0 - R2:
//   O0 <- (a0 + a1) >> 1      O1 <- (a3 + a2) >> 1
//   O2 <- (a3 - a2) >> 1      O3 <- (a0 - a1) >> 1
// temp8 and temp9 are used as scratch.
#define VERTICAL_PASS_WHT(O0, O1, O2, O3, R0, R1, R2, R3)                      \
  "addq.ph         %[temp8],     %[" #R0 "],  %[" #R2 "]    \n\t"              \
  "addq.ph         %[temp9],     %[" #R1 "],  %[" #R3 "]    \n\t"              \
  "subq.ph         %[" #R1 "],  %[" #R1 "],  %[" #R3 "]  \n\t"                 \
  "subq.ph         %[" #R3 "],  %[" #R0 "],  %[" #R2 "]  \n\t"                 \
  "addqh.ph        %[" #R0 "],  %[temp8],     %[temp9]         \n\t"           \
  "subqh.ph        %[" #R2 "],  %[" #R3 "],  %[" #R1 "]  \n\t"                 \
  "addqh.ph        %[" #R1 "],  %[" #R1 "],  %[" #R3 "]  \n\t"                 \
  "subqh.ph        %[" #R3 "],  %[temp8],     %[temp9]         \n\t"           \
  "usw             %[" #R0 "],  " #O0 "(%[out])                 \n\t"          \
  "usw             %[" #R1 "],  " #O1 "(%[out])                 \n\t"          \
  "usw             %[" #R2 "],  " #O2 "(%[out])                 \n\t"          \
  "usw             %[" #R3 "],  " #O3 "(%[out])                 \n\t"
1376 
// Two-pass transform of 16 values taken from 'in' at a stride of 16 elements
// (byte offsets 0, 32, ..., 480). The 16 results are written contiguously to
// 'out'. The intermediate values never go through memory: they stay in eight
// registers, two halfwords each.
static void FTransformWHT_MIPSdspR2(const int16_t* WEBP_RESTRICT in,
                                    int16_t* WEBP_RESTRICT out) {
  // Intermediate values, two per register.
  int r0, r1, r2, r3, r4, r5, r6, r7;
  // Scratch registers shared by both passes.
  int s0, s1;

  __asm__ volatile (
    // Horizontal pass: four input values per invocation.
    HORIZONTAL_PASS_WHT(  0,  32,  64,  96, temp0, temp1)
    HORIZONTAL_PASS_WHT(128, 160, 192, 224, temp2, temp3)
    HORIZONTAL_PASS_WHT(256, 288, 320, 352, temp4, temp5)
    HORIZONTAL_PASS_WHT(384, 416, 448, 480, temp6, temp7)
    // Vertical pass: two columns per invocation, results stored to 'out'.
    VERTICAL_PASS_WHT(0,  8, 16, 24, temp0, temp2, temp4, temp6)
    VERTICAL_PASS_WHT(4, 12, 20, 28, temp1, temp3, temp5, temp7)
    : [temp0]"=&r"(r0), [temp1]"=&r"(r1), [temp2]"=&r"(r2),
      [temp3]"=&r"(r3), [temp4]"=&r"(r4), [temp5]"=&r"(r5),
      [temp6]"=&r"(r6), [temp7]"=&r"(r7), [temp8]"=&r"(s0),
      [temp9]"=&r"(s1)
    : [in]"r"(in), [out]"r"(out)
    : "memory"
  );
}
1397 
1398 #undef VERTICAL_PASS_WHT
1399 #undef HORIZONTAL_PASS_WHT
1400 
// Macro converting coefficients to histogram bins, 8 coefficients at a time.
// OFS0, OFS1, OFS2, OFS3 - byte offsets of the four words (two coefficients
//                          each) to load from the 'out' buffer
// For every 16-bit coefficient v:
//   bin = abs(v) >> 3, capped at 31 by the saturating shift left by 10
//   followed by the logical shift right by 10 (0x7fff >> 10 == 31);
//   bin << 2 is then the byte offset of a 32-bit counter, which is loaded
//   from 'dist', incremented and stored back.
// temp0..temp3 hold the packed bins, temp4..temp7 the counter addresses of
// the low halfwords, temp8 the counter being updated.
#define CONVERT_COEFFS_TO_BIN(OFS0, OFS1, OFS2, OFS3)                          \
  "ulw        %[temp0],  " #OFS0 "(%[out])                \n\t"                \
  "ulw        %[temp1],  " #OFS1 "(%[out])                \n\t"                \
  "ulw        %[temp2],  " #OFS2 "(%[out])                \n\t"                \
  "ulw        %[temp3],  " #OFS3 "(%[out])                \n\t"                \
  "absq_s.ph  %[temp0],  %[temp0]                      \n\t"                   \
  "absq_s.ph  %[temp1],  %[temp1]                      \n\t"                   \
  "absq_s.ph  %[temp2],  %[temp2]                      \n\t"                   \
  "absq_s.ph  %[temp3],  %[temp3]                      \n\t"                   \
  "shra.ph    %[temp0],  %[temp0],    3                \n\t"                   \
  "shra.ph    %[temp1],  %[temp1],    3                \n\t"                   \
  "shra.ph    %[temp2],  %[temp2],    3                \n\t"                   \
  "shra.ph    %[temp3],  %[temp3],    3                \n\t"                   \
  "shll_s.ph  %[temp0],  %[temp0],    10               \n\t"                   \
  "shll_s.ph  %[temp1],  %[temp1],    10               \n\t"                   \
  "shll_s.ph  %[temp2],  %[temp2],    10               \n\t"                   \
  "shll_s.ph  %[temp3],  %[temp3],    10               \n\t"                   \
  "shrl.ph    %[temp0],  %[temp0],    10               \n\t"                   \
  "shrl.ph    %[temp1],  %[temp1],    10               \n\t"                   \
  "shrl.ph    %[temp2],  %[temp2],    10               \n\t"                   \
  "shrl.ph    %[temp3],  %[temp3],    10               \n\t"                   \
  "shll.ph    %[temp0],  %[temp0],    2                \n\t"                   \
  "shll.ph    %[temp1],  %[temp1],    2                \n\t"                   \
  "shll.ph    %[temp2],  %[temp2],    2                \n\t"                   \
  "shll.ph    %[temp3],  %[temp3],    2                \n\t"                   \
  "ext        %[temp4],  %[temp0],    0,       16      \n\t"                   \
  "ext        %[temp0],  %[temp0],    16,      16      \n\t"                   \
  "addu       %[temp4],  %[temp4],    %[dist]          \n\t"                   \
  "addu       %[temp0],  %[temp0],    %[dist]          \n\t"                   \
  "ext        %[temp5],  %[temp1],    0,       16      \n\t"                   \
  "lw         %[temp8],  0(%[temp4])                   \n\t"                   \
  "ext        %[temp1],  %[temp1],    16,      16      \n\t"                   \
  "addu       %[temp5],  %[temp5],    %[dist]          \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp4])                   \n\t"                   \
  "lw         %[temp8],  0(%[temp0])                   \n\t"                   \
  "addu       %[temp1],  %[temp1],    %[dist]          \n\t"                   \
  "ext        %[temp6],  %[temp2],    0,       16      \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp0])                   \n\t"                   \
  "lw         %[temp8],  0(%[temp5])                   \n\t"                   \
  "ext        %[temp2],  %[temp2],    16,      16      \n\t"                   \
  "addu       %[temp6],  %[temp6],    %[dist]          \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp5])                   \n\t"                   \
  "lw         %[temp8],  0(%[temp1])                   \n\t"                   \
  "addu       %[temp2],  %[temp2],    %[dist]          \n\t"                   \
  "ext        %[temp7],  %[temp3],    0,       16      \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp1])                   \n\t"                   \
  "lw         %[temp8],  0(%[temp6])                   \n\t"                   \
  "ext        %[temp3],  %[temp3],    16,      16      \n\t"                   \
  "addu       %[temp7],  %[temp7],    %[dist]          \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp6])                   \n\t"                   \
  "lw         %[temp8],  0(%[temp2])                   \n\t"                   \
  "addu       %[temp3],  %[temp3],    %[dist]          \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp2])                   \n\t"                   \
  "lw         %[temp8],  0(%[temp7])                   \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp7])                   \n\t"                   \
  "lw         %[temp8],  0(%[temp3])                   \n\t"                   \
  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
  "sw         %[temp8],  0(%[temp3])                   \n\t"
1469 
// Accumulates the distribution of transformed coefficients over the blocks
// start_block .. end_block - 1 and hands it to VP8SetHistogramData().
// For each block j, the forward transform of ref/pred at offset VP8DspScan[j]
// is computed into a 16-coefficient buffer, and every coefficient increments
// the counter of its bin (see CONVERT_COEFFS_TO_BIN).
// ref, pred   - reference and prediction pixels
// start_block - index of the first block to process
// end_block   - index one past the last block to process
// histo       - receives the result through VP8SetHistogramData()
static void CollectHistogram_MIPSdspR2(const uint8_t* ref, const uint8_t* pred,
                                       int start_block, int end_block,
                                       VP8Histogram* const histo) {
  int j;
  // NOTE(review): the asm caps bin indices at 31 with hard-coded shifts; this
  // presumably matches MAX_COEFF_THRESH -- confirm if that constant changes.
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;

    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);

    // Convert coefficients to bin, 8 coefficients per macro invocation.
    // The "memory" clobber covers the accesses to 'out' and 'distribution'
    // made through the plain register operands.
    __asm__ volatile (
      CONVERT_COEFFS_TO_BIN( 0,  4,  8, 12)
      CONVERT_COEFFS_TO_BIN(16, 20, 24, 28)
      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
        [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
        [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
      : [dist]"r"(distribution), [out]"r"(out)
      : "memory"
    );
  }
  VP8SetHistogramData(distribution, histo);
}
1495 
1496 #undef CONVERT_COEFFS_TO_BIN
1497 
1498 //------------------------------------------------------------------------------
1499 // Entry point
1500 
1501 extern void VP8EncDspInitMIPSdspR2(void);
1502 
VP8EncDspInitMIPSdspR2(void)1503 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) {
1504   VP8FTransform = FTransform_MIPSdspR2;
1505   VP8FTransformWHT = FTransformWHT_MIPSdspR2;
1506   VP8ITransform = ITransform_MIPSdspR2;
1507 
1508   VP8TDisto4x4 = Disto4x4_MIPSdspR2;
1509   VP8TDisto16x16 = Disto16x16_MIPSdspR2;
1510 
1511   VP8EncPredLuma16 = Intra16Preds_MIPSdspR2;
1512   VP8EncPredChroma8 = IntraChromaPreds_MIPSdspR2;
1513   VP8EncPredLuma4 = Intra4Preds_MIPSdspR2;
1514 
1515 #if !defined(WORK_AROUND_GCC)
1516   VP8SSE16x16 = SSE16x16_MIPSdspR2;
1517   VP8SSE8x8 = SSE8x8_MIPSdspR2;
1518   VP8SSE16x8 = SSE16x8_MIPSdspR2;
1519   VP8SSE4x4 = SSE4x4_MIPSdspR2;
1520 #endif
1521 
1522   VP8EncQuantizeBlock = QuantizeBlock_MIPSdspR2;
1523   VP8EncQuantize2Blocks = Quantize2Blocks_MIPSdspR2;
1524 
1525   VP8CollectHistogram = CollectHistogram_MIPSdspR2;
1526 }
1527 
1528 #else  // !WEBP_USE_MIPS_DSP_R2
1529 
1530 WEBP_DSP_INIT_STUB(VP8EncDspInitMIPSdspR2)
1531 
1532 #endif  // WEBP_USE_MIPS_DSP_R2
1533