1 /*
2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vp8_rtcd.h"
12 #include "vpx_ports/mem.h"
13 #include "vpx_ports/asmdefs_mmi.h"
14
/* Transpose a 4x4 block of 16-bit values: ftmp1..ftmp4 each hold one row
 * of four halfwords on entry and one transposed row on exit.
 * Clobbers: ftmp0 (zeroed), ftmp5..ftmp10 (scratch) and tmp0 (holds the
 * pshufh control word 0x93 used to rotate the halfword lanes of the
 * second source before the rows are merged with `or`).
 * NOTE(review): lane ordering inferred from the punpck/pshufh pattern --
 * confirm against the Loongson MMI instruction manual. */
#define TRANSPOSE_4H \
  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
  MMI_LI(%[tmp0], 0x93) \
  "mtc1 %[tmp0], %[ftmp10] \n\t" \
  "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
  "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \
  "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
  "or %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \
  "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \
  "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \
  "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
  "or %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \
  "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
  "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \
  "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
  "or %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \
  "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \
  "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \
  "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
  "or %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
  "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \
  "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \
  "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \
  "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
39
vp8_short_idct4x4llm_mmi(int16_t * input,unsigned char * pred_ptr,int pred_stride,unsigned char * dst_ptr,int dst_stride)40 void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr,
41 int pred_stride, unsigned char *dst_ptr,
42 int dst_stride) {
43 double ftmp[12];
44 uint32_t tmp[0];
45 DECLARE_ALIGNED(8, const uint64_t, ff_ph_04) = { 0x0004000400040004ULL };
46 DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = { 0x4e7b4e7b4e7b4e7bULL };
47 DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = { 0x22a322a322a322a3ULL };
48
49 __asm__ volatile (
50 MMI_LI(%[tmp0], 0x02)
51 "mtc1 %[tmp0], %[ftmp11] \n\t"
52 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
53
54 "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
55 "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
56 "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t"
57 "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t"
58 "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t"
59 "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t"
60 "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t"
61 "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t"
62
63 // ip[0...3] + ip[8...11]
64 "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
65 // ip[0...3] - ip[8...11]
66 "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
67 // (ip[12...15] * sinpi8sqrt2) >> 16
68 "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
69 "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t"
70 // (ip[ 4... 7] * sinpi8sqrt2) >> 16
71 "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
72 "pmulhh %[ftmp8], %[ftmp9], %[ff_ph_22a3] \n\t"
73 // ip[ 4... 7] + ((ip[ 4... 7] * cospi8sqrt2minus1) >> 16)
74 "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t"
75 "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t"
76 // ip[12...15] + ((ip[12...15] * cospi8sqrt2minus1) >> 16)
77 "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
78 "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t"
79
80 "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
81 "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
82 "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
83 "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
84 "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t"
85 "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t"
86 "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t"
87 "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
88
89 TRANSPOSE_4H
90 // a
91 "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
92 // b
93 "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
94 // c
95 "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
96 "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
97 "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t"
98 "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
99 "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
100 // d
101 "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
102 "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
103 "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t"
104 "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t"
105 "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
106
107 MMI_LI(%[tmp0], 0x03)
108 "mtc1 %[tmp0], %[ftmp11] \n\t"
109 // a + d
110 "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t"
111 "paddh %[ftmp1], %[ftmp1], %[ff_ph_04] \n\t"
112 "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
113 // b + c
114 "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t"
115 "paddh %[ftmp2], %[ftmp2], %[ff_ph_04] \n\t"
116 "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
117 // b - c
118 "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t"
119 "paddh %[ftmp3], %[ftmp3], %[ff_ph_04] \n\t"
120 "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
121 // a - d
122 "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t"
123 "paddh %[ftmp4], %[ftmp4], %[ff_ph_04] \n\t"
124 "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
125
126 TRANSPOSE_4H
127 #if _MIPS_SIM == _ABIO32
128 "ulw %[tmp0], 0x00(%[pred_prt]) \n\t"
129 "mtc1 %[tmp0], %[ftmp5] \n\t"
130 #else
131 "gslwlc1 %[ftmp5], 0x03(%[pred_ptr]) \n\t"
132 "gslwrc1 %[ftmp5], 0x00(%[pred_ptr]) \n\t"
133 #endif
134 "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
135 "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
136 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
137 "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t"
138 "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t"
139 MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
140 MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
141
142 #if _MIPS_SIM == _ABIO32
143 "ulw %[tmp0], 0x00(%[pred_prt]) \n\t"
144 "mtc1 %[tmp0], %[ftmp6] \n\t"
145 #else
146 "gslwlc1 %[ftmp6], 0x03(%[pred_ptr]) \n\t"
147 "gslwrc1 %[ftmp6], 0x00(%[pred_ptr]) \n\t"
148 #endif
149 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
150 "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
151 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
152 "gsswlc1 %[ftmp2], 0x03(%[dst_ptr]) \n\t"
153 "gsswrc1 %[ftmp2], 0x00(%[dst_ptr]) \n\t"
154 MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
155 MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
156
157 #if _MIPS_SIM == _ABIO32
158 "ulw %[tmp0], 0x00(%[pred_prt]) \n\t"
159 "mtc1 %[tmp0], %[ftmp7] \n\t"
160 #else
161 "gslwlc1 %[ftmp7], 0x03(%[pred_ptr]) \n\t"
162 "gslwrc1 %[ftmp7], 0x00(%[pred_ptr]) \n\t"
163 #endif
164 "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
165 "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
166 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
167 "gsswlc1 %[ftmp3], 0x03(%[dst_ptr]) \n\t"
168 "gsswrc1 %[ftmp3], 0x00(%[dst_ptr]) \n\t"
169 MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
170 MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
171
172 #if _MIPS_SIM == _ABIO32
173 "ulw %[tmp0], 0x00(%[pred_prt]) \n\t"
174 "mtc1 %[tmp0], %[ftmp8] \n\t"
175 #else
176 "gslwlc1 %[ftmp8], 0x03(%[pred_ptr]) \n\t"
177 "gslwrc1 %[ftmp8], 0x00(%[pred_ptr]) \n\t"
178 #endif
179 "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
180 "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
181 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
182 "gsswlc1 %[ftmp4], 0x03(%[dst_ptr]) \n\t"
183 "gsswrc1 %[ftmp4], 0x00(%[dst_ptr]) \n\t"
184 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
185 [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
186 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
187 [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]),
188 [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]),
189 [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr)
190 : [ip]"r"(input), [ff_ph_22a3]"f"(ff_ph_22a3),
191 [ff_ph_4e7b]"f"(ff_ph_4e7b), [ff_ph_04]"f"(ff_ph_04),
192 [pred_stride]"r"((mips_reg)pred_stride),
193 [dst_stride]"r"((mips_reg)dst_stride)
194 : "memory"
195 );
196 }
197
vp8_dc_only_idct_add_mmi(int16_t input_dc,unsigned char * pred_ptr,int pred_stride,unsigned char * dst_ptr,int dst_stride)198 void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr,
199 int pred_stride, unsigned char *dst_ptr,
200 int dst_stride) {
201 int a1 = ((input_dc + 4) >> 3);
202 double ftmp[5];
203 int low32;
204
205 __asm__ volatile (
206 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
207 "pshufh %[a1], %[a1], %[ftmp0] \n\t"
208 "ulw %[low32], 0x00(%[pred_ptr]) \n\t"
209 "mtc1 %[low32], %[ftmp1] \n\t"
210 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
211 "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t"
212 "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t"
213 "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t"
214 "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t"
215
216 MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
217 MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
218 "ulw %[low32], 0x00(%[pred_ptr]) \n\t"
219 "mtc1 %[low32], %[ftmp1] \n\t"
220 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
221 "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t"
222 "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t"
223 "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t"
224 "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t"
225
226 MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
227 MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
228 "ulw %[low32], 0x00(%[pred_ptr]) \n\t"
229 "mtc1 %[low32], %[ftmp1] \n\t"
230 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
231 "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t"
232 "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t"
233 "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t"
234 "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t"
235
236 MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
237 MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
238 "ulw %[low32], 0x00(%[pred_ptr]) \n\t"
239 "mtc1 %[low32], %[ftmp1] \n\t"
240 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t"
241 "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t"
242 "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t"
243 "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t"
244 "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t"
245 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
246 [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [low32]"=&r"(low32),
247 [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr)
248 : [dst_stride]"r"((mips_reg)dst_stride),
249 [pred_stride]"r"((mips_reg)pred_stride), [a1]"f"(a1)
250 : "memory"
251 );
252 }
253
/* 4x4 inverse Walsh-Hadamard transform, Loongson MMI.
 * Inverse-transforms the 16 second-order coefficients at `input` and
 * scatters each of the 16 results into `mb_dqcoeff` at stride 16, i.e.
 * one DC value per 16-coefficient block.  Uses a local `output[16]`
 * staging buffer filled by the asm, then the scatter loop below. */
void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) {
  int i;
  int16_t output[16];
  double ftmp[12];
  uint32_t tmp[1];
  /* Rounding constant: four packed 16-bit 3s, added before the >> 3. */
  DECLARE_ALIGNED(8, const uint64_t, ff_ph_03) = { 0x0003000300030003ULL };

  __asm__ volatile (
    // ftmp11 <- 3: shift amount for the final psrah
    MMI_LI(%[tmp0], 0x03)
    "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
    "mtc1 %[tmp0], %[ftmp11] \n\t"
    // load the four coefficient rows, one per register
    "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
    "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
    "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t"
    "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t"
    "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t"
    "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t"
    "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t"
    "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t"
    // first butterfly pass (columns)
    "paddh %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
    "psubh %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
    "paddh %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
    "psubh %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

    "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
    "psubh %[ftmp2], %[ftmp5], %[ftmp7] \n\t"
    "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t"
    "paddh %[ftmp4], %[ftmp6], %[ftmp8] \n\t"

    // second butterfly pass on the transposed data (rows)
    TRANSPOSE_4H
    // a
    "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t"
    // d
    "psubh %[ftmp6], %[ftmp1], %[ftmp4] \n\t"
    // b
    "paddh %[ftmp7], %[ftmp2], %[ftmp3] \n\t"
    // c
    "psubh %[ftmp8], %[ftmp2], %[ftmp3] \n\t"

    "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
    "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
    "psubh %[ftmp3], %[ftmp5], %[ftmp7] \n\t"
    "psubh %[ftmp4], %[ftmp6], %[ftmp8] \n\t"

    // rounding shift: (x + 3) >> 3 on every lane
    "paddh %[ftmp1], %[ftmp1], %[ff_ph_03] \n\t"
    "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
    "paddh %[ftmp2], %[ftmp2], %[ff_ph_03] \n\t"
    "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
    "paddh %[ftmp3], %[ftmp3], %[ff_ph_03] \n\t"
    "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
    "paddh %[ftmp4], %[ftmp4], %[ff_ph_03] \n\t"
    "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t"

    // transpose back and store the four result rows to output[]
    TRANSPOSE_4H
    "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t"
    "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t"
    "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t"
    "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t"
    "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t"
    "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t"
    "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t"
    "gssdrc1 %[ftmp4], 0x18(%[op]) \n\t"
    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
      [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
      [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]),
      [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0])
    : [ip]"r"(input), [op]"r"(output), [ff_ph_03]"f"(ff_ph_03)
    : "memory"
  );

  // scatter: output[i] becomes the DC (index 0) of the i-th 16-coeff block
  for (i = 0; i < 16; i++) {
    mb_dqcoeff[i * 16] = output[i];
  }
}
329