1 /*
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/scale.h"
12
13 #include <assert.h>
14 #include <string.h>
15
16 #include "libyuv/cpu_id.h"
17 #include "libyuv/planar_functions.h" // For CopyARGB
18 #include "libyuv/row.h"
19 #include "libyuv/scale_row.h"
20
21 #ifdef __cplusplus
22 namespace libyuv {
23 extern "C" {
24 #endif
25
26 // This module is for Mips MMI.
27 #if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
28
29 // clang-format off
30
31 // CPU agnostic row functions
ScaleRowDown2_MMI(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst,int dst_width)32 void ScaleRowDown2_MMI(const uint8_t* src_ptr,
33 ptrdiff_t src_stride,
34 uint8_t* dst,
35 int dst_width) {
36 (void)src_stride;
37
38 uint64_t src0, src1, dest;
39 const uint64_t shift = 0x8ULL;
40
41 __asm__ volatile(
42 "1: \n\t"
43 "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
44 "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
45 "psrlh %[src0], %[src0], %[shift] \n\t"
46
47 "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
48 "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
49 "psrlh %[src1], %[src1], %[shift] \n\t"
50
51 "packushb %[dest], %[src0], %[src1] \n\t"
52 "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
53 "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
54
55 "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
56 "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
57 "daddi %[width], %[width], -0x08 \n\t"
58 "bnez %[width], 1b \n\t"
59 : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
60 : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
61 [shift] "f"(shift)
62 : "memory");
63 }
64
ScaleRowDown2Linear_MMI(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst,int dst_width)65 void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr,
66 ptrdiff_t src_stride,
67 uint8_t* dst,
68 int dst_width) {
69 (void)src_stride;
70
71 uint64_t src0, src1;
72 uint64_t dest, dest0, dest1;
73
74 const uint64_t mask = 0x00ff00ff00ff00ffULL;
75 const uint64_t shift = 0x8ULL;
76
77 __asm__ volatile(
78 "1: \n\t"
79 "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
80 "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
81 "and %[dest0], %[src0], %[mask] \n\t"
82 "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
83 "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
84 "and %[dest1], %[src1], %[mask] \n\t"
85 "packushb %[dest0], %[dest0], %[dest1] \n\t"
86
87 "psrlh %[src0], %[src0], %[shift] \n\t"
88 "psrlh %[src1], %[src1], %[shift] \n\t"
89 "packushb %[dest1], %[src0], %[src1] \n\t"
90
91 "pavgb %[dest], %[dest0], %[dest1] \n\t"
92 "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
93 "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
94
95 "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
96 "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
97 "daddi %[width], %[width], -0x08 \n\t"
98 "bnez %[width], 1b \n\t"
99 : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest0] "=&f"(dest0),
100 [dest1] "=&f"(dest1), [dest] "=&f"(dest)
101 : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [mask] "f"(mask),
102 [shift] "f"(shift), [width] "r"(dst_width)
103 : "memory");
104 }
105
ScaleRowDown2Box_MMI(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst,int dst_width)106 void ScaleRowDown2Box_MMI(const uint8_t* src_ptr,
107 ptrdiff_t src_stride,
108 uint8_t* dst,
109 int dst_width) {
110 const uint8_t* s = src_ptr;
111 const uint8_t* t = src_ptr + src_stride;
112
113 uint64_t s0, s1, t0, t1;
114 uint64_t dest, dest0, dest1;
115
116 const uint64_t ph = 0x0002000200020002ULL;
117 const uint64_t mask = 0x00ff00ff00ff00ffULL;
118 const uint64_t shift0 = 0x2ULL;
119 const uint64_t shift1 = 0x8ULL;
120
121 __asm__ volatile(
122 "1: \n\t"
123 "gsldrc1 %[s0], 0x00(%[s]) \n\t"
124 "gsldlc1 %[s0], 0x07(%[s]) \n\t"
125 "psrlh %[s1], %[s0], %[shift1] \n\t"
126 "and %[s0], %[s0], %[mask] \n\t"
127
128 "gsldrc1 %[t0], 0x00(%[t]) \n\t"
129 "gsldlc1 %[t0], 0x07(%[t]) \n\t"
130 "psrlh %[t1], %[t0], %[shift1] \n\t"
131 "and %[t0], %[t0], %[mask] \n\t"
132
133 "paddh %[dest0], %[s0], %[s1] \n\t"
134 "paddh %[dest0], %[dest0], %[t0] \n\t"
135 "paddh %[dest0], %[dest0], %[t1] \n\t"
136 "paddh %[dest0], %[dest0], %[ph] \n\t"
137 "psrlh %[dest0], %[dest0], %[shift0] \n\t"
138
139 "gsldrc1 %[s0], 0x08(%[s]) \n\t"
140 "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
141 "psrlh %[s1], %[s0], %[shift1] \n\t"
142 "and %[s0], %[s0], %[mask] \n\t"
143
144 "gsldrc1 %[t0], 0x08(%[t]) \n\t"
145 "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
146 "psrlh %[t1], %[t0], %[shift1] \n\t"
147 "and %[t0], %[t0], %[mask] \n\t"
148
149 "paddh %[dest1], %[s0], %[s1] \n\t"
150 "paddh %[dest1], %[dest1], %[t0] \n\t"
151 "paddh %[dest1], %[dest1], %[t1] \n\t"
152 "paddh %[dest1], %[dest1], %[ph] \n\t"
153 "psrlh %[dest1], %[dest1], %[shift0] \n\t"
154
155 "packushb %[dest], %[dest0], %[dest1] \n\t"
156 "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
157 "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
158
159 "daddiu %[s], %[s], 0x10 \n\t"
160 "daddiu %[t], %[t], 0x10 \n\t"
161 "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
162 "daddi %[width], %[width], -0x08 \n\t"
163 "bnez %[width], 1b \n\t"
164 : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1),
165 [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest] "=&f"(dest)
166 : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width),
167 [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph),
168 [mask] "f"(mask)
169 : "memory");
170 }
171
ScaleARGBRowDown2_MMI(const uint8_t * src_argb,ptrdiff_t src_stride,uint8_t * dst_argb,int dst_width)172 void ScaleARGBRowDown2_MMI(const uint8_t* src_argb,
173 ptrdiff_t src_stride,
174 uint8_t* dst_argb,
175 int dst_width) {
176 (void)src_stride;
177
178 const uint32_t* src = (const uint32_t*)(src_argb);
179 uint32_t* dst = (uint32_t*)(dst_argb);
180
181 uint64_t src0, src1, dest;
182
183 __asm__ volatile(
184 "1: \n\t"
185 "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
186 "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
187 "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
188 "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
189 "punpckhwd %[dest], %[src0], %[src1] \n\t"
190
191 "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
192 "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
193
194 "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
195 "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
196 "daddi %[width], %[width], -0x02 \n\t"
197 "bnez %[width], 1b \n\t"
198 : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
199 : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width)
200 : "memory");
201 }
202
ScaleARGBRowDown2Linear_MMI(const uint8_t * src_argb,ptrdiff_t src_stride,uint8_t * dst_argb,int dst_width)203 void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb,
204 ptrdiff_t src_stride,
205 uint8_t* dst_argb,
206 int dst_width) {
207 (void)src_stride;
208
209 uint64_t src0, src1;
210 uint64_t dest, dest_hi, dest_lo;
211
212 __asm__ volatile(
213 "1: \n\t"
214 "lwc1 %[src0], 0x00(%[src_ptr]) \n\t"
215 "lwc1 %[src1], 0x08(%[src_ptr]) \n\t"
216 "punpcklwd %[dest_lo], %[src0], %[src1] \n\t"
217 "lwc1 %[src0], 0x04(%[src_ptr]) \n\t"
218 "lwc1 %[src1], 0x0c(%[src_ptr]) \n\t"
219 "punpcklwd %[dest_hi], %[src0], %[src1] \n\t"
220
221 "pavgb %[dest], %[dest_lo], %[dest_hi] \n\t"
222 "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
223 "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
224
225 "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
226 "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
227 "daddi %[width], %[width], -0x02 \n\t"
228 "bnez %[width], 1b \n\t"
229 : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
230 [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
231 : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width)
232 : "memory");
233 }
234
ScaleARGBRowDown2Box_MMI(const uint8_t * src_argb,ptrdiff_t src_stride,uint8_t * dst_argb,int dst_width)235 void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb,
236 ptrdiff_t src_stride,
237 uint8_t* dst_argb,
238 int dst_width) {
239 const uint8_t* s = src_argb;
240 const uint8_t* t = src_argb + src_stride;
241
242 uint64_t s0, s_hi, s_lo;
243 uint64_t t0, t_hi, t_lo;
244 uint64_t dest, dest_hi, dest_lo;
245
246 const uint64_t mask = 0x0ULL;
247 const uint64_t ph = 0x0002000200020002ULL;
248 const uint64_t shfit = 0x2ULL;
249
250 __asm__ volatile(
251 "1: \n\t"
252 "gsldrc1 %[s0], 0x00(%[s]) \n\t"
253 "gsldlc1 %[s0], 0x07(%[s]) \n\t"
254 "punpcklbh %[s_lo], %[s0], %[mask] \n\t"
255 "punpckhbh %[s_hi], %[s0], %[mask] \n\t"
256 "paddh %[dest_lo], %[s_lo], %[s_hi] \n\t"
257
258 "gsldrc1 %[t0], 0x00(%[t]) \n\t"
259 "gsldlc1 %[t0], 0x07(%[t]) \n\t"
260 "punpcklbh %[t_lo], %[t0], %[mask] \n\t"
261 "punpckhbh %[t_hi], %[t0], %[mask] \n\t"
262 "paddh %[dest_lo], %[dest_lo], %[t_lo] \n\t"
263 "paddh %[dest_lo], %[dest_lo], %[t_hi] \n\t"
264
265 "paddh %[dest_lo], %[dest_lo], %[ph] \n\t"
266 "psrlh %[dest_lo], %[dest_lo], %[shfit] \n\t"
267
268 "gsldrc1 %[s0], 0x08(%[s]) \n\t"
269 "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
270 "punpcklbh %[s_lo], %[s0], %[mask] \n\t"
271 "punpckhbh %[s_hi], %[s0], %[mask] \n\t"
272 "paddh %[dest_hi], %[s_lo], %[s_hi] \n\t"
273
274 "gsldrc1 %[t0], 0x08(%[t]) \n\t"
275 "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
276 "punpcklbh %[t_lo], %[t0], %[mask] \n\t"
277 "punpckhbh %[t_hi], %[t0], %[mask] \n\t"
278 "paddh %[dest_hi], %[dest_hi], %[t_lo] \n\t"
279 "paddh %[dest_hi], %[dest_hi], %[t_hi] \n\t"
280
281 "paddh %[dest_hi], %[dest_hi], %[ph] \n\t"
282 "psrlh %[dest_hi], %[dest_hi], %[shfit] \n\t"
283
284 "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
285 "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
286 "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
287
288 "daddiu %[s], %[s], 0x10 \n\t"
289 "daddiu %[t], %[t], 0x10 \n\t"
290 "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
291 "daddi %[width], %[width], -0x02 \n\t"
292 "bnez %[width], 1b \n\t"
293 : [s0] "=&f"(s0), [t0] "=&f"(t0), [dest_hi] "=&f"(dest_hi),
294 [dest_lo] "=&f"(dest_lo), [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo),
295 [t_hi] "=&f"(t_hi), [t_lo] "=&f"(t_lo), [dest] "=&f"(dest)
296 : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width),
297 [mask] "f"(mask), [ph] "f"(ph), [shfit] "f"(shfit)
298 : "memory");
299 }
300
ScaleRowDown2_16_MMI(const uint16_t * src_ptr,ptrdiff_t src_stride,uint16_t * dst,int dst_width)301 void ScaleRowDown2_16_MMI(const uint16_t* src_ptr,
302 ptrdiff_t src_stride,
303 uint16_t* dst,
304 int dst_width) {
305 (void)src_stride;
306
307 uint64_t src0, src1, dest;
308 const uint64_t shift = 0x10ULL;
309
310 __asm__ volatile(
311 "1: \n\t"
312 "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
313 "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
314 "psrlw %[src0], %[src0], %[shift] \n\t"
315
316 "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
317 "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
318 "psrlw %[src1], %[src1], %[shift] \n\t"
319
320 "packsswh %[dest], %[src0], %[src1] \n\t"
321 "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
322 "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
323
324 "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
325 "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
326 "daddi %[width], %[width], -0x04 \n\t"
327 "bnez %[width], 1b \n\t"
328 : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
329 : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
330 [shift] "f"(shift)
331 : "memory");
332 }
333
ScaleRowDown2Linear_16_MMI(const uint16_t * src_ptr,ptrdiff_t src_stride,uint16_t * dst,int dst_width)334 void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr,
335 ptrdiff_t src_stride,
336 uint16_t* dst,
337 int dst_width) {
338 (void)src_stride;
339
340 uint64_t src0, src1;
341 uint64_t dest, dest_hi, dest_lo;
342
343 __asm__ volatile(
344 "1: \n\t"
345 "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
346 "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
347 "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
348 "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
349 "punpcklhw %[dest_lo], %[src0], %[src1] \n\t"
350 "punpckhhw %[dest_hi], %[src0], %[src1] \n\t"
351
352 "punpcklhw %[src0], %[dest_lo], %[dest_hi] \n\t"
353 "punpckhhw %[src1], %[dest_lo], %[dest_hi] \n\t"
354
355 "pavgh %[dest], %[src0], %[src1] \n\t"
356 "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
357 "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
358
359 "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
360 "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
361 "daddi %[width], %[width], -0x04 \n\t"
362 "bnez %[width], 1b \n\t"
363 : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
364 [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
365 : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width)
366 : "memory");
367 }
368
ScaleRowDown2Box_16_MMI(const uint16_t * src_ptr,ptrdiff_t src_stride,uint16_t * dst,int dst_width)369 void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr,
370 ptrdiff_t src_stride,
371 uint16_t* dst,
372 int dst_width) {
373 const uint16_t* s = src_ptr;
374 const uint16_t* t = src_ptr + src_stride;
375
376 uint64_t s0, s1, s_hi, s_lo;
377 uint64_t t0, t1, t_hi, t_lo;
378 uint64_t dest, dest0, dest1;
379
380 const uint64_t ph = 0x0000000200000002ULL;
381 const uint64_t mask = 0x0000ffff0000ffffULL;
382 const uint64_t shift0 = 0x10ULL;
383 const uint64_t shift1 = 0x2ULL;
384
385 __asm__ volatile(
386 "1: \n\t"
387 "gsldrc1 %[s0], 0x00(%[s]) \n\t"
388 "gsldlc1 %[s0], 0x07(%[s]) \n\t"
389 "psrlw %[s1], %[s0], %[shift0] \n\t"
390 "and %[s0], %[s0], %[mask] \n\t"
391
392 "gsldrc1 %[t0], 0x00(%[t]) \n\t"
393 "gsldlc1 %[t0], 0x07(%[t]) \n\t"
394 "psrlw %[t1], %[t0], %[shift0] \n\t"
395 "and %[t0], %[t0], %[mask] \n\t"
396
397 "paddw %[dest0], %[s0], %[s1] \n\t"
398 "paddw %[dest0], %[dest0], %[t0] \n\t"
399 "paddw %[dest0], %[dest0], %[t1] \n\t"
400 "paddw %[dest0], %[dest0], %[ph] \n\t"
401 "psrlw %[dest0], %[dest0], %[shift1] \n\t"
402
403 "gsldrc1 %[s0], 0x08(%[s]) \n\t"
404 "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
405 "psrlw %[s1], %[s0], %[shift0] \n\t"
406 "and %[s0], %[s0], %[mask] \n\t"
407
408 "gsldrc1 %[t0], 0x08(%[t]) \n\t"
409 "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
410 "psrlw %[t1], %[t0], %[shift0] \n\t"
411 "and %[t0], %[t0], %[mask] \n\t"
412
413 "paddw %[dest1], %[s0], %[s1] \n\t"
414 "paddw %[dest1], %[dest1], %[t0] \n\t"
415 "paddw %[dest1], %[dest1], %[t1] \n\t"
416 "paddw %[dest1], %[dest1], %[ph] \n\t"
417 "psrlw %[dest1], %[dest1], %[shift1] \n\t"
418
419 "packsswh %[dest], %[dest0], %[dest1] \n\t"
420 "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
421 "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
422
423 "daddiu %[s], %[s], 0x10 \n\t"
424 "daddiu %[t], %[t], 0x10 \n\t"
425 "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
426 "daddi %[width], %[width], -0x04 \n\t"
427 "bnez %[width], 1b \n\t"
428 : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1),
429 [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), [t_hi] "=&f"(t_hi),
430 [t_lo] "=&f"(t_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1),
431 [dest] "=&f"(dest)
432 : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width),
433 [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph),
434 [mask] "f"(mask)
435 : "memory");
436 }
437
ScaleRowDown4_MMI(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst,int dst_width)438 void ScaleRowDown4_MMI(const uint8_t* src_ptr,
439 ptrdiff_t src_stride,
440 uint8_t* dst,
441 int dst_width) {
442 (void)src_stride;
443
444 uint64_t src0, src1;
445 uint64_t dest, dest_hi, dest_lo;
446
447 const uint64_t shift = 0x10ULL;
448 const uint64_t mask = 0x000000ff000000ffULL;
449
450 __asm__ volatile(
451 "1: \n\t"
452 "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
453 "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
454 "psrlw %[src0], %[src0], %[shift] \n\t"
455 "and %[src0], %[src0], %[mask] \n\t"
456 "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
457 "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
458 "psrlw %[src1], %[src1], %[shift] \n\t"
459 "and %[src1], %[src1], %[mask] \n\t"
460 "packsswh %[dest_lo], %[src0], %[src1] \n\t"
461
462 "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t"
463 "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t"
464 "psrlw %[src0], %[src0], %[shift] \n\t"
465 "and %[src0], %[src0], %[mask] \n\t"
466 "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t"
467 "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t"
468 "psrlw %[src1], %[src1], %[shift] \n\t"
469 "and %[src1], %[src1], %[mask] \n\t"
470 "packsswh %[dest_hi], %[src0], %[src1] \n\t"
471
472 "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
473 "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
474 "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
475
476 "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
477 "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
478 "daddi %[width], %[width], -0x08 \n\t"
479 "bnez %[width], 1b \n\t"
480 : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
481 [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
482 : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
483 [shift] "f"(shift), [mask] "f"(mask)
484 : "memory");
485 }
486
ScaleRowDown4_16_MMI(const uint16_t * src_ptr,ptrdiff_t src_stride,uint16_t * dst,int dst_width)487 void ScaleRowDown4_16_MMI(const uint16_t* src_ptr,
488 ptrdiff_t src_stride,
489 uint16_t* dst,
490 int dst_width) {
491 (void)src_stride;
492
493 uint64_t src0, src1;
494 uint64_t dest, dest_hi, dest_lo;
495
496 const uint64_t mask = 0x0ULL;
497
498 __asm__ volatile(
499 "1: \n\t"
500 "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
501 "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
502 "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
503 "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
504 "punpckhhw %[dest_lo], %[src0], %[src1] \n\t"
505 "punpcklhw %[dest_lo], %[dest_lo], %[mask] \n\t"
506
507 "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t"
508 "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t"
509 "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t"
510 "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t"
511 "punpckhhw %[dest_hi], %[src0], %[src1] \n\t"
512 "punpcklhw %[dest_hi], %[dest_hi], %[mask] \n\t"
513
514 "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
515 "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
516 "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
517
518 "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
519 "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
520 "daddi %[width], %[width], -0x04 \n\t"
521 "bnez %[width], 1b \n\t"
522 : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
523 [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
524 : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
525 [mask] "f"(mask)
526 : "memory");
527 }
528
529 #define DO_SCALEROWDOWN4BOX_PUNPCKADD() \
530 "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \
531 "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \
532 "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \
533 "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
534
535 #define DO_SCALEROWDOWN4BOX_LOOP(reg) \
536 "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \
537 "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \
538 "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \
539 \
540 "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \
541 DO_SCALEROWDOWN4BOX_PUNPCKADD() \
542 \
543 "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \
544 DO_SCALEROWDOWN4BOX_PUNPCKADD() \
545 \
546 "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \
547 DO_SCALEROWDOWN4BOX_PUNPCKADD() \
548 \
549 "pmaddhw %[dest_lo], %[dest_lo], %[mask1] \n\t" \
550 "pmaddhw %[dest_hi], %[dest_hi], %[mask1] \n\t" \
551 "packsswh " #reg ", %[dest_lo], %[dest_hi] \n\t" \
552 "pmaddhw " #reg ", " #reg ", %[mask1] \n\t" \
553 "paddh " #reg ", " #reg ", %[ph] \n\t" \
554 "psrlh " #reg ", " #reg ", %[shift] \n\t" \
555 \
556 "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \
557 "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \
558 "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \
559 "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t"
560
561 /* LibYUVScaleTest.ScaleDownBy4_Box */
ScaleRowDown4Box_MMI(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst,int dst_width)562 void ScaleRowDown4Box_MMI(const uint8_t* src_ptr,
563 ptrdiff_t src_stride,
564 uint8_t* dst,
565 int dst_width) {
566 const uint8_t* src0_ptr = src_ptr;
567 const uint8_t* src1_ptr = src_ptr + src_stride;
568 const uint8_t* src2_ptr = src_ptr + src_stride * 2;
569 const uint8_t* src3_ptr = src_ptr + src_stride * 3;
570
571 uint64_t src, src_hi, src_lo;
572 uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3;
573
574 const uint64_t mask0 = 0x0ULL;
575 const uint64_t mask1 = 0x0001000100010001ULL;
576 const uint64_t ph = 0x0008000800080008ULL;
577 const uint64_t shift = 0x4ULL;
578
579 __asm__ volatile(
580 "1: \n\t"
581
582 DO_SCALEROWDOWN4BOX_LOOP(%[dest0])
583 DO_SCALEROWDOWN4BOX_LOOP(%[dest1])
584 DO_SCALEROWDOWN4BOX_LOOP(%[dest2])
585 DO_SCALEROWDOWN4BOX_LOOP(%[dest3])
586
587 "packsswh %[dest_lo], %[dest0], %[dest1] \n\t"
588 "packsswh %[dest_hi], %[dest2], %[dest3] \n\t"
589
590 "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
591 "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
592 "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
593
594 "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
595 "daddi %[width], %[width], -0x08 \n\t"
596 "bnez %[width], 1b \n\t"
597 : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
598 [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
599 [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
600 [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest)
601 : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
602 [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst),
603 [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0),
604 [ph] "f"(ph), [mask1] "f"(mask1)
605 : "memory");
606 }
607
608 #define DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
609 "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \
610 "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \
611 "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \
612 "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
613
614 #define DO_SCALEROWDOWN4BOX_16_LOOP(reg) \
615 "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \
616 "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \
617 "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \
618 \
619 "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \
620 DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
621 \
622 "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \
623 DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
624 \
625 "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \
626 DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
627 \
628 "paddw %[dest], %[dest_lo], %[dest_hi] \n\t" \
629 "punpckhwd %[dest_hi], %[dest], %[dest] \n\t" \
630 "paddw %[dest], %[dest_hi], %[dest] \n\t" \
631 "paddw %[dest], %[dest], %[ph] \n\t" \
632 "psraw %[dest], %[dest], %[shift] \n\t" \
633 "and " #reg ", %[dest], %[mask1] \n\t" \
634 \
635 "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \
636 "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \
637 "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \
638 "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t"
639
640 /* LibYUVScaleTest.ScaleDownBy4_Box_16 */
ScaleRowDown4Box_16_MMI(const uint16_t * src_ptr,ptrdiff_t src_stride,uint16_t * dst,int dst_width)641 void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr,
642 ptrdiff_t src_stride,
643 uint16_t* dst,
644 int dst_width) {
645 const uint16_t* src0_ptr = src_ptr;
646 const uint16_t* src1_ptr = src_ptr + src_stride;
647 const uint16_t* src2_ptr = src_ptr + src_stride * 2;
648 const uint16_t* src3_ptr = src_ptr + src_stride * 3;
649
650 uint64_t src, src_hi, src_lo;
651 uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3;
652
653 const uint64_t mask0 = 0x0ULL;
654 const uint64_t mask1 = 0x00000000ffffffffULL;
655 const uint64_t ph = 0x0000000800000008ULL;
656 const uint64_t shift = 0x04ULL;
657
658 __asm__ volatile(
659 "1: \n\t"
660
661 DO_SCALEROWDOWN4BOX_16_LOOP(%[dest0])
662 DO_SCALEROWDOWN4BOX_16_LOOP(%[dest1])
663 DO_SCALEROWDOWN4BOX_16_LOOP(%[dest2])
664 DO_SCALEROWDOWN4BOX_16_LOOP(%[dest3])
665 "punpcklwd %[dest_lo], %[dest0], %[dest1] \n\t"
666 "punpcklwd %[dest_hi], %[dest2], %[dest3] \n\t"
667
668 "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
669 "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
670 "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
671
672 "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
673 "daddi %[width], %[width], -0x04 \n\t"
674 "bnez %[width], 1b \n\t"
675 : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
676 [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
677 [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
678 [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest)
679 : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
680 [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst),
681 [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0),
682 [ph] "f"(ph), [mask1] "f"(mask1)
683 : "memory");
684 }
685
686 // Scales a single row of pixels up by 2x using point sampling.
ScaleColsUp2_MMI(uint8_t * dst_ptr,const uint8_t * src_ptr,int dst_width,int x,int dx)687 void ScaleColsUp2_MMI(uint8_t* dst_ptr,
688 const uint8_t* src_ptr,
689 int dst_width,
690 int x,
691 int dx) {
692 uint64_t src, dest;
693
694 (void)x;
695 (void)dx;
696
697 __asm__ volatile(
698 "1: \n\t"
699 "lwc1 %[src], 0x00(%[src_ptr]) \n\t"
700
701 "punpcklbh %[dest], %[src], %[src] \n\t"
702 "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
703 "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
704
705 "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t"
706 "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
707 "daddi %[width], %[width], -0x08 \n\t"
708 "bnez %[width], 1b \n\t"
709 : [src] "=&f"(src), [dest] "=&f"(dest)
710 : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width)
711 : "memory");
712 }
713
ScaleColsUp2_16_MMI(uint16_t * dst_ptr,const uint16_t * src_ptr,int dst_width,int x,int dx)714 void ScaleColsUp2_16_MMI(uint16_t* dst_ptr,
715 const uint16_t* src_ptr,
716 int dst_width,
717 int x,
718 int dx) {
719 uint64_t src, dest;
720
721 (void)x;
722 (void)dx;
723
724 __asm__ volatile(
725 "1: \n\t"
726 "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
727 "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
728
729 "punpcklhw %[dest], %[src], %[src] \n\t"
730 "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
731 "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
732
733 "punpckhhw %[dest], %[src], %[src] \n\t"
734 "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
735 "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
736
737 "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
738 "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
739 "daddi %[width], %[width], -0x08 \n\t"
740 "bnez %[width], 1b \n\t"
741 : [src] "=&f"(src), [dest] "=&f"(dest)
742 : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width)
743 : "memory");
744 }
745
ScaleAddRow_MMI(const uint8_t * src_ptr,uint16_t * dst_ptr,int src_width)746 void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
747 uint64_t src, src_hi, src_lo, dest0, dest1;
748 const uint64_t mask = 0x0ULL;
749
750 __asm__ volatile(
751 "1: \n\t"
752 "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
753 "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
754 "punpcklbh %[src_lo], %[src], %[mask] \n\t"
755 "punpckhbh %[src_hi], %[src], %[mask] \n\t"
756
757 "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
758 "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
759 "paddush %[dest0], %[dest0], %[src_lo] \n\t"
760 "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
761 "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
762 "paddush %[dest1], %[dest1], %[src_hi] \n\t"
763
764 "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
765 "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
766 "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
767 "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
768
769 "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
770 "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
771 "daddi %[width], %[width], -0x08 \n\t"
772 "bnez %[width], 1b \n\t"
773 : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi),
774 [src_lo] "=&f"(src_lo), [src] "=&f"(src)
775 : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width),
776 [mask] "f"(mask)
777 : "memory");
778 }
779
ScaleAddRow_16_MMI(const uint16_t * src_ptr,uint32_t * dst_ptr,int src_width)780 void ScaleAddRow_16_MMI(const uint16_t* src_ptr,
781 uint32_t* dst_ptr,
782 int src_width) {
783 uint64_t src, src_hi, src_lo, dest0, dest1;
784 const uint64_t mask = 0x0ULL;
785
786 __asm__ volatile(
787 "1: \n\t"
788 "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
789 "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
790 "punpcklhw %[src_lo], %[src], %[mask] \n\t"
791 "punpckhhw %[src_hi], %[src], %[mask] \n\t"
792
793 "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
794 "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
795 "paddw %[dest0], %[dest0], %[src_lo] \n\t"
796 "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
797 "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
798
799 "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
800 "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
801 "paddw %[dest1], %[dest1], %[src_hi] \n\t"
802 "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
803 "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
804
805 "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
806 "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
807 "daddi %[width], %[width], -0x04 \n\t"
808 "bnez %[width], 1b \n\t"
809 : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi),
810 [src_lo] "=&f"(src_lo), [src] "=&f"(src)
811 : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width),
812 [mask] "f"(mask)
813 : "memory");
814 }
815
ScaleARGBRowDownEven_MMI(const uint8_t * src_argb,ptrdiff_t src_stride,int src_stepx,uint8_t * dst_argb,int dst_width)816 void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb,
817 ptrdiff_t src_stride,
818 int src_stepx,
819 uint8_t* dst_argb,
820 int dst_width) {
821 (void)src_stride;
822
823 uint64_t src0, src1, dest;
824
825 __asm__ volatile(
826 "1: \n\t"
827 "lwc1 %[src0], 0x00(%[src_ptr]) \n\t"
828 "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t"
829 "lwc1 %[src1], 0x00(%[src_ptr]) \n\t"
830 "punpcklwd %[dest], %[src0], %[src1] \n\t"
831
832 "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
833 "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
834
835 "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t"
836 "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
837 "daddi %[width], %[width], -0x02 \n\t"
838 "bnez %[width], 1b \n\t"
839 : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
840 : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb),
841 [src_stepx_4] "r"(src_stepx << 2), [width] "r"(dst_width)
842 : "memory");
843 }
844
ScaleARGBRowDownEvenBox_MMI(const uint8_t * src_argb,ptrdiff_t src_stride,int src_stepx,uint8_t * dst_argb,int dst_width)845 void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb,
846 ptrdiff_t src_stride,
847 int src_stepx,
848 uint8_t* dst_argb,
849 int dst_width) {
850 const uint8_t* src0_ptr = src_argb;
851 const uint8_t* src1_ptr = src_argb + src_stride;
852
853 uint64_t src0, src1, src_hi, src_lo;
854 uint64_t dest, dest_hi, dest_lo, dest0, dest1;
855
856 const uint64_t mask = 0x0ULL;
857 const uint64_t ph = 0x0002000200020002ULL;
858 const uint64_t shift = 0x2ULL;
859
860 __asm__ volatile(
861 "1: \n\t"
862
863 "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t"
864 "punpcklbh %[dest_lo], %[src0], %[mask] \n\t"
865 "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t"
866 "punpcklbh %[dest_hi], %[src0], %[mask] \n\t"
867
868 "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t"
869 "punpcklbh %[src_lo], %[src1], %[mask] \n\t"
870 "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t"
871 "punpcklbh %[src_hi], %[src1], %[mask] \n\t"
872 "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t"
873 "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
874 "paddh %[dest0], %[dest_hi], %[dest_lo] \n\t"
875 "paddh %[dest0], %[dest0], %[ph] \n\t"
876 "psrlh %[dest0], %[dest0], %[shift] \n\t"
877
878 "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t"
879 "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t"
880
881 "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t"
882 "punpcklbh %[dest_lo], %[src0], %[mask] \n\t"
883 "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t"
884 "punpcklbh %[dest_hi], %[src0], %[mask] \n\t"
885
886 "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t"
887 "punpcklbh %[src_lo], %[src1], %[mask] \n\t"
888 "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t"
889 "punpcklbh %[src_hi], %[src1], %[mask] \n\t"
890 "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t"
891 "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
892 "paddh %[dest1], %[dest_hi], %[dest_lo] \n\t"
893 "paddh %[dest1], %[dest1], %[ph] \n\t"
894 "psrlh %[dest1], %[dest1], %[shift] \n\t"
895
896 "packushb %[dest], %[dest0], %[dest1] \n\t"
897 "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
898 "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
899
900 "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t"
901 "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t"
902 "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
903 "daddi %[width], %[width], -0x02 \n\t"
904 "bnez %[width], 1b \n\t"
905 : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
906 [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
907 [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0),
908 [src1] "=&f"(src1), [dest] "=&f"(dest)
909 : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
910 [dst_ptr] "r"(dst_argb), [width] "r"(dst_width),
911 [src_stepx_4] "r"(src_stepx << 2), [shift] "f"(shift), [mask] "f"(mask),
912 [ph] "f"(ph)
913 : "memory");
914 }
915
916 // Scales a single row of pixels using point sampling.
ScaleARGBCols_MMI(uint8_t * dst_argb,const uint8_t * src_argb,int dst_width,int x,int dx)917 void ScaleARGBCols_MMI(uint8_t* dst_argb,
918 const uint8_t* src_argb,
919 int dst_width,
920 int x,
921 int dx) {
922 const uint32_t* src = (const uint32_t*)(src_argb);
923 uint32_t* dst = (uint32_t*)(dst_argb);
924
925 const uint32_t* src_tmp;
926
927 uint64_t dest, offset;
928
929 const uint64_t shift0 = 16;
930 const uint64_t shift1 = 2;
931
932 __asm__ volatile(
933 "1: \n\t"
934 "srav %[offset], %[x], %[shift0] \n\t"
935 "sllv %[offset], %[offset], %[shift1] \n\t"
936 "dadd %[src_tmp], %[src_ptr], %[offset] \n\t"
937 "lwc1 %[dest], 0x00(%[src_tmp]) \n\t"
938 "swc1 %[dest], 0x00(%[dst_ptr]) \n\t"
939
940 "dadd %[x], %[x], %[dx] \n\t"
941
942 "daddiu %[dst_ptr], %[dst_ptr], 0x04 \n\t"
943 "daddi %[width], %[width], -0x01 \n\t"
944 "bnez %[width], 1b \n\t"
945 : [dest] "=&f"(dest), [offset] "=&r"(offset), [src_tmp] "=&r"(src_tmp)
946 : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width),
947 [dx] "r"(dx), [x] "r"(x), [shift0] "r"(shift0), [shift1] "r"(shift1)
948 : "memory");
949 }
950
951 // Scales a single row of pixels up by 2x using point sampling.
ScaleARGBColsUp2_MMI(uint8_t * dst_argb,const uint8_t * src_argb,int dst_width,int x,int dx)952 void ScaleARGBColsUp2_MMI(uint8_t* dst_argb,
953 const uint8_t* src_argb,
954 int dst_width,
955 int x,
956 int dx) {
957 uint64_t src, dest0, dest1;
958 (void)x;
959 (void)dx;
960
961 __asm__ volatile(
962 "1: \n\t"
963 "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
964 "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
965 "punpcklwd %[dest0], %[src], %[src] \n\t"
966 "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
967 "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
968 "punpckhwd %[dest1], %[src], %[src] \n\t"
969 "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
970 "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
971
972 "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
973 "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
974 "daddi %[width], %[width], -0x04 \n\t"
975 "bnez %[width], 1b \n\t"
976 : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src] "=&f"(src)
977 : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width)
978 : "memory");
979 }
980
981 // Divide num by div and return as 16.16 fixed point result.
982 /* LibYUVBaseTest.TestFixedDiv */
FixedDiv_MIPS(int num,int div)983 int FixedDiv_MIPS(int num, int div) {
984 int quotient = 0;
985 const int shift = 16;
986
987 asm(
988 "dsll %[num], %[num], %[shift] \n\t"
989 "ddiv %[num], %[div] \t\n"
990 "mflo %[quo] \t\n"
991 : [quo] "+&r"(quotient)
992 : [num] "r"(num), [div] "r"(div), [shift] "r"(shift));
993
994 return quotient;
995 }
996
997 // Divide num by div and return as 16.16 fixed point result.
998 /* LibYUVScaleTest.ARGBScaleTo320x240_Linear */
FixedDiv1_MIPS(int num,int div)999 int FixedDiv1_MIPS(int num, int div) {
1000 int quotient = 0;
1001 const int shift = 16;
1002 const int val1 = 1;
1003 const int64_t val11 = 0x00010001ULL;
1004
1005 asm(
1006 "dsll %[num], %[num], %[shift] \n\t"
1007 "dsub %[num], %[num], %[val11] \n\t"
1008 "dsub %[div], %[div], %[val1] \n\t"
1009 "ddiv %[num], %[div] \t\n"
1010 "mflo %[quo] \t\n"
1011 : [quo] "+&r"(quotient)
1012 : [num] "r"(num), [div] "r"(div), [val1] "r"(val1), [val11] "r"(val11),
1013 [shift] "r"(shift));
1014
1015 return quotient;
1016 }
1017
1018 // Read 8x2 upsample with filtering and write 16x1.
1019 // actually reads an extra pixel, so 9x2.
ScaleRowUp2_16_MMI(const uint16_t * src_ptr,ptrdiff_t src_stride,uint16_t * dst,int dst_width)1020 void ScaleRowUp2_16_MMI(const uint16_t* src_ptr,
1021 ptrdiff_t src_stride,
1022 uint16_t* dst,
1023 int dst_width) {
1024 const uint16_t* src2_ptr = src_ptr + src_stride;
1025
1026 uint64_t src0, src1;
1027 uint64_t dest, dest04, dest15, dest26, dest37;
1028 uint64_t tmp0, tmp1, tmp2, tmp3;
1029
1030 const uint64_t mask0 = 0x0003000900030009ULL;
1031 const uint64_t mask1 = 0x0001000300010003ULL;
1032 const uint64_t mask2 = 0x0009000300090003ULL;
1033 const uint64_t mask3 = 0x0003000100030001ULL;
1034 const uint64_t ph = 0x0000000800000008ULL;
1035 const uint64_t shift = 4;
1036
1037 __asm__ volatile(
1038 "1: \n\t"
1039 "gsldrc1 %[src0], 0x00(%[src1_ptr]) \n\t"
1040 "gsldlc1 %[src0], 0x07(%[src1_ptr]) \n\t"
1041 "pmaddhw %[dest04], %[src0], %[mask0] \n\t"
1042 "gsldrc1 %[src1], 0x00(%[src2_ptr]) \n\t"
1043 "gsldlc1 %[src1], 0x07(%[src2_ptr]) \n\t"
1044 "pmaddhw %[dest], %[src1], %[mask1] \n\t"
1045 "paddw %[dest04], %[dest04], %[dest] \n\t"
1046 "paddw %[dest04], %[dest04], %[ph] \n\t"
1047 "psrlw %[dest04], %[dest04], %[shift] \n\t"
1048
1049 "pmaddhw %[dest15], %[src0], %[mask2] \n\t"
1050 "pmaddhw %[dest], %[src1], %[mask3] \n\t"
1051 "paddw %[dest15], %[dest15], %[dest] \n\t"
1052 "paddw %[dest15], %[dest15], %[ph] \n\t"
1053 "psrlw %[dest15], %[dest15], %[shift] \n\t"
1054
1055 "gsldrc1 %[src0], 0x02(%[src1_ptr]) \n\t"
1056 "gsldlc1 %[src0], 0x09(%[src1_ptr]) \n\t"
1057 "pmaddhw %[dest26], %[src0], %[mask0] \n\t"
1058 "gsldrc1 %[src1], 0x02(%[src2_ptr]) \n\t"
1059 "gsldlc1 %[src1], 0x09(%[src2_ptr]) \n\t"
1060 "pmaddhw %[dest], %[src1], %[mask1] \n\t"
1061 "paddw %[dest26], %[dest26], %[dest] \n\t"
1062 "paddw %[dest26], %[dest26], %[ph] \n\t"
1063 "psrlw %[dest26], %[dest26], %[shift] \n\t"
1064
1065 "pmaddhw %[dest37], %[src0], %[mask2] \n\t"
1066 "pmaddhw %[dest], %[src1], %[mask3] \n\t"
1067 "paddw %[dest37], %[dest37], %[dest] \n\t"
1068 "paddw %[dest37], %[dest37], %[ph] \n\t"
1069 "psrlw %[dest37], %[dest37], %[shift] \n\t"
1070
1071 /* tmp0 = ( 00 04 02 06 ) */
1072 "packsswh %[tmp0], %[dest04], %[dest26] \n\t"
1073 /* tmp1 = ( 01 05 03 07 ) */
1074 "packsswh %[tmp1], %[dest15], %[dest37] \n\t"
1075
1076 /* tmp2 = ( 00 01 04 05 )*/
1077 "punpcklhw %[tmp2], %[tmp0], %[tmp1] \n\t"
1078 /* tmp3 = ( 02 03 06 07 )*/
1079 "punpckhhw %[tmp3], %[tmp0], %[tmp1] \n\t"
1080
1081 /* ( 00 01 02 03 ) */
1082 "punpcklwd %[dest], %[tmp2], %[tmp3] \n\t"
1083 "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
1084 "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
1085
1086 /* ( 04 05 06 07 ) */
1087 "punpckhwd %[dest], %[tmp2], %[tmp3] \n\t"
1088 "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
1089 "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
1090
1091 "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
1092 "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t"
1093 "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
1094 "daddi %[width], %[width], -0x08 \n\t"
1095 "bnez %[width], 1b \n\t"
1096 : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest04] "=&f"(dest04),
1097 [dest15] "=&f"(dest15), [dest26] "=&f"(dest26), [dest37] "=&f"(dest37),
1098 [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
1099 [tmp3] "=&f"(tmp3), [dest] "=&f"(dest)
1100 : [src1_ptr] "r"(src_ptr), [src2_ptr] "r"(src2_ptr), [dst_ptr] "r"(dst),
1101 [width] "r"(dst_width), [mask0] "f"(mask0), [mask1] "f"(mask1),
1102 [mask2] "f"(mask2), [mask3] "f"(mask3), [shift] "f"(shift), [ph] "f"(ph)
1103 : "memory");
1104 }
1105
1106 // clang-format on
1107
1108 #endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
1109
1110 #ifdef __cplusplus
1111 } // extern "C"
1112 } // namespace libyuv
1113 #endif
1114