1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17
18
19#include <machine/cpu-features.h>
20#include <machine/asm.h>
21
22/*
23        r0 = dst
24        r1 = y0 base pointer
25        r2 = y1 base pointer
26        r3 = y2 base pointer
27        sp = coeffs
28        sp+4 = length / 2
29*/
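/*
        For reference, a rough C sketch of what is computed for a single output
        pixel; the function name and types below are illustrative only (this is
        not the RenderScript reference code), and edge handling is assumed to be
        done by the caller:

        static void convolve3x3_ref(uint8_t *out, const uint8_t *y0, const uint8_t *y1,
                                    const uint8_t *y2, const int16_t coeff[9], int x) {
            for (int c = 0; c < 4; c++) {              // R, G, B, A
                int32_t sum = 0;
                for (int i = 0; i < 3; i++) {          // column offset within each row
                    sum += y0[(x + i) * 4 + c] * coeff[i];
                    sum += y1[(x + i) * 4 + c] * coeff[i + 3];
                    sum += y2[(x + i) * 4 + c] * coeff[i + 6];
                }
                sum >>= 8;                             // coefficients are Q8 fixed point
                out[x * 4 + c] = sum < 0 ? 0 : (sum > 255 ? 255 : (uint8_t)sum);
            }
        }
*/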
30
31ENTRY(rsdIntrinsicConvolve3x3_K)
32        push            {r4-r8, r10, r11, lr}
33        vpush           {q4-q7}
34
35        /* Get the coeffs pointer from the stack and load the
36           coefficients in the q0, q1 NEON registers */
37        ldr r4, [sp, #32+64]
38        vld1.16 {q0, q1}, [r4]
39
40        /* Get count from the stack */
41        ldr r4, [sp, #36+64]
42
43        /* Load the frequently used immediate in a register */
44        mov r5, #8
45
461:
47        /* Load and post-increment the address by r5=#8 */
48        vld1.8 {q13}, [r1], r5
49        vld1.8 {q14}, [r2], r5
50        vld1.8 {q15}, [r3], r5
51
52        /* Prefetch data that will be needed two iterations from now */
53        PLD         (r1, r5)
54        PLD         (r2, r5)
55        PLD         (r3, r5)
56
57        vmovl.u8 q2, d26
58        vmovl.u8 q3, d27
59        vmovl.u8 q4, d28
60        vmovl.u8 q5, d29
61        vmovl.u8 q6, d30
62        vmovl.u8 q7, d31
63
64/*
65        The two pixel source array is
66        d4,  d5,  d6,  d7
67        d8,  d9,  d10, d11
68        d12, d13, d14, d15
69*/
70
71        vmull.s16 q8, d4, d0[0]
72        vmlal.s16 q8, d5, d0[1]
73        vmlal.s16 q8, d6, d0[2]
74        vmlal.s16 q8, d8, d0[3]
75        vmlal.s16 q8, d9, d1[0]
76        vmlal.s16 q8, d10, d1[1]
77        vmlal.s16 q8, d12, d1[2]
78        vmlal.s16 q8, d13, d1[3]
79        vmlal.s16 q8, d14, d2[0]
80
81        vmull.s16 q9, d5, d0[0]
82        vmlal.s16 q9, d6, d0[1]
83        vmlal.s16 q9, d7, d0[2]
84        vmlal.s16 q9, d9, d0[3]
85        vmlal.s16 q9, d10, d1[0]
86        vmlal.s16 q9, d11, d1[1]
87        vmlal.s16 q9, d13, d1[2]
88        vmlal.s16 q9, d14, d1[3]
89        vmlal.s16 q9, d15, d2[0]
90
91        vshrn.i32 d16, q8, #8
92        vshrn.i32 d17, q9, #8
93
94        vqmovun.s16 d16, q8
95        vst1.8 d16, [r0]!
96
97        /* Are we done yet? */
98        subs r4, r4, #1
99        bne 1b
100
101        /* We're done, bye! */
102        vpop            {q4-q7}
103        pop             {r4-r8, r10, r11, lr}
104        bx              lr
105END(rsdIntrinsicConvolve3x3_K)
106
107/*
108        r0 = dst
109        r1 = src
110        r2 = matrix
111        r3 = length
112*/
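/*
        A minimal C sketch of the per-pixel operation, assuming the same 16 Q8
        fixed-point coefficients that are loaded into q2/q3 below; names and the
        exact clamping style are illustrative:

        static void colorMatrix4x4_ref(uint8_t out[4], const uint8_t in[4],
                                       const int16_t m[16]) {
            for (int i = 0; i < 4; i++) {              // output channel
                int32_t sum = 0;
                for (int j = 0; j < 4; j++)            // input channel
                    sum += in[j] * m[j * 4 + i];
                sum >>= 8;                             // drop the Q8 fraction
                out[i] = sum < 0 ? 0 : (sum > 255 ? 255 : (uint8_t)sum);
            }
        }
*/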
113ENTRY(rsdIntrinsicColorMatrix4x4_K)
114        stmfd           sp!, {r4, lr}
115        vpush           {q4-q7}
116
117        vld1.16 {q2}, [r2]!
118        vld1.16 {q3}, [r2]!
119
1201:
121        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
122        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
123        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
124        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
125
126        vmovl.u8 q12, d0  /* R */
127        vmovl.u8 q13, d1  /* G */
128        vmovl.u8 q14, d2  /* B */
129        vmovl.u8 q15, d3  /* A */
130
131        vmull.s16 q8,  d24, d4[0]
132        vmull.s16 q9,  d24, d4[1]
133        vmull.s16 q10, d24, d4[2]
134        vmull.s16 q11, d24, d4[3]
135
136        vmlal.s16 q8,  d26, d5[0]
137        vmlal.s16 q9,  d26, d5[1]
138        vmlal.s16 q10, d26, d5[2]
139        vmlal.s16 q11, d26, d5[3]
140
141        vmlal.s16 q8,  d28, d6[0]
142        vmlal.s16 q9,  d28, d6[1]
143        vmlal.s16 q10, d28, d6[2]
144        vmlal.s16 q11, d28, d6[3]
145
146        vmlal.s16 q8,  d30, d7[0]
147        vmlal.s16 q9,  d30, d7[1]
148        vmlal.s16 q10, d30, d7[2]
149        vmlal.s16 q11, d30, d7[3]
150
151        vshrn.i32 d24, q8, #8
152        vshrn.i32 d26, q9, #8
153        vshrn.i32 d28, q10, #8
154        vshrn.i32 d30, q11, #8
155
156        vqmovun.s16 d0, q12
157        vqmovun.s16 d1, q13
158        vqmovun.s16 d2, q14
159        vqmovun.s16 d3, q15
160
161        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
162        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
163        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
164        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
165
166        subs r3, r3, #1
167        bne 1b
168
169        vpop            {q4-q7}
170        ldmfd           sp!, {r4, lr}
171        bx              lr
172END(rsdIntrinsicColorMatrix4x4_K)
173
174/*
175        r0 = dst
176        r1 = src
177        r2 = matrix
178        r3 = length
179*/
180ENTRY(rsdIntrinsicColorMatrix3x3_K)
181        stmfd           sp!, {r4, lr}
182        vpush           {q4-q7}
183
184        vld1.16 {q2}, [r2]!
185        vld1.16 {q3}, [r2]!
186
1871:
188        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
189        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
190        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
191        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
192
193        vmovl.u8 q12, d0
194        vmovl.u8 q13, d1
195        vmovl.u8 q14, d2
196
197        vmull.s16 q8,  d24, d4[0]
198        vmull.s16 q9,  d24, d4[1]
199        vmull.s16 q10, d24, d4[2]
200
201        vmlal.s16 q8,  d26, d5[0]
202        vmlal.s16 q9,  d26, d5[1]
203        vmlal.s16 q10, d26, d5[2]
204
205        vmlal.s16 q8,  d28, d6[0]
206        vmlal.s16 q9,  d28, d6[1]
207        vmlal.s16 q10, d28, d6[2]
208
209        vshrn.i32 d24, q8, #8
210        vshrn.i32 d26, q9, #8
211        vshrn.i32 d28, q10, #8
212
213        vqmovun.s16 d0, q12
214        vqmovun.s16 d1, q13
215        vqmovun.s16 d2, q14
216
217        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
218        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
219        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
220        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
221
222        subs r3, r3, #1
223        bne 1b
224
225        vpop            {q4-q7}
226        ldmfd           sp!, {r4, lr}
227        bx              lr
228END(rsdIntrinsicColorMatrix3x3_K)
229
230/*
231        r0 = dst
232        r1 = src
233        r2 = matrix
234        r3 = length
235*/
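/*
        The "dot" variant applies a single matrix row to R, G and B and broadcasts
        the result to all three color channels, leaving alpha untouched. A hedged C
        sketch (names illustrative, same Q8 coefficient layout as the 4x4 version):

        static void colorMatrixDot_ref(uint8_t out[4], const uint8_t in[4],
                                       const int16_t m[16]) {
            int32_t v = (in[0] * m[0] + in[1] * m[4] + in[2] * m[8]) >> 8;
            uint8_t g = v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v);
            out[0] = out[1] = out[2] = g;
            out[3] = in[3];                            // alpha is passed through
        }
*/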
236ENTRY(rsdIntrinsicColorMatrixDot_K)
237        stmfd           sp!, {r4, lr}
238        vpush           {q4-q7}
239
240        vld1.16 {q2}, [r2]!
241        vld1.16 {q3}, [r2]!
242
2431:
244        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
245        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
246        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
247        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
248
249        vmovl.u8 q12, d0
250        vmovl.u8 q13, d1
251        vmovl.u8 q14, d2
252
253        vmull.s16 q8,  d24, d4[0]
254        vmlal.s16 q8,  d26, d5[0]
255        vmlal.s16 q8,  d28, d6[0]
256        vshrn.i32 d24, q8, #8
257        vqmovun.s16 d0, q12
258        vmov.u8 d1, d0
259        vmov.u8 d2, d0
260
261        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
262        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
263        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
264        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
265
266        subs r3, r3, #1
267        bne 1b
268
269        vpop            {q4-q7}
270        ldmfd           sp!, {r4, lr}
271        bx              lr
272END(rsdIntrinsicColorMatrixDot_K)
273
274
275/*
276static void OneVF(float4 *out, const uchar *ptrIn, int iStride,
277                  const float* gPtr, int iradius, int x1, int x2)
278
279    r0 = out
280    r1 = pin
281    r2 = stride
282    r3 = gptr
283    r4 = sp, ct
284    r5 = sp+4, x1
285    r6 = sp+8, x2
286*/
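/*
        A scalar C sketch of this vertical pass (the NEON loop below handles two
        pixels per iteration); the names are illustrative and this is not the
        RenderScript reference implementation:

        static void blurVF_ref(float *out,             // one float4 per output pixel
                               const uint8_t *ptrIn, int iStride,
                               const float *gPtr, int ct, int x1, int x2) {
            for (int x = x1; x < x2; x++) {
                float sum[4] = {0, 0, 0, 0};
                const uint8_t *pi = ptrIn + x * 4;
                for (int r = 0; r < ct; r++) {
                    for (int c = 0; c < 4; c++)
                        sum[c] += pi[c] * gPtr[r];     // weight one source row
                    pi += iStride;                     // step down to the next row
                }
                for (int c = 0; c < 4; c++)
                    *out++ = sum[c];
            }
        }
*/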
287ENTRY(rsdIntrinsicBlurVFU4_K)
288        push            {r4-r8, r10, r11, lr}
289        vpush           {q4-q7}
290
291        ldr r4, [sp, #32+64]
292        ldr r5, [sp, #32+64 + 4]
293        ldr r6, [sp, #32+64 + 8]
294
2951:
296        veor q10, q10, q10         /* float4 blurredPixel1 = 0; */
297        veor q11, q11, q11         /* float4 blurredPixel2 = 0; */
298        add r7, r1, r5, lsl #2  /* const uchar *pi = ptrIn + x1 * 4; */
299        mov r10, r3
300
301        mov r11, r4
302
3032:
304        vld1.32 {d2}, [r7]
305        vmovl.u8 q1, d2
306        vmovl.u16 q3, d2
307        vmovl.u16 q4, d3
308        vcvt.f32.s32 q3, q3
309        vcvt.f32.s32 q4, q4
310        vld1.32 {d0[0]}, [r10]!
311        add r7, r7, r2
312        vmla.f32 q10, q3, d0[0]
313        vmla.f32 q11, q4, d0[0]
314        subs r11, r11, #1
315        bne 2b
316
317        vst1.32 {q10}, [r0]!
318        vst1.32 {q11}, [r0]!
319        add r5, r5, #2
320        cmp r5, r6
321        bne 1b
322
323
324        vpop            {q4-q7}
325        pop             {r4-r8, r10, r11, lr}
326        bx              lr
327END(rsdIntrinsicBlurVFU4_K)
328
329/*
330Horizontal blur pass: consumes the float4 intermediate rows produced by the
331vertical pass above and writes packed uchar4 pixels.
332
333    r0 = out
334    r1 = pin
335    r2 = gptr
336    r3 = ct
337    r4 = sp, x1
338    r5 = sp+4, x2
339*/
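/*
        A scalar C sketch of the horizontal pass, which reads the float4
        intermediate produced above and emits one uchar4 pixel per x; the
        truncating float-to-int conversion mirrors the vcvt.s32.f32 below
        (names illustrative):

        static void blurHF4_ref(uint8_t *out, const float *ptrIn,   // float4 elements
                                const float *gPtr, int ct, int x1, int x2) {
            for (int x = x1; x < x2; x++) {
                const float *pi = ptrIn + x * 4;
                float sum[4] = {0, 0, 0, 0};
                for (int r = 0; r < ct; r++)
                    for (int c = 0; c < 4; c++)
                        sum[c] += pi[r * 4 + c] * gPtr[r];
                for (int c = 0; c < 4; c++)
                    *out++ = (uint8_t)(int32_t)sum[c]; // weights assumed to sum to <= 1
            }
        }
*/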
340ENTRY(rsdIntrinsicBlurHFU4_K)
341        push            {r4-r8, r10, r11, lr}
342        vpush           {q4-q7}
343
344        ldr r4, [sp, #32+64]
345        ldr r5, [sp, #32+64 + 4]
346
3471:
348        add r7, r1, r4, lsl #4  /* const float4 *pi = ptrIn + x1; (16 bytes per element) */
349        mov r10, r2
350        mov r11, r3
351
352        vld1.32 {q1}, [r7]!
353        vld1.32 {d6[0]}, [r10]!
354        vmul.f32 q0, q1, d6[0]
355        sub r11, r11, #1
356
3572:
358        vld1.32 {q1}, [r7]!
359        vld1.32 {q2}, [r7]!
360        vld1.32 {d6}, [r10]!
361        vmla.f32 q0, q1, d6[0]
362        vmla.f32 q0, q2, d6[1]
363        subs r11, r11, #2
364        bne 2b
365
366        vcvt.s32.f32 q0, q0
367        vmovn.u32 d0, q0
368        vmovn.u16 d0, q0
369
370        vst1.32 {d0[0]}, [r0]!
371        add r4, r4, #1
372        cmp r4, r5
373        bne 1b
374
375        vpop            {q4-q7}
376        pop             {r4-r8, r10, r11, lr}
377        bx              lr
378END(rsdIntrinsicBlurHFU4_K)
379
380ENTRY(rsdIntrinsicBlurHFU1_K)
381        push            {r4-r8, r10, r11, lr}
382        vpush           {q4-q7}
383
384        ldr r4, [sp, #32+64]
385        ldr r5, [sp, #32+64 + 4]
386
3871:
388        add r7, r1, r4, lsl #2  /* const float *pi = ptrIn + x1; (4 bytes per element) */
389        mov r10, r2
390        mov r11, r3
391
392        veor q0, q0
393
3942:
395        vld1.32 {q1}, [r7]
396        add r7, r7, #4
397        vld1.32 {d4[0]}, [r10]!
398        vmla.f32 q0, q1, d4[0]
399        subs r11, r11, #1
400        bne 2b
401
402        vcvt.s32.f32 q0, q0
403        vmovn.u32 d0, q0
404        vmovn.u16 d0, q0
405
406        vst1.32 {d0[0]}, [r0]!
407        add r4, r4, #4
408        cmp r4, r5
409        bne 1b
410
411        vpop            {q4-q7}
412        pop             {r4-r8, r10, r11, lr}
413        bx              lr
414END(rsdIntrinsicBlurHFU1_K)
415
416/*
417    Function called with the following arguments: dst, Y, vu, len, YuvCoeff
418        r0 = dst
419        r1 = Y
420        r2 = VU
421        r3 = length (pixels / 8)
422        ---- Args below will be in the stack ----
423        sp = YuvCoeff
424
425        This function converts 8 pixels per iteration
426*/
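/*
        A per-pixel C sketch using the fixed-point coefficients named in the
        comments below (298, 409, -208, -100, 516, with offsets 16 and 128); the
        rounding matches the vrshrn #8 used by the kernel, and the function name
        is illustrative:

        static void yuvToRgba_ref(uint8_t out[4], uint8_t y, uint8_t u, uint8_t v) {
            int32_t yy = (y - 16) * 298;
            int32_t r = (yy + (v - 128) * 409                     + 128) >> 8;
            int32_t g = (yy + (v - 128) * -208 + (u - 128) * -100 + 128) >> 8;
            int32_t b = (yy + (u - 128) * 516                     + 128) >> 8;
            out[0] = r < 0 ? 0 : (r > 255 ? 255 : (uint8_t)r);
            out[1] = g < 0 ? 0 : (g > 255 ? 255 : (uint8_t)g);
            out[2] = b < 0 ? 0 : (b > 255 ? 255 : (uint8_t)b);
            out[3] = 255;                              // alpha comes from the coeff table
        }
*/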
427ENTRY(rsdIntrinsicYuv_K)
428        push        {r4, r5, lr}            @ preserve clobbered int registers
429        vpush       {Q4-Q7}                 @ preserve Vregisters we clobber
430
431        mov  r5, #16                        @ Integer 16 in r5; used as an incrementing value
432
433        ldr         r4, [sp, #64+12]        @ load the coeffs address in memory in r4 (16*4 + 4*3)
434        vld1.16     {Q2}, [r4]!             @ load the multipliers from the coeffs matrix (r4) in Q2
435        vld1.8      {d6[]}, [r4], r5        @ load y offset 16 from the coeffs matrix (r4) in d6
436        vld1.8      {d8[]}, [r4], r5        @ load V and U offset of 128 from the coeffs matrix (r4) in d8
437
438        mov         r4, #8                  @ Integer 8 in r4; used as an incrementing value
439
440        vdup.8      d3, d5[1]               @ d3 = 255 (alpha) from the multipliers line in
441                                            @ the coeffs matrix (Q2)
442
443        1:
444        vld1.8      {d10}, [r1]!            @ get Y (r1->Y)
445        vld2.8      {d12, d14}, [r2], r4    @ split V from U (r2 -> VU) and increase pointer by 8 (in r4)
446        pld         [r1, #64]               @ preloading data from address y(r1) + 64 for subsequent loops
447        pld         [r2, #64]               @ preloading data from address vu(r2) + 64 for subsequent loops
448
449        vsubl.u8    Q5, d10, d6             @ Y to 16 bit - 16 (in 16bit) (n to n+7)
450        vmull.s16   Q8, d10, d4[0]          @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
451        vmull.s16   Q11, d11, d4[0]         @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)
452
453        vsubl.u8    Q5, d12, d8             @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3)
454        vsubl.u8    Q6, d14, d8             @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3)
455        vmov.u16    d11, d10                @ Copying V to d11
456        vmov.u16    d13, d12                @ Copying U to d13
457        vzip.u16    d10, d11                @ Q5 = V(n, n, n+1, n+1) V(n+2, n+2, n+3, n+3)
458        vzip.u16    d12, d13                @ Q6 = U(n, n, n+1, n+1) U(n+2, n+2, n+3, n+3)
459
460
461        vmov        Q9, Q8                  @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
462        vmov        Q10, Q8                 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
463        vmov        Q12, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q12
464        vmov        Q13, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q13
465
466                                            @                  R    G    B
467                                            @     Pixel(0-3)  Q8,  Q9, Q10
468                                            @     Pixel(4-7) Q11, Q12, Q13
469                                            @
470
471                                            @ Pixel(0-3)
472        vmlal.s16   Q8,  d10, d4[1]         @ R : Q8  = Q8(Y-16)  + (V-128) * 409
473        vmlal.s16   Q9,  d10, d5[0]         @ G : Q9  = Q9(Y-16)  + (V-128) * (-208)
474        vmlal.s16   Q9,  d12, d4[2]         @                     + (U-128) * (-100)
475        vmlal.s16   Q10, d12, d4[3]         @ B : Q10 = Q10(Y-16) + (U-128) * 516
476
477                                            @ Pixel(4-7)
478        vmlal.s16   Q11, d11, d4[1]         @ R : Q11 = Q11(Y-16) + (V-128) * 409
479        vmlal.s16   Q12, d11, d5[0]         @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
480        vmlal.s16   Q12, d13, d4[2]         @                     + (U-128) * (-100)
481        vmlal.s16   Q13, d13, d4[3]         @ B : Q13 = Q13(Y-16) + (U-128) * 516
482
483                                            @ Pixel(0-3)
484        vrshrn.i32  d16, Q8, #8             @ d16 : R rounded, shifted right by 8 and narrowed to 16 bit
485        vrshrn.i32  d18, Q9, #8             @ d18 : G rounded, shifted right by 8 and narrowed to 16 bit
486        vrshrn.i32  d20, Q10, #8            @ d20 : B rounded, shifted right by 8 and narrowed to 16 bit
487
488                                            @ Pixel(4-7)
489        vrshrn.i32  d17, Q11, #8            @ d17 : R rounded, shifted right by 8 and narrowed to 16 bit
490        vrshrn.i32  d19, Q12, #8            @ d19 : G rounded, shifted right by 8 and narrowed to 16 bit
491        vrshrn.i32  d21, Q13, #8            @ d21 : B rounded, shifted right by 8 and narrowed to 16 bit
492
493        vqmovun.s16 d0, Q8                  @ r = d0 (saturated, unsigned and narrowed to 8bit)
494        vqmovun.s16 d1, Q9                  @ g = d1 (saturated, unsigned and narrowed to 8bit)
495        vqmovun.s16 d2, Q10                 @ b = d2 (saturated, unsigned and narrowed to 8bit)
496
497        subs        r3, r3, #1              @ Checking length (r3)
498        vst4.8      {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0)
499
500        bne 1b                              @ if not done with length, loop
501
502        vpop        {Q4-Q7}                 @ Restore Vregisters
503        pop         {r4, r5, lr}            @ Restore int registers
504        bx          lr
505END(rsdIntrinsicYuv_K)
506
507/*
508    Function called with the following arguments: dst, Y, v, u, len, YuvCoeff
509        r0 = dst
510        r1 = Y
511        r2 = V,
512        r3 = U
513        ---- Args below will be in the stack ----
514        sp = length (pixels / 8)
515        sp+4 = YuvCoeff
516
517        This function converts 8 pixels per iteration
518*/
519ENTRY(rsdIntrinsicYuv2_K)
520        push        {r4, r5, r6, lr}        @ preserve clobbered int registers
521        vpush       {Q4-Q7}                 @ preserve Vregisters we clobber
522
523        mov  r5, #16                        @ Integer 16 in r5; used as an incrementing value
524
525        ldr         r4, [sp, #64+16+4]      @ load the coeffs address in memory in r4 (16*4 + 4*4 + 4)
526        ldr         r6, [sp, #64+16]        @ load the length in r6 (16*4 + 4*4)
527        vld1.16     {Q2}, [r4]!             @ load the multipliers from the coeffs matrix (r4) in Q2
528        vld1.8      {d6[]}, [r4], r5        @ load y offset 16 from the coeffs matrix (r4) in d6
529        vld1.8      {d8[]}, [r4], r5        @ load V and U offset of 128 from the coeffs matrix (r4) in d8
530
531        mov         r4, #4                  @ Integer 4 in r4; used as an incrementing value
532
533        vdup.8      d3, d5[1]               @ d3 = 255 (alpha) from the multipliers line in
534                                            @ the coeffs matrix (Q2)
535
536        1:
537        vld1.8      {d10}, [r1]!            @ get Y (r1->Y)
538        vld1.8      {d12}, [r3], r4         @ load 8 chroma bytes from r3 and post-increment the pointer by 4 (r4)
539        vld1.8      {d14}, [r2], r4         @ load 8 chroma bytes from r2 and post-increment the pointer by 4 (r4)
540        pld         [r1, #64]               @ preloading data from address y(r1) + 64 for subsequent loops
541        pld         [r2, #64]               @ preloading data from address vu(r2) + 64 for subsequent loops
542
543        vsubl.u8    Q5, d10, d6             @ Y to 16 bit - 16 (in 16bit) (n to n+7)
544        vmull.s16   Q8, d10, d4[0]          @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
545        vmull.s16   Q11, d11, d4[0]         @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)
546
547        vsubl.u8    Q5, d12, d8             @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3)
548        vsubl.u8    Q6, d14, d8             @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3)
549        vmov.u16    d11, d10                @ Copying V to d11
550        vmov.u16    d13, d12                @ Copying U to d13
551        vzip.u16    d10, d11                @ Q5 = V(n, n, n+1, n+1) V(n+2, n+2, n+3, n+3)
552        vzip.u16    d12, d13                @ Q6 = U(n, n, n+1, n+1) U(n+2, n+2, n+3, n+3)
553
554
555        vmov        Q9, Q8                  @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
556        vmov        Q10, Q8                 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
557        vmov        Q12, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q12
558        vmov        Q13, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q13
559
560                                            @                  R    G    B
561                                            @     Pixel(0-3)  Q8,  Q9, Q10
562                                            @     Pixel(4-7) Q11, Q12, Q13
563                                            @
564
565                                            @ Pixel(0-3)
566        vmlal.s16   Q8,  d10, d4[1]         @ R : Q8  = Q8(Y-16)  + (V-128) * 409
567        vmlal.s16   Q9,  d10, d5[0]         @ G : Q9  = Q9(Y-16)  + (V-128) * (-208)
568        vmlal.s16   Q9,  d12, d4[2]         @                     + (U-128) * (-100)
569        vmlal.s16   Q10, d12, d4[3]         @ B : Q10 = Q10(Y-16) + (U-128) * 516
570
571                                            @ Pixel(4-7)
572        vmlal.s16   Q11, d11, d4[1]         @ R : Q11 = Q11(Y-16) + (V-128) * 409
573        vmlal.s16   Q12, d11, d5[0]         @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
574        vmlal.s16   Q12, d13, d4[2]         @                     + (U-128) * (-100)
575        vmlal.s16   Q13, d13, d4[3]         @ B : Q13 = Q13(Y-16) + (U-128) * 516
576
577                                            @ Pixel(0-3)
578        vrshrn.i32  d16, Q8, #8             @ d16 : R rounded, shifted right by 8 and narrowed to 16 bit
579        vrshrn.i32  d18, Q9, #8             @ d18 : G rounded, shifted right by 8 and narrowed to 16 bit
580        vrshrn.i32  d20, Q10, #8            @ d20 : B rounded, shifted right by 8 and narrowed to 16 bit
581
582                                            @ Pixel(4-7)
583        vrshrn.i32  d17, Q11, #8            @ d17 : R rounded, shifted right by 8 and narrowed to 16 bit
584        vrshrn.i32  d19, Q12, #8            @ d19 : G rounded, shifted right by 8 and narrowed to 16 bit
585        vrshrn.i32  d21, Q13, #8            @ d21 : B rounded, shifted right by 8 and narrowed to 16 bit
586
587        vqmovun.s16 d0, Q8                  @ r = d0 (saturated, unsigned and narrowed to 8bit)
588        vqmovun.s16 d1, Q9                  @ g = d1 (saturated, unsigned and narrowed to 8bit)
589        vqmovun.s16 d2, Q10                 @ b = d2 (saturated, unsigned and narrowed to 8bit)
590
591        subs        r6, r6, #1              @ Checking length (r6)
592        vst4.8      {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0)
593
594        bne 1b                              @ if not done with length, loop
595
596        vpop        {Q4-Q7}                 @ Restore Vregisters
597        pop         {r4, r5, r6, lr}        @ Restore int registers
598        bx          lr
599END(rsdIntrinsicYuv2_K)
600
601/* Convolve 5x5 */
602
603/*
604        r0 = dst
605        r1 = y0 base pointer
606        r2 = y1 base pointer
607        r3 = y2 base pointer
608        r4 = y3 base pointer
609        r5 = y4 base pointer
610        r6 = coeffs
611        r7 = length
612*/
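/*
        A C sketch of one output pixel (the NEON loop below produces two per
        iteration); coefficients are Q8 fixed point, and the 0x7f bias plus the
        rounding shift mirror the vadd/vrshrn pair near the end of the loop.
        Names are illustrative:

        static void convolve5x5_ref(uint8_t *out, const uint8_t *rows[5],
                                    const int16_t coeff[25], int x) {
            for (int c = 0; c < 4; c++) {
                int32_t sum = 0;
                for (int ry = 0; ry < 5; ry++)
                    for (int rx = 0; rx < 5; rx++)
                        sum += rows[ry][(x + rx) * 4 + c] * coeff[ry * 5 + rx];
                sum = (sum + 0x7f + 0x80) >> 8;        // +0x7f bias, then rounding shift
                out[x * 4 + c] = sum < 0 ? 0 : (sum > 255 ? 255 : (uint8_t)sum);
            }
        }
*/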
613ENTRY(rsdIntrinsicConvolve5x5_K)
614        push        {r4-r7, lr}
615        vpush       {q4-q7}
616
617        /* load y3 in r4 */
618        ldr     r4, [sp, #20 + 64]
619
620        /* load y4 in r5 */
621        ldr     r5, [sp, #24 + 64]
622
623        /* Load the coefficients pointer */
624        ldr     r6, [sp, #28 + 64]
625
626        /* Create the coefficients vector */
627        vld1.16     {d0, d1, d2, d3}, [r6]!
628        vld1.16     {d4, d5, d6}, [r6]
629
630        vmov.u32  q15, #0x7f
631
632        /* load the count */
633        ldr     r6, [sp, #32 + 64]
634
635        /* Load the frequently used immediate in a register */
636        mov     r7, #8
637
6381:
639        /* Load the next 24 bytes from the y0/y1 rows and post-increment the pointers by r7=#8 */
640        vld1.8  {d24, d25, d26}, [r1], r7      @  y0 ( y - 2 )
641        vld1.8  {d27, d28, d29}, [r2], r7      @  y1 ( y - 1 )
642
643        /* Prefetch data that will be needed two iterations from now */
644        PLD         (r1, r7)
645        PLD         (r2, r7)
646
647        /* Promoting the 8bit channels to 16bit */
648        vmovl.u8 q9,  d24
649        vmovl.u8 q10, d25
650        vmovl.u8 q11, d26
651        vmovl.u8 q12, d27
652        vmovl.u8 q13, d28
653        vmovl.u8 q14, d29
654
655/*
656        d18,  d19,  d20, d21, d22, d23,
657        d24,  d25
658*/
659        vmull.s16 q4, d18, d0[0]
660        vmlal.s16 q4, d19, d0[1]
661        vmlal.s16 q4, d20, d0[2]
662        vmlal.s16 q4, d21, d0[3]
663        vmlal.s16 q4, d22, d1[0]
664
665        vmlal.s16 q4, d24, d1[1]
666        vmlal.s16 q4, d25, d1[2]
667        vmlal.s16 q4, d26, d1[3]
668        vmlal.s16 q4, d27, d2[0]
669        vmlal.s16 q4, d28, d2[1]
670
671        vmull.s16 q5, d19, d0[0]
672        vmlal.s16 q5, d20, d0[1]
673        vmlal.s16 q5, d21, d0[2]
674        vmlal.s16 q5, d22, d0[3]
675        vmlal.s16 q5, d23, d1[0]
676
677        vmlal.s16 q5, d25, d1[1]
678        vmlal.s16 q5, d26, d1[2]
679        vmlal.s16 q5, d27, d1[3]
680        vmlal.s16 q5, d28, d2[0]
681        vmlal.s16 q5, d29, d2[1]
682
683
684        /* Next 2 rows */
685        /* Load the next 24 bytes from the y2/y3 rows and post-increment the pointers by r7=#8 */
686        vld1.8  {d24, d25, d26}, [r3], r7      @  y2 ( y )
687        vld1.8  {d27, d28, d29}, [r4], r7      @  y3 ( y + 1 )
688
689        /* Prefetch data that will be needed two iterations from now */
690        PLD         (r3, r7)
691        PLD         (r4, r7)
692
693        /* Promoting the 8bit channels to 16bit */
694        vmovl.u8 q9,  d24
695        vmovl.u8 q10, d25
696        vmovl.u8 q11, d26
697        vmovl.u8 q12, d27
698        vmovl.u8 q13, d28
699        vmovl.u8 q14, d29
700
701/*
702        d18,  d19,  d20, d21, d22, d23,
703        d24,  d25
704*/
705        vmlal.s16 q4, d18, d2[2]
706        vmlal.s16 q4, d19, d2[3]
707        vmlal.s16 q4, d20, d3[0]
708        vmlal.s16 q4, d21, d3[1]
709        vmlal.s16 q4, d22, d3[2]
710
711        vmlal.s16 q4, d24, d3[3]
712        vmlal.s16 q4, d25, d4[0]
713        vmlal.s16 q4, d26, d4[1]
714        vmlal.s16 q4, d27, d4[2]
715        vmlal.s16 q4, d28, d4[3]
716
717        vmlal.s16 q5, d19, d2[2]
718        vmlal.s16 q5, d20, d2[3]
719        vmlal.s16 q5, d21, d3[0]
720        vmlal.s16 q5, d22, d3[1]
721        vmlal.s16 q5, d23, d3[2]
722
723        vmlal.s16 q5, d25, d3[3]
724        vmlal.s16 q5, d26, d4[0]
725        vmlal.s16 q5, d27, d4[1]
726        vmlal.s16 q5, d28, d4[2]
727        vmlal.s16 q5, d29, d4[3]
728
729        /* Last row */
730        /* Load the next 24 bytes from the y4 row and post-increment the pointer by r7=#8 */
731        vld1.8  {d24, d25, d26}, [r5], r7      @  y4 ( y + 2 )
732
733        /* Prefetch data that will be needed two iterations from now */
734        PLD         (r5, r7)
735
736        /* Promoting the 8bit channels to 16bit */
737        vmovl.u8 q9,  d24
738        vmovl.u8 q10, d25
739        vmovl.u8 q11, d26
740
741/*
742        d18,  d19,  d20, d21, d22, d23,
743        d24,  d25
744*/
745
746        vmlal.s16 q4, d18, d5[0]
747        vmlal.s16 q4, d19, d5[1]
748        vmlal.s16 q4, d20, d5[2]
749        vmlal.s16 q4, d21, d5[3]
750        vmlal.s16 q4, d22, d6[0]
751
752        vmlal.s16 q5, d19, d5[0]
753        vmlal.s16 q5, d20, d5[1]
754        vmlal.s16 q5, d21, d5[2]
755        vmlal.s16 q5, d22, d5[3]
756        vmlal.s16 q5, d23, d6[0]
757
758
759
760        vadd.i32 q4, q4, q15
761        vadd.i32 q5, q5, q15
762
763/*      Narrow it to a d-reg 32 -> 16 bit */
764        vrshrn.i32 d8, q4, #8
765        vrshrn.i32 d9, q5, #8
766
767
768/*      Pack 16 -> 8 bit, saturate, put two pixels into D reg */
769        vqmovun.s16 d8, q4
770
771        vst1.8 d8, [r0]!           @ store the output and post-increment r0
772
773        /* Are we done? */
774        subs r6, r6, #1
775        bne 1b
776
777        /* Yup, bye */
778        vpop        {q4-q7}
779        pop         {r4-r7, lr}
780        bx          lr
781
782END(rsdIntrinsicConvolve5x5_K)
783
784
785
786
787/*
788        dst = src + dst * (1.0 - src.a)
789
790        r0 = dst
791        r1 = src
792        r2 = length
793*/
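/*
        The integer approximation used per channel: the source is scaled by 256
        (vshll #8), the destination by (255 - src.a), and the sum is shifted back
        down by 8. A hedged C sketch assuming premultiplied alpha, as the blend
        equation above does (names illustrative; the NEON loop below handles
        eight pixels per iteration):

        static void blendSrcOver_ref(uint8_t *dst, const uint8_t *src, int pixels) {
            for (int i = 0; i < pixels; i++, dst += 4, src += 4) {
                uint32_t isa = 255 - src[3];           // (1 - src.a) in 8-bit
                for (int c = 0; c < 4; c++)
                    dst[c] = (uint8_t)((((uint32_t)src[c] << 8) + dst[c] * isa) >> 8);
            }
        }
*/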
794ENTRY(rsdIntrinsicBlendSrcOver_K)
795        .save           {r4, lr}
796        stmfd           sp!, {r4, lr}
797        vpush           {q4-q7}
798
799        mov r4, #255
800        vdup.16 q7, r4
801
802        mov r4, r0
8031:
804
805        /* src */
806        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
807        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
808        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
809        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
810        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
811        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
812        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
813        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
814        vshll.u8 q12, d0, #8
815        vshll.u8 q13, d1, #8
816        vshll.u8 q14, d2, #8
817        vmovl.u8 q6, d3
818        vsub.i16 q6, q7, q6        // q6 = 1 - src.a
819        vshll.u8 q15, d3, #8
820
821        /* dst */
822        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
823        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
824        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
825        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
826        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
827        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
828        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
829        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
830        vmovl.u8 q8, d0
831        vmovl.u8 q9, d1
832        vmovl.u8 q10, d2
833        vmovl.u8 q11, d3
834
835        vmla.i16 q12, q8, q6
836        vmla.i16 q13, q9, q6
837        vmla.i16 q14, q10, q6
838        vmla.i16 q15, q11, q6
839
840        vshrn.i16 d0, q12, #8
841        vshrn.i16 d1, q13, #8
842        vshrn.i16 d2, q14, #8
843        vshrn.i16 d3, q15, #8
844        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
845        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
846        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
847        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
848        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
849        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
850        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
851        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
852
853        subs r2, r2, #1
854        bne 1b
855
856        vpop            {q4-q7}
857        ldmfd           sp!, {r4, lr}
858        bx              lr
859END(rsdIntrinsicBlendSrcOver_K)
860
861/*
862        dst = dst + src * (1.0 - dst.a)
863
864        r0 = dst
865        r1 = src
866        r2 = length
867*/
868ENTRY(rsdIntrinsicBlendDstOver_K)
869        .save           {r4, lr}
870        stmfd           sp!, {r4, lr}
871        vpush           {q4-q7}
872
873        mov r4, #255
874        vdup.16 q7, r4
875
876        mov r4, r0
8771:
878
879        /* src */
880        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
881        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
882        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
883        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
884        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
885        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
886        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
887        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
888        vmovl.u8 q12, d0
889        vmovl.u8 q13, d1
890        vmovl.u8 q14, d2
891        vmovl.u8 q15, d3
892
893        /* dst */
894        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
895        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
896        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
897        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
898        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
899        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
900        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
901        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
902        vshll.u8 q8, d0, #8
903        vshll.u8 q9, d1, #8
904        vshll.u8 q10, d2, #8
905        vmovl.u8 q6, d3
906        vsub.i16 q6, q7, q6        // q6 = 1 - dst.a
907        vshll.u8 q11, d3, #8
908
909
910        vmla.i16 q8, q12, q6
911        vmla.i16 q9, q13, q6
912        vmla.i16 q10, q14, q6
913        vmla.i16 q11, q15, q6
914
915        vshrn.i16 d0, q8, #8
916        vshrn.i16 d1, q9, #8
917        vshrn.i16 d2, q10, #8
918        vshrn.i16 d3, q11, #8
919        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
920        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
921        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
922        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
923        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
924        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
925        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
926        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
927
928        subs r2, r2, #1
929        bne 1b
930
931        vpop            {q4-q7}
932        ldmfd           sp!, {r4, lr}
933        bx              lr
934END(rsdIntrinsicBlendDstOver_K)
935
936/*
937        dst = src * dst.a
938
939        r0 = dst
940        r1 = src
941        r2 = length
942*/
943ENTRY(rsdIntrinsicBlendSrcIn_K)
944        .save           {r4, lr}
945        stmfd           sp!, {r4, lr}
946        vpush           {q4-q7}
947
948        mov r4, r0
9491:
950
951        /* src */
952        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
953        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
954        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
955        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
956        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
957        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
958        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
959        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
960        vmovl.u8 q12, d0
961        vmovl.u8 q13, d1
962        vmovl.u8 q14, d2
963        vmovl.u8 q15, d3
964
965        /* dst */
966        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
967        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
968        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
969        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
970        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
971        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
972        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
973        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
974        //vmovl.u8 q8, d0
975        //vmovl.u8 q9, d1
976        //vmovl.u8 q10, d2
977        vmovl.u8 q11, d3
978
979        vmul.i16 q12, q12, q11
980        vmul.i16 q13, q13, q11
981        vmul.i16 q14, q14, q11
982        vmul.i16 q15, q15, q11
983
984        vshrn.i16 d0, q12, #8
985        vshrn.i16 d1, q13, #8
986        vshrn.i16 d2, q14, #8
987        vshrn.i16 d3, q15, #8
988        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
989        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
990        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
991        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
992        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
993        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
994        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
995        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
996
997        subs r2, r2, #1
998        bne 1b
999
1000        vpop            {q4-q7}
1001        ldmfd           sp!, {r4, lr}
1002        bx              lr
1003END(rsdIntrinsicBlendSrcIn_K)
1004
1005/*
1006        dst = dst * src.a
1007
1008        r0 = dst
1009        r1 = src
1010        r2 = length
1011*/
1012ENTRY(rsdIntrinsicBlendDstIn_K)
1013        .save           {r4, lr}
1014        stmfd           sp!, {r4, lr}
1015        vpush           {q4-q7}
1016
1017        mov r4, r0
10181:
1019
1020        /* src */
1021        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
1022        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
1023        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
1024        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
1025        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
1026        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
1027        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
1028        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
1029        //vmovl.u8 q12, d0
1030        //vmovl.u8 q13, d1
1031        //vmovl.u8 q14, d2
1032        vmovl.u8 q15, d3
1033
1034        /* dst */
1035        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
1036        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
1037        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
1038        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
1039        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
1040        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
1041        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
1042        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
1043        vmovl.u8 q8, d0
1044        vmovl.u8 q9, d1
1045        vmovl.u8 q10, d2
1046        vmovl.u8 q11, d3
1047
1048        vmul.i16 q8, q8, q15
1049        vmul.i16 q9, q9, q15
1050        vmul.i16 q10, q10, q15
1051        vmul.i16 q11, q11, q15
1052
1053        vshrn.i16 d0, q8, #8
1054        vshrn.i16 d1, q9, #8
1055        vshrn.i16 d2, q10, #8
1056        vshrn.i16 d3, q11, #8
1057        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
1058        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
1059        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
1060        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
1061        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
1062        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
1063        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
1064        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
1065
1066        subs r2, r2, #1
1067        bne 1b
1068
1069        vpop            {q4-q7}
1070        ldmfd           sp!, {r4, lr}
1071        bx              lr
1072END(rsdIntrinsicBlendDstIn_K)
1073
1074
1075
1076/*
1077        dst = src * (1.0 - dst.a)
1078
1079        r0 = dst
1080        r1 = src
1081        r2 = length
1082*/
1083ENTRY(rsdIntrinsicBlendSrcOut_K)
1084        .save           {r4, lr}
1085        stmfd           sp!, {r4, lr}
1086        vpush           {q4-q7}
1087
1088        mov r4, #255
1089        vdup.16 q7, r4
1090
1091        mov r4, r0
10921:
1093
1094        /* src */
1095        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
1096        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
1097        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
1098        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
1099        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
1100        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
1101        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
1102        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
1103        vmovl.u8 q12, d0
1104        vmovl.u8 q13, d1
1105        vmovl.u8 q14, d2
1106        vmovl.u8 q15, d3
1107
1108        /* dst */
1109        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
1110        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
1111        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
1112        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
1113        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
1114        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
1115        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
1116        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
1117        //vmovl.u8 q8, d0
1118        //vmovl.u8 q9, d1
1119        //vmovl.u8 q10, d2
1120        vmovl.u8 q11, d3
1121
1122
1123        vsub.i16 q6, q7, q11        // q6 = 1 - dst.a
1124        vmul.i16 q12, q12, q6
1125        vmul.i16 q13, q13, q6
1126        vmul.i16 q14, q14, q6
1127        vmul.i16 q15, q15, q6
1128
1129        vshrn.i16 d0, q12, #8
1130        vshrn.i16 d1, q13, #8
1131        vshrn.i16 d2, q14, #8
1132        vshrn.i16 d3, q15, #8
1133        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
1134        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
1135        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
1136        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
1137        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
1138        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
1139        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
1140        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
1141
1142        subs r2, r2, #1
1143        bne 1b
1144
1145        vpop            {q4-q7}
1146        ldmfd           sp!, {r4, lr}
1147        bx              lr
1148END(rsdIntrinsicBlendSrcOut_K)
1149
1150
1151/*
1152        dst = dst * (1.0 - src.a)
1153
1154        r0 = dst
1155        r1 = src
1156        r2 = length
1157*/
1158ENTRY(rsdIntrinsicBlendDstOut_K)
1159        .save           {r4, lr}
1160        stmfd           sp!, {r4, lr}
1161        vpush           {q4-q7}
1162
1163        mov r4, #255
1164        vdup.16 q7, r4
1165
1166        mov r4, r0
11671:
1168
1169        /* src */
1170        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
1171        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
1172        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
1173        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
1174        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
1175        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
1176        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
1177        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
1178        //vmovl.u8 q12, d0
1179        //vmovl.u8 q13, d1
1180        //vmovl.u8 q14, d2
1181        vmovl.u8 q15, d3
1182
1183        /* dst */
1184        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
1185        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
1186        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
1187        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
1188        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
1189        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
1190        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
1191        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
1192        vmovl.u8 q8, d0
1193        vmovl.u8 q9, d1
1194        vmovl.u8 q10, d2
1195        vmovl.u8 q11, d3
1196
1197
1198        vsub.i16 q6, q7, q15        // q6 = 1 - src.a
1199        vmul.i16 q12, q8, q6
1200        vmul.i16 q13, q9, q6
1201        vmul.i16 q14, q10, q6
1202        vmul.i16 q15, q11, q6
1203
1204        vshrn.i16 d0, q12, #8
1205        vshrn.i16 d1, q13, #8
1206        vshrn.i16 d2, q14, #8
1207        vshrn.i16 d3, q15, #8
1208        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
1209        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
1210        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
1211        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
1212        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
1213        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
1214        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
1215        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
1216
1217        subs r2, r2, #1
1218        bne 1b
1219
1220        vpop            {q4-q7}
1221        ldmfd           sp!, {r4, lr}
1222        bx              lr
1223END(rsdIntrinsicBlendDstOut_K)
1224
1225
1226/*
1227        dst.rgb = src.rgb * dst.a + (1.0 - src.a) * dst.rgb
1228        dst.a = dst.a
1229
1230        r0 = dst
1231        r1 = src
1232        r2 = length
1233*/
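/*
        Per channel this computes rgb = (src.rgb * dst.a + dst.rgb * (255 - src.a)) >> 8
        and leaves the destination alpha byte as it was loaded. A hedged C sketch
        (names illustrative, premultiplied alpha assumed):

        static void blendSrcAtop_ref(uint8_t *dst, const uint8_t *src, int pixels) {
            for (int i = 0; i < pixels; i++, dst += 4, src += 4) {
                uint32_t da = dst[3], isa = 255 - src[3];
                for (int c = 0; c < 3; c++)
                    dst[c] = (uint8_t)((src[c] * da + dst[c] * isa) >> 8);
                // dst[3] is left untouched
            }
        }
*/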
1234ENTRY(rsdIntrinsicBlendSrcAtop_K)
1235        .save           {r4, lr}
1236        stmfd           sp!, {r4, lr}
1237        vpush           {q4-q7}
1238
1239        mov r4, #255
1240        vdup.16 q7, r4
1241
1242        mov r4, r0
12431:
1244
1245        /* src */
1246        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
1247        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
1248        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
1249        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
1250        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
1251        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
1252        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
1253        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
1254        vmovl.u8 q12, d0
1255        vmovl.u8 q13, d1
1256        vmovl.u8 q14, d2
1257        vmovl.u8 q15, d3
1258
1259        /* dst */
1260        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
1261        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
1262        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
1263        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
1264        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
1265        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
1266        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
1267        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
1268        vmovl.u8 q8, d0
1269        vmovl.u8 q9, d1
1270        vmovl.u8 q10, d2
1271        vmovl.u8 q11, d3
1272
1273
1274        vsub.i16 q6, q7, q15        // q6 = 1 - src.a
1275        vmul.i16 q8, q8, q6
1276        vmul.i16 q9, q9, q6
1277        vmul.i16 q10, q10, q6
1278
1279        vmla.i16 q8, q12, q11
1280        vmla.i16 q9, q13, q11
1281        vmla.i16 q10, q14, q11
1282
1283
1284        vshrn.i16 d0, q8, #8
1285        vshrn.i16 d1, q9, #8
1286        vshrn.i16 d2, q10, #8
1287        //vshrn.i16 d3, q15, #8
1288        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
1289        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
1290        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
1291        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
1292        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
1293        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
1294        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
1295        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
1296
1297        subs r2, r2, #1
1298        bne 1b
1299
1300        vpop            {q4-q7}
1301        ldmfd           sp!, {r4, lr}
1302        bx              lr
1303END(rsdIntrinsicBlendSrcAtop_K)
1304
1305/*
1306        dst.rgb = dst.rgb * src.a + (1.0 - dst.a) * src.rgb
1307        dst.a = src.a
1308
1309        r0 = dst
1310        r1 = src
1311        r2 = length
1312*/
1313ENTRY(rsdIntrinsicBlendDstAtop_K)
1314        .save           {r4, lr}
1315        stmfd           sp!, {r4, lr}
1316        vpush           {q4-q7}
1317
1318        mov r4, #255
1319        vdup.16 q7, r4
1320
1321        mov r4, r0
13221:
1323
1324        /* src */
1325        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
1326        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
1327        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
1328        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
1329        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
1330        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
1331        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
1332        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
1333        vmovl.u8 q12, d0
1334        vmovl.u8 q13, d1
1335        vmovl.u8 q14, d2
1336        vmovl.u8 q15, d3
1337
1338        /* dst */
1339        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
1340        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
1341        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
1342        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
1343        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
1344        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
1345        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
1346        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
1347        vmovl.u8 q8, d0
1348        vmovl.u8 q9, d1
1349        vmovl.u8 q10, d2
1350        vmovl.u8 q11, d3
1351
1352
1353        vsub.i16 q6, q7, q11        // q6 = 1 - dst.a
1354        vmul.i16 q12, q12, q6
1355        vmul.i16 q13, q13, q6
1356        vmul.i16 q14, q14, q6
1357
1358        vmla.i16 q12, q8, q15
1359        vmla.i16 q13, q9, q15
1360        vmla.i16 q14, q10, q15
1361
1362
1363        vshrn.i16 d0, q12, #8
1364        vshrn.i16 d1, q13, #8
1365        vshrn.i16 d2, q14, #8
1366        //vshrn.i16 d3, q15, #8
1367        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
1368        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
1369        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
1370        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
1371        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
1372        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
1373        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
1374        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
1375
1376        subs r2, r2, #1
1377        bne 1b
1378
1379        vpop            {q4-q7}
1380        ldmfd           sp!, {r4, lr}
1381        bx              lr
1382END(rsdIntrinsicBlendDstAtop_K)
1383
1384/*
1385        dst = dst ^ src
1386
1387        r0 = dst
1388        r1 = src
1389        r2 = length
1390*/
1391ENTRY(rsdIntrinsicBlendXor_K)
1392        .save           {r4, lr}
1393        stmfd           sp!, {r4, lr}
1394        vpush           {q4-q7}
1395
1396        mov r4, #255
1397        vdup.16 q7, r4
1398
1399        mov r4, r0
14001:
1401
1402        /* src */
1403        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
1404        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
1405        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
1406        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
1407        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
1408        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
1409        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
1410        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
1411        vmov.u8 d4, d0
1412        vmov.u8 d5, d1
1413        vmov.u8 d6, d2
1414        vmov.u8 d7, d3
1415
1416        /* dst */
1417        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
1418        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
1419        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
1420        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
1421        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
1422        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
1423        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
1424        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
1425
1426        veor d0, d0, d4
1427        veor d1, d1, d5
1428        veor d2, d2, d6
1429        veor d3, d3, d7
1430
1431        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
1432        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
1433        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
1434        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
1435        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
1436        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
1437        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
1438        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
1439
1440        subs r2, r2, #1
1441        bne 1b
1442
1443        vpop            {q4-q7}
1444        ldmfd           sp!, {r4, lr}
1445        bx              lr
1446END(rsdIntrinsicBlendXor_K)
1447
1448/*
1449        dst = dst * src
1450
1451        r0 = dst
1452        r1 = src
1453        r2 = length
1454*/
1455ENTRY(rsdIntrinsicBlendMultiply_K)
1456        .save           {r4, lr}
1457        stmfd           sp!, {r4, lr}
1458        vpush           {q4-q7}
1459
1460        mov r4, #255
1461        vdup.16 q7, r4
1462
1463        mov r4, r0
14641:
1465
1466        /* src */
1467        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
1468        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
1469        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
1470        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
1471        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
1472        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
1473        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
1474        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
1475        vmovl.u8 q12, d0
1476        vmovl.u8 q13, d1
1477        vmovl.u8 q14, d2
1478        vmovl.u8 q15, d3
1479
1480        /* dst */
1481        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
1482        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
1483        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
1484        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
1485        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
1486        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
1487        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
1488        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
1489        vmovl.u8 q8, d0
1490        vmovl.u8 q9, d1
1491        vmovl.u8 q10, d2
1492        vmovl.u8 q11, d3
1493
1494
1495        vmul.i16 q8, q8, q12
1496        vmul.i16 q9, q9, q13
1497        vmul.i16 q10, q10, q14
1498        vmul.i16 q11, q11, q15
1499
1500        vshrn.i16 d0, q8, #8
1501        vshrn.i16 d1, q9, #8
1502        vshrn.i16 d2, q10, #8
1503        vshrn.i16 d3, q11, #8
1504        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
1505        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
1506        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
1507        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
1508        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
1509        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
1510        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
1511        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
1512
1513        subs r2, r2, #1
1514        bne 1b
1515
1516        vpop            {q4-q7}
1517        ldmfd           sp!, {r4, lr}
1518        bx              lr
1519END(rsdIntrinsicBlendMultiply_K)
1520
1521/*
1522        dst = min(src + dst, 1.0)
1523
1524        r0 = dst
1525        r1 = src
1526        r2 = length
1527*/
1528ENTRY(rsdIntrinsicBlendAdd_K)
1529        .save           {r4, lr}
1530        stmfd           sp!, {r4, lr}
1531        vpush           {q4-q7}
1532
1533        mov r4, #255
1534        vdup.16 q7, r4
1535
1536        mov r4, r0
15371:
1538
1539        /* src */
1540        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
1541        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
1542        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
1543        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
1544        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
1545        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
1546        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
1547        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
1548        vmovl.u8 q12, d0
1549        vmovl.u8 q13, d1
1550        vmovl.u8 q14, d2
1551        vmovl.u8 q15, d3
1552
1553        /* dst */
1554        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
1555        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
1556        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
1557        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
1558        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
1559        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
1560        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
1561        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
1562        vmovl.u8 q8, d0
1563        vmovl.u8 q9, d1
1564        vmovl.u8 q10, d2
1565        vmovl.u8 q11, d3
1566
1567
1568        vadd.i16 q8, q8, q12
1569        vadd.i16 q9, q9, q13
1570        vadd.i16 q10, q10, q14
1571        vadd.i16 q11, q11, q15
1572
1573        vqmovun.s16 d0, q8
1574        vqmovun.s16 d1, q9
1575        vqmovun.s16 d2, q10
1576        vqmovun.s16 d3, q11
1577        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
1578        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
1579        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
1580        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
1581        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
1582        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
1583        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
1584        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
1585
1586        subs r2, r2, #1
1587        bne 1b
1588
1589        vpop            {q4-q7}
1590        ldmfd           sp!, {r4, lr}
1591        bx              lr
1592END(rsdIntrinsicBlendAdd_K)
1593
1594
1595/*
1596        dst = max(dst - src, 0.0)
1597
1598        r0 = dst
1599        r1 = src
1600        r2 = length
1601*/
1602ENTRY(rsdIntrinsicBlendSub_K)
1603        .save           {r4, lr}
1604        stmfd           sp!, {r4, lr}
1605        vpush           {q4-q7}
1606
1607        mov r4, #255
1608        vdup.16 q7, r4
1609
1610        mov r4, r0
16111:
1612
1613        /* src */
1614        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
1615        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
1616        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
1617        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
1618        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
1619        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
1620        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
1621        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
1622        vmovl.u8 q12, d0
1623        vmovl.u8 q13, d1
1624        vmovl.u8 q14, d2
1625        vmovl.u8 q15, d3
1626
1627        /* dst */
1628        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
1629        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
1630        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
1631        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
1632        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
1633        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
1634        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
1635        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
1636        vmovl.u8 q8, d0
1637        vmovl.u8 q9, d1
1638        vmovl.u8 q10, d2
1639        vmovl.u8 q11, d3
1640
1641
1642        vsub.i16 q8, q8, q12
1643        vsub.i16 q9, q9, q13
1644        vsub.i16 q10, q10, q14
1645        vsub.i16 q11, q11, q15
1646
1647        vqmovun.s16 d0, q8
1648        vqmovun.s16 d1, q9
1649        vqmovun.s16 d2, q10
1650        vqmovun.s16 d3, q11
1651        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
1652        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
1653        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
1654        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
1655        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
1656        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
1657        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
1658        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
1659
1660        subs r2, r2, #1
1661        bne 1b
1662
1663        vpop            {q4-q7}
1664        ldmfd           sp!, {r4, lr}
1665        bx              lr
1666END(rsdIntrinsicBlendSub_K)
1667
1668
1669/* 3D LUT */
1670
1671/*
1672        r0 = dst
1673        r1 = src
1674        r2 = cube base pointer
1675        r3 = cube Y stride
1676        r4 = cube Z stride
1677        r5 = count
1678        r10 = *constants
1679
1680        d0  / q0  = weight 1 p1
1681        d1        = weight 2 p1
1682
1683        d2  / q1  = weight 1 p2
1684        d3        = weight 2 p2
1685
1686        d4  / q2  = src1
1687        d5        = src2
1688
1689        d6  / q3  = baseCoord
1690        d7        = baseCoord
1691
1692        d8  / q4  = coord1 p1
1693        d9        =
1694
1695        d10 / q5  = coord1 p2
1696        d11       =
1697
1698        d12 / q6  =
1699        d13       =
1700
1701        d14 / q7  =
1702        d15       =
1703
1704
1705        d16 / q8  = x0 y0 z0
1706        d17       = x1 y0 z0
1707        d18 / q9  = x0 y1 z0
1708        d19       = x1 y1 z0
1709        d20 / q10 = x0 y0 z1
1710        d21       = x1 y0 z1
1711        d22 / q11 = x0 y1 z1
1712        d23       = x1 y1 z1
1713
1714        d24 / q12 = alpha mask
1715        d25       = current pixel alpha
1716        d26 / q13 = 4, y stride
1717        d27       = z stride, 0
1718        d28 / q14 = 0x8000
1719        d29       = 0x7fff
1720        d30 / q15 = 0, 0, 0, 0xffff
1721
1722
1723        d31 = coordMult
1724*/
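/*
        What the kernel approximates, per pixel, is a trilinear lookup into an
        RGBA8888 cube. A floating-point C sketch of the idea (the code below uses
        16/15-bit fixed point; names, byte strides and the rounding here are
        illustrative):

        static void lut3d_ref(uint8_t out[4], const uint8_t in[4], const uint8_t *cube,
                              int ystride, int zstride, const float coordMult[3]) {
            float fx = in[0] * coordMult[0], fy = in[1] * coordMult[1], fz = in[2] * coordMult[2];
            int   x = (int)fx,  y = (int)fy,  z = (int)fz;
            float wx = fx - x,  wy = fy - y,  wz = fz - z;
            const uint8_t *p = cube + x * 4 + y * ystride + z * zstride;
            for (int c = 0; c < 3; c++) {
                float c00 = p[c]                     * (1 - wx) + p[4 + c]                     * wx;
                float c10 = p[ystride + c]           * (1 - wx) + p[ystride + 4 + c]           * wx;
                float c01 = p[zstride + c]           * (1 - wx) + p[zstride + 4 + c]           * wx;
                float c11 = p[ystride + zstride + c] * (1 - wx) + p[ystride + zstride + 4 + c] * wx;
                float c0  = c00 * (1 - wy) + c10 * wy;
                float c1  = c01 * (1 - wy) + c11 * wy;
                out[c] = (uint8_t)(c0 * (1 - wz) + c1 * wz + 0.5f);
            }
            out[3] = in[3];                            // alpha is carried through unchanged
        }
*/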
1725
1726ENTRY(rsdIntrinsic3DLUT_K)
1727        push        {r4-r8, r10, r11, lr}
1728        vpush       {q4-q7}
1729
1730        /* load Z stride in r4 */
1731        ldr     r4, [sp, #32 + 64]
1732
1733        /* Load count */
1734        ldr     r5, [sp, #36 + 64]
1735
1736        vmov.u16 d28, #0x8000
1737        vmov.u16 d29, #0x7fff
1738        vmov.u32 d24, #0xff000000
1739
1740        /* load constants using r10 */
1741        ldr     r10, [sp, #40 + 64]
1742        vld1.32 {d31}, [r10]!
1743        vld1.32 {d30}, [r10]!
1744
1745        mov r6, #4
1746        vmov d26, r6, r3
1747        mov r6, #0
1748        vmov d27, r4, r6
1749
1750        add r8, r3, r4
1751
1752
1753
17541:
1755        vld1.8 {d4}, [r1]!
1756        vand.u8 d25, d4, d24
1757        vmovl.u8 q2, d4
1758
1759
1760        vmull.u16 q3, d4, d31
1761        vshr.u32 q4, q3, #15       // coord1 p1
1762        vmovn.u32 d1, q3
1763        vand.u16 d1, d29           // weight 2
1764        vsub.u16 d0, d28, d1       // weight 1
1765        vmul.u32 q4, q4, q13           // q4 = x*4, y*ystride, z*zstride, 0
1766
1767        vmull.u16 q3, d5, d31
1768        vshr.u32 q5, q3, #15       // coord1 p2
1769        vmovn.u32 d3, q3
1770        vand.u16 d3, d29           // weight 2
1771        vsub.u16 d2, d28, d3       // weight 1
1772        vmul.u32 q5, q5, q13       // q5 = x*4, y*ystride, z*zstride, 0
1773
1774        vpadd.u32 d8, d8, d9
1775        vpadd.u32 d9, d10, d11
1776        vpadd.u32 d8, d8, d9
1777        vmov r6, r7, d8            // base pointers
1778
1779        add  r6, r6, r2
1780        add  r7, r7, r2
1781
1782        vld1.8 {d16}, [r6]
1783        add r11, r6, r3
1784        vld1.8 {d18}, [r11]
1785        add r11, r6, r4
1786        vld1.8 {d20}, [r11]
1787        add r11, r6, r8
1788        vld1.8 {d22}, [r11]
1789
1790        vmovl.u8 q8, d16
1791        vmovl.u8 q9, d18
1792        vmovl.u8 q10, d20
1793        vmovl.u8 q11, d22
1794
1795        vmull.u16 q6, d16, d0[0]
1796        vmlal.u16 q6, d17, d1[0]
1797        vshrn.u32 d16, q6, #7
1798        vmull.u16 q6, d18, d0[0]
1799        vmlal.u16 q6, d19, d1[0]
1800        vshrn.u32 d18, q6, #7
1801        vmull.u16 q6, d20, d0[0]
1802        vmlal.u16 q6, d21, d1[0]
1803        vshrn.u32 d20, q6, #7
1804        vmull.u16 q6, d22, d0[0]
1805        vmlal.u16 q6, d23, d1[0]
1806        vshrn.u32 d22, q6, #7
1807
1808        vmull.u16 q6, d16, d0[1]
1809        vmlal.u16 q6, d18, d1[1]
1810        vshrn.u32 d16, q6, #15
1811        vmull.u16 q6, d20, d0[1]
1812        vmlal.u16 q6, d22, d1[1]
1813        vshrn.u32 d18, q6, #15
1814
1815        vmull.u16 q6, d16, d0[2]
1816        vmlal.u16 q6, d18, d1[2]
1817        vshrn.u32 d14, q6, #15
1818
1819
1820        vld1.8 {d16}, [r7]
1821        add r11, r7, r3
1822        vld1.8 {d18}, [r11]
1823        add r11, r7, r4
1824        vld1.8 {d20}, [r11]
1825        add r11, r7, r8
1826        vld1.8 {d22}, [r11]
1827        vmovl.u8 q8, d16
1828        vmovl.u8 q9, d18
1829        vmovl.u8 q10, d20
1830        vmovl.u8 q11, d22
1831
1832        vmull.u16 q6, d16, d2[0]
1833        vmlal.u16 q6, d17, d3[0]
1834        vshrn.u32 d16, q6, #7
1835        vmull.u16 q6, d18, d2[0]
1836        vmlal.u16 q6, d19, d3[0]
1837        vshrn.u32 d18, q6, #7
1838        vmull.u16 q6, d20, d2[0]
1839        vmlal.u16 q6, d21, d3[0]
1840        vshrn.u32 d20, q6, #7
1841        vmull.u16 q6, d22, d2[0]
1842        vmlal.u16 q6, d23, d3[0]
1843        vshrn.u32 d22, q6, #7
1844
1845        vmull.u16 q6, d16, d2[1]
1846        vmlal.u16 q6, d18, d3[1]
1847        vshrn.u32 d16, q6, #15
1848        vmull.u16 q6, d20, d2[1]
1849        vmlal.u16 q6, d22, d3[1]
1850        vshrn.u32 d18, q6, #15
1851
1852        vmull.u16 q6, d16, d2[2]
1853        vmlal.u16 q6, d18, d3[2]
1854        vshrn.u32 d15, q6, #15
1855
1856        vrshrn.u16 d14, q7, #8
1857
1858        vbic.u8 d14, d14, d24  // mix in alpha
1859        vorr.u8 d14, d14, d25
1860        vst1.32 {d14}, [r0]!
1861
1862
1863        /* Are we done? */
1864        subs r5, r5, #1
1865        bne 1b
1866
1867        /* Yup, bye */
1868        vpop            {q4-q7}
1869        pop         {r4-r8, r10, r11, lr}
1870        bx          lr
1871
1872END(rsdIntrinsic3DLUT_K)
1873
1874
1875