1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17
18
19#include <machine/cpu-features.h>
20#include <machine/asm.h>
21
22/*
23        r0 = dst
24        r1 = y0 base pointer
25        r2 = y1 base pointer
26        r3 = y2 base pointer
27        sp = coeffs
28        sp+4 = length / 2
29*/
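/*
   For reference, a scalar sketch of what one output pixel works out to,
   assuming RGBA (uchar4) pixels and 8.8 fixed-point coefficients; the
   function and parameter names here are illustrative, not part of this file
   (the NEON loop below computes two output pixels per iteration):

   #include <stdint.h>

   static void convolve3x3_ref(uint8_t *dst, const uint8_t *y0,
                               const uint8_t *y1, const uint8_t *y2,
                               const int16_t coeff[9]) {
       // y0/y1/y2 point at the left-most of the three input pixels on each row
       for (int c = 0; c < 4; c++) {
           int32_t sum = 0;
           for (int i = 0; i < 3; i++) {
               sum += y0[i * 4 + c] * coeff[i];
               sum += y1[i * 4 + c] * coeff[3 + i];
               sum += y2[i * 4 + c] * coeff[6 + i];
           }
           sum >>= 8;                                             // drop the fixed-point fraction (vshrn #8)
           dst[c] = sum < 0 ? 0 : sum > 255 ? 255 : (uint8_t)sum; // saturate (vqmovun)
       }
   }
*/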
30
31ENTRY(rsdIntrinsicConvolve3x3_K)
32        push            {r4-r8, r10, r11, lr}
33        vpush           {q4-q7}
34
35        /* Get the coeffs pointer from the stack and load the
36           coefficients in the q0, q1 NEON registers */
37        ldr r4, [sp, #32+64]
38        vld1.16 {q0, q1}, [r4]
39
40        /* Get count from the stack */
41        ldr r4, [sp, #36+64]
42
43        /* Load the frequently used immediate in a register */
44        mov r5, #8
45
461:
47        /* Load and post-increment the address by r5 (#8) */
48        vld1.8 {q13}, [r1], r5
49        vld1.8 {q14}, [r2], r5
50        vld1.8 {q15}, [r3], r5
51
52        /* Prefetch the data that will be used in the loop after the next one */
53        PLD         (r1, r5)
54        PLD         (r2, r5)
55        PLD         (r3, r5)
56
57        vmovl.u8 q2, d26
58        vmovl.u8 q3, d27
59        vmovl.u8 q4, d28
60        vmovl.u8 q5, d29
61        vmovl.u8 q6, d30
62        vmovl.u8 q7, d31
63
64/*
65        The two pixel source array is
66        d4,  d5,  d6,  d7
67        d8,  d9,  d10, d11
68        d12, d13, d14, d15
69*/
70
71        vmull.s16 q8, d4, d0[0]
72        vmlal.s16 q8, d5, d0[1]
73        vmlal.s16 q8, d6, d0[2]
74        vmlal.s16 q8, d8, d0[3]
75        vmlal.s16 q8, d9, d1[0]
76        vmlal.s16 q8, d10, d1[1]
77        vmlal.s16 q8, d12, d1[2]
78        vmlal.s16 q8, d13, d1[3]
79        vmlal.s16 q8, d14, d2[0]
80
81        vmull.s16 q9, d5, d0[0]
82        vmlal.s16 q9, d6, d0[1]
83        vmlal.s16 q9, d7, d0[2]
84        vmlal.s16 q9, d9, d0[3]
85        vmlal.s16 q9, d10, d1[0]
86        vmlal.s16 q9, d11, d1[1]
87        vmlal.s16 q9, d13, d1[2]
88        vmlal.s16 q9, d14, d1[3]
89        vmlal.s16 q9, d15, d2[0]
90
91        vshrn.i32 d16, q8, #8
92        vshrn.i32 d17, q9, #8
93
94        vqmovun.s16 d16, q8
95        vst1.8 d16, [r0]!
96
97        /* Are we done yet? */
98        subs r4, r4, #1
99        bne 1b
100
101        /* We're done, bye! */
102        vpop            {q4-q7}
103        pop             {r4-r8, r10, r11, lr}
104        bx              lr
105END(rsdIntrinsicConvolve3x3_K)
106
107
108/*
109static void OneVF(float4 *out, const uchar *ptrIn, int iStride,
110                  const float* gPtr, int iradius, int x1, int x2)
111
112    r0 = out
113    r1 = pin
114    r2 = stride
115    r3 = gptr
116    r4 = sp, ct
117    r5 = sp+4, x1
118    r6 = sp+8, x2
119*/
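/*
   Roughly equivalent scalar code for this vertical pass (the NEON loop below
   produces two float4 outputs per iteration); this is a sketch under the
   register assignments above, not the original C implementation:

   static void blur_vert_ref(float *out,                // float4 per output pixel
                             const uint8_t *ptrIn, int iStride,
                             const float *gPtr, int ct, int x1, int x2) {
       for (int x = x1; x < x2; x++) {
           const uint8_t *pi = ptrIn + x * 4;
           float blurredPixel[4] = {0.f, 0.f, 0.f, 0.f};
           for (int r = 0; r < ct; r++) {
               for (int c = 0; c < 4; c++)
                   blurredPixel[c] += pi[c] * gPtr[r];
               pi += iStride;
           }
           for (int c = 0; c < 4; c++)
               *out++ = blurredPixel[c];
       }
   }
*/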
120ENTRY(rsdIntrinsicBlurVFU4_K)
121        push            {r4-r8, r10, r11, lr}
122        vpush           {q4-q7}
123
124        ldr r4, [sp, #32+64]
125        ldr r5, [sp, #32+64 + 4]
126        ldr r6, [sp, #32+64 + 8]
127
1281:
129        veor q10, q10, q10         /* float4 blurredPixel = 0; (first output pixel) */
130        veor q11, q11, q11         /* float4 blurredPixel = 0; (second output pixel) */
131        add r7, r1, r5, lsl #2  /* const uchar *pi = ptrIn + x1 * 4; */
132        mov r10, r3
133
134        mov r11, r4
135
1362:
137        vld1.32 {d2}, [r7]
138        vmovl.u8 q1, d2
139        vmovl.u16 q3, d2
140        vmovl.u16 q4, d3
141        vcvt.f32.s32 q3, q3
142        vcvt.f32.s32 q4, q4
143        vld1.32 {d0[0]}, [r10]!
144        add r7, r7, r2
145        vmla.f32 q10, q3, d0[0]
146        vmla.f32 q11, q4, d0[0]
147        subs r11, r11, #1
148        bne 2b
149
150        vst1.32 {q10}, [r0]!
151        vst1.32 {q11}, [r0]!
152        add r5, r5, #2
153        cmp r5, r6
154        bne 1b
155
156
157        vpop            {q4-q7}
158        pop             {r4-r8, r10, r11, lr}
159        bx              lr
160END(rsdIntrinsicBlurVFU4_K)
161
162/*
163static void OneHF(uchar4 *out, const float4 *ptrIn,
164                  const float* gPtr, int iradius, int x1, int x2)
165
166    r0 = out
167    r1 = pin
168    r2 = gptr
169    r3 = ct
170    r4 = sp, x1
171    r5 = sp+4, x2
172*/
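/*
   Scalar sketch of this horizontal pass: it reads the float4 intermediate
   produced by the vertical pass and writes one uchar4 per x. Names are
   illustrative; converting back to bytes assumes the weights keep the sum in
   range, matching the unsaturated narrowing below:

   static void blur_horz_u4_ref(uint8_t *out,              // uchar4 per output pixel
                                const float *ptrIn,        // float4 per input pixel
                                const float *gPtr, int ct, int x1, int x2) {
       for (int x = x1; x < x2; x++) {
           const float *pi = ptrIn + x * 4;
           float sum[4] = {0.f, 0.f, 0.f, 0.f};
           for (int r = 0; r < ct; r++)
               for (int c = 0; c < 4; c++)
                   sum[c] += pi[r * 4 + c] * gPtr[r];
           for (int c = 0; c < 4; c++)
               *out++ = (uint8_t)sum[c];                    // vcvt + vmovn, no clamp
       }
   }
*/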
173ENTRY(rsdIntrinsicBlurHFU4_K)
174        push            {r4-r8, r10, r11, lr}
175        vpush           {q4-q7}
176
177        ldr r4, [sp, #32+64]
178        ldr r5, [sp, #32+64 + 4]
179
1801:
181        add r7, r1, r4, lsl #4  /* const float4 *pi = ptrIn + x1; */
182        mov r10, r2
183        mov r11, r3
184
185        vld1.32 {q1}, [r7]!
186        vld1.32 {d6[0]}, [r10]!
187        vmul.f32 q0, q1, d6[0]
188        sub r11, r11, #1
189
1902:
191        vld1.32 {q1}, [r7]!
192        vld1.32 {q2}, [r7]!
193        vld1.32 {d6}, [r10]!
194        vmla.f32 q0, q1, d6[0]
195        vmla.f32 q0, q2, d6[1]
196        subs r11, r11, #2
197        bne 2b
198
199        vcvt.s32.f32 q0, q0
200        vmovn.u32 d0, q0
201        vmovn.u16 d0, q0
202
203        vst1.32 {d0[0]}, [r0]!
204        add r4, r4, #1
205        cmp r4, r5
206        bne 1b
207
208        vpop            {q4-q7}
209        pop             {r4-r8, r10, r11, lr}
210        bx              lr
211END(rsdIntrinsicBlurHFU4_K)
212
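/*
   Single-channel variant of the horizontal pass above. The same register
   assignments are assumed (r0 = out, r1 = pin, r2 = gptr, r3 = ct,
   sp = x1, sp+4 = x2); each iteration reads single-float inputs and emits
   four adjacent U8 results.
*/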
213ENTRY(rsdIntrinsicBlurHFU1_K)
214        push            {r4-r8, r10, r11, lr}
215        vpush           {q4-q7}
216
217        ldr r4, [sp, #32+64]
218        ldr r5, [sp, #32+64 + 4]
219
2201:
221        add r7, r1, r4, lsl #2  /* const float *pi = ptrIn + x1; */
222        mov r10, r2
223        mov r11, r3
224
225        veor q0, q0
226
2272:
228        vld1.32 {q1}, [r7]
229        add r7, r7, #4
230        vld1.32 {d4[0]}, [r10]!
231        vmla.f32 q0, q1, d4[0]
232        subs r11, r11, #1
233        bne 2b
234
235        vcvt.s32.f32 q0, q0
236        vmovn.u32 d0, q0
237        vmovn.u16 d0, q0
238
239        vst1.32 {d0[0]}, [r0]!
240        add r4, r4, #4
241        cmp r4, r5
242        bne 1b
243
244        vpop            {q4-q7}
245        pop             {r4-r8, r10, r11, lr}
246        bx              lr
247END(rsdIntrinsicBlurHFU1_K)
248
249/*
250    Function called with the following arguments: dst, Y, vu, len, YuvCoeff
251        r0 = dst
252        r1 = Y
253        r2 = VU
254        r3 = length (pixels / 8)
255        ---- Args below will be in the stack ----
256        sp = YuvCoeff
257
258        This function converts 8 pixels per iteration
259*/
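/*
   Per-pixel model of the conversion done below, using the coefficient values
   quoted in the inline comments (298, 409, -208, -100, 516, with offsets 16
   and 128); at run time these are read from the YuvCoeff table, so treat the
   constants here as illustrative:

   static inline uint8_t clamp_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

   static void yuv_to_rgba_ref(uint8_t dst[4], int y, int u, int v) {
       int yy = (y - 16) * 298;
       dst[0] = clamp_u8((yy + (v - 128) * 409                   + 128) >> 8);  // R
       dst[1] = clamp_u8((yy - (v - 128) * 208 - (u - 128) * 100 + 128) >> 8);  // G
       dst[2] = clamp_u8((yy + (u - 128) * 516                   + 128) >> 8);  // B
       dst[3] = 255;                                                            // A
   }
*/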
260ENTRY(rsdIntrinsicYuv_K)
261        push        {r4, r5, lr}            @ preserve clobbered int registers
262        vpush       {Q4-Q7}                 @ preserve Vregisters we clobber
263
264        mov  r5, #16                        @ Integer 16 in r5; used as an incrementing value
265
266        ldr         r4, [sp, #64+12]        @ load the coeffs address in memory in r4 (16*4 + 4*3)
267        vld1.16     {Q2}, [r4]!             @ load the multipliers from the coeffs matrix (r4) in Q2
268        vld1.8      {d6[]}, [r4], r5        @ load y offset 16 from the coeffs matrix (r4) in d6
269        vld1.8      {d8[]}, [r4], r5        @ load V and U offset of 128 from the coeffs matrix (r4) in d8
270
271        mov         r4, #8                  @ Integer 8 in r4; used as an incrementing value
272
273        vdup.8      d3, d5[1]               @ d3 = 255 (alpha) from the multipliers line in
274                                            @ the coeffs matrix (Q2)
275
276        1:
277        vld1.8      {d10}, [r1]!            @ get Y (r1->Y)
278        vld2.8      {d12, d14}, [r2], r4    @ split V from U (r2 -> VU) and increase pointer by 8 (in r4)
279        pld         [r1, #64]               @ preloading data from address y(r1) + 64 for subsequent loops
280        pld         [r2, #64]               @ preloading data from address vu(r2) + 64 for subsequent loops
281
282        vsubl.u8    Q5, d10, d6             @ Y to 16 bit - 16 (in 16bit) (n to n+7)
283        vmull.s16   Q8, d10, d4[0]          @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
284        vmull.s16   Q11, d11, d4[0]         @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)
285
286        vsubl.u8    Q5, d12, d8             @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3)
287        vsubl.u8    Q6, d14, d8             @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3)
288        vmov.u16    d11, d10                @ Copying V to d11
289        vmov.u16    d13, d12                @ Copying U to d13
290        vzip.u16    d10, d11                @ Q5 = V (n, n, n+1, n+1) V(n+2, n+2, n+3, n+3)
291        vzip.u16    d12, d13                @ Q6 = U (n, n, n+1, n+1) U(n+2, n+2, n+3, n+3)
292
293
294        vmov        Q9, Q8                  @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
295        vmov        Q10, Q8                 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
296        vmov        Q12, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q12
297        vmov        Q13, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q13
298
299                                            @                  R    G    B
300                                            @     Pixel(0-3)  Q8,  Q9, Q10
301                                            @     Pixel(4-7) Q11, Q12, Q13
302                                            @
303
304                                            @ Pixel(0-3)
305        vmlal.s16   Q8,  d10, d4[1]         @ R : Q8  = Q8(Y-16)  + (V-128) * 409
306        vmlal.s16   Q9,  d10, d5[0]         @ G : Q9  = Q9(Y-16)  + (V-128) * (-208)
307        vmlal.s16   Q9,  d12, d4[2]         @                     + (U-128) * (-100)
308        vmlal.s16   Q10, d12, d4[3]         @ B : Q10 = Q10(Y-16) + (U-128) * 516
309
310                                            @ Pixel(4-7)
311        vmlal.s16   Q11, d11, d4[1]         @ R : Q11 = Q11(Y-16) + (V-128) * 409
312        vmlal.s16   Q12, d11, d5[0]         @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
313        vmlal.s16   Q12, d13, d4[2]         @                     + (U-128) * (-100)
314        vmlal.s16   Q13, d13, d4[3]         @ B : Q13 = Q13(Y-16) + (U-128) * 516
315
316                                            @ Pixel(0-3)
317        vrshrn.i32  d16, Q8, #8             @ d16 : R shifted right by 8 rounded'n narrowed to 16bit
318        vrshrn.i32  d18, Q9, #8             @ d18 : G shifted right by 8 rounded'n narrowed to 16bit
319        vrshrn.i32  d20, Q10, #8            @ d20 : B shifted right by 8 rounded'n narrowed to 16bit
320
321                                            @ Pixel(4-7)
322        vrshrn.i32  d17, Q11, #8            @ d17 : R shifted right by 8 rounded'n narrowed to 16bit
323        vrshrn.i32  d19, Q12, #8            @ d19 : G shifted right by 8 rounded'n narrowed to 16bit
324        vrshrn.i32  d21, Q13, #8            @ d21 : B shifted right by 8 rounded'n narrowed to 16bit
325
326        vqmovun.s16 d0, Q8                  @ r = d0 (saturated, unsigned and narrowed to 8bit)
327        vqmovun.s16 d1, Q9                  @ g = d1 (saturated, unsigned and narrowed to 8bit)
328        vqmovun.s16 d2, Q10                 @ b = d2 (saturated, unsigned and narrowed to 8bit)
329
330        subs        r3, r3, #1              @ Checking length (r3)
331        vst4.8      {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0)
332
333        bne 1b                              @ if not done with length, loop
334
335        vpop        {Q4-Q7}                 @ Restore Vregisters
336        pop         {r4, r5, lr}            @ Restore int registers
337        bx          lr
338END(rsdIntrinsicYuv_K)
339
340/*
341    Function called with the following arguments: dst, Y, vu, len, YuvCoeff
342        r0 = dst
343        r1 = Y
344        r2 = UV
345        r3 = length (pixels / 8)
346        ---- Args below will be in the stack ----
347        sp = YuvCoeff
348
349        This function converts 8 pixels per iteration
350*/
351ENTRY(rsdIntrinsicYuvR_K)
352        push        {r4, r5, lr}            @ preserve clobbered int registers
353        vpush       {Q4-Q7}                 @ preserve Vregisters we clobber
354
355        mov  r5, #16                        @ Integer 16 in r5; used as an incrementing value
356
357        ldr         r4, [sp, #64+12]        @ load the coeffs address in memory in r4 (16*4 + 4*3)
358        vld1.16     {Q2}, [r4]!             @ load the multipliers from the coeffs matrix (r4) in Q2
359        vld1.8      {d6[]}, [r4], r5        @ load y offset 16 from the coeffs matrix (r4) in d6
360        vld1.8      {d8[]}, [r4], r5        @ load V and U offset of 128 from the coeffs matrix (r4) in d8
361
362        mov         r4, #8                  @ Integer 8 in r4; used as an incrementing value
363
364        vdup.8      d3, d5[1]               @ d3 = 255 (alpha) from the multipliers line in
365                                            @ the coeffs matrix (Q2)
366
367        1:
368        vld1.8      {d10}, [r1]!            @ get Y (r1->Y)
369        vld2.8      {d12, d14}, [r2], r4    @ split U from V (r2 -> UV) and increase pointer by 8 (in r4)
370        pld         [r1, #64]               @ preloading data from address y(r1) + 64 for subsequent loops
371        pld         [r2, #64]               @ preloading data from address vu(r2) + 64 for subsequent loops
372
373        vsubl.u8    Q5, d10, d6             @ Y to 16 bit - 16 (in 16bit) (n to n+7)
374        vmull.s16   Q8, d10, d4[0]          @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
375        vmull.s16   Q11, d11, d4[0]         @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)
376
377        vsubl.u8    Q5, d14, d8             @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3)
378        vsubl.u8    Q6, d12, d8             @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3)
379        vmov.u16    d11, d10                @ Copying V to d11
380        vmov.u16    d13, d12                @ Copying U to d13
381        vzip.u16    d10, d11                @ Q5 = V (n, n, n+1, n+1) V(n+2, n+2, n+3, n+3)
382        vzip.u16    d12, d13                @ Q6 = U (n, n, n+1, n+1) U(n+2, n+2, n+3, n+3)
383
384
385        vmov        Q9, Q8                  @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
386        vmov        Q10, Q8                 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
387        vmov        Q12, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q12
388        vmov        Q13, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q13
389
390                                            @                  R    G    B
391                                            @     Pixel(0-3)  Q8,  Q9, Q10
392                                            @     Pixel(4-7) Q11, Q12, Q13
393                                            @
394
395                                            @ Pixel(0-3)
396        vmlal.s16   Q8,  d10, d4[1]         @ R : Q8  = Q8(Y-16)  + (V-128) * 409
397        vmlal.s16   Q9,  d10, d5[0]         @ G : Q9  = Q9(Y-16)  + (V-128) * (-208)
398        vmlal.s16   Q9,  d12, d4[2]         @                     + (U-128) * (-100)
399        vmlal.s16   Q10, d12, d4[3]         @ B : Q10 = Q10(Y-16) + (U-128) * 516
400
401                                            @ Pixel(4-7)
402        vmlal.s16   Q11, d11, d4[1]         @ R : Q11 = Q11(Y-16) + (V-128) * 409
403        vmlal.s16   Q12, d11, d5[0]         @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
404        vmlal.s16   Q12, d13, d4[2]         @                     + (U-128) * (-100)
405        vmlal.s16   Q13, d13, d4[3]         @ B : Q13 = Q13(Y-16) + (U-128) * 516
406
407                                            @ Pixel(0-3)
408        vrshrn.i32  d16, Q8, #8             @ d16 : R shifted right by 8 rounded'n narrowed to 16bit
409        vrshrn.i32  d18, Q9, #8             @ d18 : G shifted right by 8 rounded'n narrowed to 16bit
410        vrshrn.i32  d20, Q10, #8            @ d20 : B shifted right by 8 rounded'n narrowed to 16bit
411
412                                            @ Pixel(4-7)
413        vrshrn.i32  d17, Q11, #8            @ d17 : R shifted right by 8 rounded'n narrowed to 16bit
414        vrshrn.i32  d19, Q12, #8            @ d19 : G shifted right by 8 rounded'n narrowed to 16bit
415        vrshrn.i32  d21, Q13, #8            @ d21 : B shifted right by 8 rounded'n narrowed to 16bit
416
417        vqmovun.s16 d0, Q8                  @ r = d0 (saturated, unsigned and narrowed to 8bit)
418        vqmovun.s16 d1, Q9                  @ g = d1 (saturated, unsigned and narrowed to 8bit)
419        vqmovun.s16 d2, Q10                 @ b = d2 (saturated, unsigned and narrowed to 8bit)
420
421        subs        r3, r3, #1              @ Checking length (r3)
422        vst4.8      {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0)
423
424        bne 1b                              @ if not done with length, loop
425
426        vpop        {Q4-Q7}                 @ Restore Vregisters
427        pop         {r4, r5, lr}            @ Restore int registers
428        bx          lr
429END(rsdIntrinsicYuvR_K)
430
431/*
432    Function called with the following arguments: dst, Y, v, u, len, YuvCoeff
433        r0 = dst
434        r1 = Y
435        r2 = V,
436        r3 = U
437        ---- Args below will be in the stack ----
438        sp = length (pixels / 8)
439        sp+4 = YuvCoeff
440
441        This function converts 8 pixels per iteration
442*/
443ENTRY(rsdIntrinsicYuv2_K)
444        push        {r4, r5, r6, lr}        @ preserve clobbered int registers
445        vpush       {Q4-Q7}                 @ preserve Vregisters we clobber
446
447        mov  r5, #16                        @ Integer 16 in r5; used as an incrementing value
448
449        ldr         r4, [sp, #64+16+4]      @ load the coeffs address in memory in r4 (16*4 + 4*4 + 4)
450        ldr         r6, [sp, #64+16]        @ load the length in r6 (16*4 + 4*4)
451        vld1.16     {Q2}, [r4]!             @ load the multipliers from the coeffs matrix (r4) in Q2
452        vld1.8      {d6[]}, [r4], r5        @ load y offset 16 from the coeffs matrix (r4) in d6
453        vld1.8      {d8[]}, [r4], r5        @ load V and U offset of 128 from the coeffs matrix (r4) in d8
454
455        mov         r4, #4                  @ Integer 4 in r4; used as an incrementing value
456
457        vdup.8      d3, d5[1]               @ d3 = 255 (alpha) from the multipliers line in
458                                            @ the coeffs matrix (Q2)
459
460        1:
461        vld1.8      {d10}, [r1]!            @ get Y (r1->Y)
462        vld1.8      {d12}, [r3], r4         @ load 8 chroma bytes from the plane at r3 and increase the pointer by 4 (in r4)
463        vld1.8      {d14}, [r2], r4         @ load 8 chroma bytes from the plane at r2 and increase the pointer by 4 (in r4)
464        pld         [r1, #64]               @ preloading data from address y(r1) + 64 for subsequent loops
465        pld         [r2, #64]               @ preloading data from address vu(r2) + 64 for subsequent loops
466
467        vsubl.u8    Q5, d10, d6             @ Y to 16 bit - 16 (in 16bit) (n to n+7)
468        vmull.s16   Q8, d10, d4[0]          @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
469        vmull.s16   Q11, d11, d4[0]         @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)
470
471        vsubl.u8    Q5, d12, d8             @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3)
472        vsubl.u8    Q6, d14, d8             @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3)
473        vmov.u16    d11, d10                @ Copying V to d11
474        vmov.u16    d13, d12                @ Copying U to d13
475        vzip.u16    d10, d11                @ Q5 = V (n, n, n+1, n+1) V(n+2, n+2, n+3, n+3)
476        vzip.u16    d12, d13                @ Q6 = U (n, n, n+1, n+1) U(n+2, n+2, n+3, n+3)
477
478
479        vmov        Q9, Q8                  @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
480        vmov        Q10, Q8                 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
481        vmov        Q12, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q12
482        vmov        Q13, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q13
483
484                                            @                  R    G    B
485                                            @     Pixel(0-3)  Q8,  Q9, Q10
486                                            @     Pixel(4-7) Q11, Q12, Q13
487                                            @
488
489                                            @ Pixel(0-3)
490        vmlal.s16   Q8,  d10, d4[1]         @ R : Q8  = Q8(Y-16)  + (V-128) * 409
491        vmlal.s16   Q9,  d10, d5[0]         @ G : Q9  = Q9(Y-16)  + (V-128) * (-208)
492        vmlal.s16   Q9,  d12, d4[2]         @                     + (U-128) * (-100)
493        vmlal.s16   Q10, d12, d4[3]         @ B : Q10 = Q10(Y-16) + (U-128) * 516
494
495                                            @ Pixel(4-7)
496        vmlal.s16   Q11, d11, d4[1]         @ R : Q11 = Q11(Y-16) + (V-128) * 409
497        vmlal.s16   Q12, d11, d5[0]         @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
498        vmlal.s16   Q12, d13, d4[2]         @                     + (U-128) * (-100)
499        vmlal.s16   Q13, d13, d4[3]         @ B : Q13 = Q13(Y-16) + (U-128) * 516
500
501                                            @ Pixel(0-3)
502        vrshrn.i32  d16, Q8, #8             @ d16 : R shifted right by 8 rounded'n narrowed to 16bit
503        vrshrn.i32  d18, Q9, #8             @ d18 : G shifted right by 8 rounded'n narrowed to 16bit
504        vrshrn.i32  d20, Q10, #8            @ d20 : B shifted right by 8 rounded'n narrowed to 16bit
505
506                                            @ Pixel(4-7)
507        vrshrn.i32  d17, Q11, #8            @ d17 : R shifted right by 8 rounded'n narrowed to 16bit
508        vrshrn.i32  d19, Q12, #8            @ d19 : G shifted right by 8 rounded'n narrowed to 16bit
509        vrshrn.i32  d21, Q13, #8            @ d21 : B shifted right by 8 rounded'n narrowed to 16bit
510
511        vqmovun.s16 d0, Q8                  @ r = d0 (saturated, unsigned and narrowed to 8bit)
512        vqmovun.s16 d1, Q9                  @ g = d1 (saturated, unsigned and narrowed to 8bit)
513        vqmovun.s16 d2, Q10                 @ b = d2 (saturated, unsigned and narrowed to 8bit)
514
515        subs        r6, r6, #1              @ Checking length (r6)
516        vst4.8      {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0)
517
518        bne 1b                              @ if not done with length, loop
519
520        vpop        {Q4-Q7}                 @ Restore Vregisters
521        pop         {r4, r5, r6, lr}        @ Restore int registers
522        bx          lr
523END(rsdIntrinsicYuv2_K)
524
525/* Convolve 5x5 */
526
527/*
528        r0 = dst
529        r1 = y0 base pointer
530        r2 = y1 base pointer
531        r3 = y2 base pointer
532        r4 = y3 base pointer
533        r5 = y4 base pointer
534        r6 = coeffs
535        r7 = length
536*/
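/*
   Scalar sketch of one output pixel, assuming RGBA (uchar4) pixels and 8.8
   fixed-point coefficients (illustrative names; the NEON loop below computes
   two output pixels per iteration and adds a 0x7f bias before the rounding
   shift):

   static void convolve5x5_ref(uint8_t *dst, const uint8_t *rows[5],
                               const int16_t coeff[25]) {
       // rows[r] points at the left-most of the five input pixels on source row r
       for (int c = 0; c < 4; c++) {
           int32_t sum = 0;
           for (int r = 0; r < 5; r++)
               for (int i = 0; i < 5; i++)
                   sum += rows[r][i * 4 + c] * coeff[r * 5 + i];
           sum = (sum + 0x7f + 0x80) >> 8;                        // 0x7f bias plus vrshrn's own rounding
           dst[c] = sum < 0 ? 0 : sum > 255 ? 255 : (uint8_t)sum; // saturate (vqmovun)
       }
   }
*/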
537ENTRY(rsdIntrinsicConvolve5x5_K)
538        push        {r4-r7, lr}
539        vpush       {q4-q7}
540
541        /* load y3 in r4 */
542        ldr     r4, [sp, #20 + 64]
543
544        /* load y4 in r5 */
545        ldr     r5, [sp, #24 + 64]
546
547        /* Load the coefficients pointer */
548        ldr     r6, [sp, #28 + 64]
549
550        /* Create the coefficients vector */
551        vld1.16     {d0, d1, d2, d3}, [r6]!
552        vld1.16     {d4, d5, d6}, [r6]
553
554        vmov.u32  q15, #0x7f
555
556        /* load the count */
557        ldr     r6, [sp, #32 + 64]
558
559        /* Load the frequently used immediate in a register */
560        mov     r7, #8
561
5621:
563        /* Load the y base pointers in Qregs and post-increment the address by r7=#8 */
564        vld1.8  {d24, d25, d26}, [r1], r7      @  y0 ( y - 2 )
565        vld1.8  {d27, d28, d29}, [r2], r7      @  y1 ( y - 1 )
566
567        /* Prefetch the data that will be used in the loop after the next one */
568        PLD         (r1, r7)
569        PLD         (r2, r7)
570
571        /* Promoting the 8bit channels to 16bit */
572        vmovl.u8 q9,  d24
573        vmovl.u8 q10, d25
574        vmovl.u8 q11, d26
575        vmovl.u8 q12, d27
576        vmovl.u8 q13, d28
577        vmovl.u8 q14, d29
578
579/*
580        The promoted 16-bit samples are:
581        row (y - 2) in d18-d23, row (y - 1) in d24-d29
582*/
583        vmull.s16 q4, d18, d0[0]
584        vmlal.s16 q4, d19, d0[1]
585        vmlal.s16 q4, d20, d0[2]
586        vmlal.s16 q4, d21, d0[3]
587        vmlal.s16 q4, d22, d1[0]
588
589        vmlal.s16 q4, d24, d1[1]
590        vmlal.s16 q4, d25, d1[2]
591        vmlal.s16 q4, d26, d1[3]
592        vmlal.s16 q4, d27, d2[0]
593        vmlal.s16 q4, d28, d2[1]
594
595        vmull.s16 q5, d19, d0[0]
596        vmlal.s16 q5, d20, d0[1]
597        vmlal.s16 q5, d21, d0[2]
598        vmlal.s16 q5, d22, d0[3]
599        vmlal.s16 q5, d23, d1[0]
600
601        vmlal.s16 q5, d25, d1[1]
602        vmlal.s16 q5, d26, d1[2]
603        vmlal.s16 q5, d27, d1[3]
604        vmlal.s16 q5, d28, d2[0]
605        vmlal.s16 q5, d29, d2[1]
606
607
608        /* Next 2 rows */
609        /* Load the y base pointers in Qregs and post-increment the address by r7=#8 */
610        vld1.8  {d24, d25, d26}, [r3], r7      @  y2 ( y )
611        vld1.8  {d27, d28, d29}, [r4], r7      @  y3 ( y + 1 )
612
613        /* Prefetch the data that will be used in the loop after the next one */
614        PLD         (r3, r7)
615        PLD         (r4, r7)
616
617        /* Promoting the 8bit channels to 16bit */
618        vmovl.u8 q9,  d24
619        vmovl.u8 q10, d25
620        vmovl.u8 q11, d26
621        vmovl.u8 q12, d27
622        vmovl.u8 q13, d28
623        vmovl.u8 q14, d29
624
625/*
626        The promoted 16-bit samples are:
627        row (y) in d18-d23, row (y + 1) in d24-d29
628*/
629        vmlal.s16 q4, d18, d2[2]
630        vmlal.s16 q4, d19, d2[3]
631        vmlal.s16 q4, d20, d3[0]
632        vmlal.s16 q4, d21, d3[1]
633        vmlal.s16 q4, d22, d3[2]
634
635        vmlal.s16 q4, d24, d3[3]
636        vmlal.s16 q4, d25, d4[0]
637        vmlal.s16 q4, d26, d4[1]
638        vmlal.s16 q4, d27, d4[2]
639        vmlal.s16 q4, d28, d4[3]
640
641        vmlal.s16 q5, d19, d2[2]
642        vmlal.s16 q5, d20, d2[3]
643        vmlal.s16 q5, d21, d3[0]
644        vmlal.s16 q5, d22, d3[1]
645        vmlal.s16 q5, d23, d3[2]
646
647        vmlal.s16 q5, d25, d3[3]
648        vmlal.s16 q5, d26, d4[0]
649        vmlal.s16 q5, d27, d4[1]
650        vmlal.s16 q5, d28, d4[2]
651        vmlal.s16 q5, d29, d4[3]
652
653        /* Last row */
654        /* Load the y base pointer in Qregs and post-increment the address by r7=#8 */
655        vld1.8  {d24, d25, d26}, [r5], r7      @  y4 ( y + 2 )
656
657        /* Prefetch the data that will be used in the loop after the next one */
658        PLD         (r5, r7)
659
660        /* Promoting the 8bit channels to 16bit */
661        vmovl.u8 q9,  d24
662        vmovl.u8 q10, d25
663        vmovl.u8 q11, d26
664
665/*
666        The promoted 16-bit samples for this last row (y + 2)
667        are in d18-d23
668*/
669
670        vmlal.s16 q4, d18, d5[0]
671        vmlal.s16 q4, d19, d5[1]
672        vmlal.s16 q4, d20, d5[2]
673        vmlal.s16 q4, d21, d5[3]
674        vmlal.s16 q4, d22, d6[0]
675
676        vmlal.s16 q5, d19, d5[0]
677        vmlal.s16 q5, d20, d5[1]
678        vmlal.s16 q5, d21, d5[2]
679        vmlal.s16 q5, d22, d5[3]
680        vmlal.s16 q5, d23, d6[0]
681
682
683
684        vadd.i32 q4, q4, q15
685        vadd.i32 q5, q5, q15
686
687/*      Narrow it to a d-reg 32 -> 16 bit */
688        vrshrn.i32 d8, q4, #8
689        vrshrn.i32 d9, q5, #8
690
691
692/*      Pack 16 -> 8 bit, saturate, put two pixels into D reg */
693        vqmovun.s16 d8, q4
694
695        vst1.8 d8, [r0]!           @ return the output and increase the address of r0
696
697        /* Are we done? */
698        subs r6, r6, #1
699        bne 1b
700
701        /* Yup, bye */
702        vpop        {q4-q7}
703        pop         {r4-r7, lr}
704        bx          lr
705
706END(rsdIntrinsicConvolve5x5_K)
707
708
709
710
711/*
712        dst = src + dst * (1.0 - src.a)
713
714        r0 = dst
715        r1 = src
716        r2 = length
717*/
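/*
   The 8-bit fixed-point form of this blend, per RGBA channel, as a sketch
   (the 16-bit wrap mirrors the vmla/vshrn lanes below):

   static void blend_src_over_ref(uint8_t *dst, const uint8_t *src) {
       uint16_t inv_a = 255 - src[3];
       for (int c = 0; c < 4; c++)
           dst[c] = (uint8_t)(((uint16_t)((src[c] << 8) + dst[c] * inv_a)) >> 8);
   }
*/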
718ENTRY(rsdIntrinsicBlendSrcOver_K)
719        .save           {r4, lr}
720        stmfd           sp!, {r4, lr}
721        vpush           {q4-q7}
722
723        mov r4, #255
724        vdup.16 q7, r4
725
726        mov r4, r0
7271:
728
729        /* src */
730        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
731        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
732        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
733        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
734        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
735        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
736        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
737        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
738        vshll.u8 q12, d0, #8
739        vshll.u8 q13, d1, #8
740        vshll.u8 q14, d2, #8
741        vmovl.u8 q6, d3
742        vsub.i16 q6, q7, q6        // q6 = 1 - src.a
743        vshll.u8 q15, d3, #8
744
745        /* dst */
746        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
747        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
748        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
749        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
750        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
751        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
752        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
753        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
754        vmovl.u8 q8, d0
755        vmovl.u8 q9, d1
756        vmovl.u8 q10, d2
757        vmovl.u8 q11, d3
758
759        vmla.i16 q12, q8, q6
760        vmla.i16 q13, q9, q6
761        vmla.i16 q14, q10, q6
762        vmla.i16 q15, q11, q6
763
764        vshrn.i16 d0, q12, #8
765        vshrn.i16 d1, q13, #8
766        vshrn.i16 d2, q14, #8
767        vshrn.i16 d3, q15, #8
768        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
769        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
770        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
771        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
772        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
773        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
774        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
775        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
776
777        subs r2, r2, #1
778        bne 1b
779
780        vpop            {q4-q7}
781        ldmfd           sp!, {r4, lr}
782        bx              lr
783END(rsdIntrinsicBlendSrcOver_K)
784
785/*
786        dst = dst + src * (1.0 - dst.a)
787
788        r0 = dst
789        r1 = src
790        r2 = length
791*/
792ENTRY(rsdIntrinsicBlendDstOver_K)
793        .save           {r4, lr}
794        stmfd           sp!, {r4, lr}
795        vpush           {q4-q7}
796
797        mov r4, #255
798        vdup.16 q7, r4
799
800        mov r4, r0
8011:
802
803        /* src */
804        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
805        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
806        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
807        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
808        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
809        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
810        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
811        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
812        vmovl.u8 q12, d0
813        vmovl.u8 q13, d1
814        vmovl.u8 q14, d2
815        vmovl.u8 q15, d3
816
817        /* dst */
818        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
819        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
820        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
821        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
822        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
823        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
824        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
825        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
826        vshll.u8 q8, d0, #8
827        vshll.u8 q9, d1, #8
828        vshll.u8 q10, d2, #8
829        vmovl.u8 q6, d3
830        vsub.i16 q6, q7, q6        // q6 = 1 - dst.a
831        vshll.u8 q11, d3, #8
832
833
834        vmla.i16 q8, q12, q6
835        vmla.i16 q9, q13, q6
836        vmla.i16 q10, q14, q6
837        vmla.i16 q11, q15, q6
838
839        vshrn.i16 d0, q8, #8
840        vshrn.i16 d1, q9, #8
841        vshrn.i16 d2, q10, #8
842        vshrn.i16 d3, q11, #8
843        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
844        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
845        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
846        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
847        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
848        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
849        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
850        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
851
852        subs r2, r2, #1
853        bne 1b
854
855        vpop            {q4-q7}
856        ldmfd           sp!, {r4, lr}
857        bx              lr
858END(rsdIntrinsicBlendDstOver_K)
859
860/*
861        dst = src * dst.a
862
863        r0 = dst
864        r1 = src
865        r2 = length
866*/
867ENTRY(rsdIntrinsicBlendSrcIn_K)
868        .save           {r4, lr}
869        stmfd           sp!, {r4, lr}
870        vpush           {q4-q7}
871
872        mov r4, r0
8731:
874
875        /* src */
876        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
877        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
878        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
879        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
880        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
881        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
882        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
883        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
884        vmovl.u8 q12, d0
885        vmovl.u8 q13, d1
886        vmovl.u8 q14, d2
887        vmovl.u8 q15, d3
888
889        /* dst */
890        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
891        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
892        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
893        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
894        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
895        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
896        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
897        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
898        //vmovl.u8 q8, d0
899        //vmovl.u8 q9, d1
900        //vmovl.u8 q10, d2
901        vmovl.u8 q11, d3
902
903        vmul.i16 q12, q12, q11
904        vmul.i16 q13, q13, q11
905        vmul.i16 q14, q14, q11
906        vmul.i16 q15, q15, q11
907
908        vshrn.i16 d0, q12, #8
909        vshrn.i16 d1, q13, #8
910        vshrn.i16 d2, q14, #8
911        vshrn.i16 d3, q15, #8
912        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
913        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
914        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
915        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
916        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
917        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
918        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
919        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
920
921        subs r2, r2, #1
922        bne 1b
923
924        vpop            {q4-q7}
925        ldmfd           sp!, {r4, lr}
926        bx              lr
927END(rsdIntrinsicBlendSrcIn_K)
928
929/*
930        dst = dst * src.a
931
932        r0 = dst
933        r1 = src
934        r2 = length
935*/
936ENTRY(rsdIntrinsicBlendDstIn_K)
937        .save           {r4, lr}
938        stmfd           sp!, {r4, lr}
939        vpush           {q4-q7}
940
941        mov r4, r0
9421:
943
944        /* src */
945        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
946        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
947        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
948        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
949        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
950        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
951        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
952        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
953        //vmovl.u8 q12, d0
954        //vmovl.u8 q13, d1
955        //vmovl.u8 q14, d2
956        vmovl.u8 q15, d3
957
958        /* dst */
959        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
960        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
961        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
962        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
963        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
964        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
965        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
966        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
967        vmovl.u8 q8, d0
968        vmovl.u8 q9, d1
969        vmovl.u8 q10, d2
970        vmovl.u8 q11, d3
971
972        vmul.i16 q8, q8, q15
973        vmul.i16 q9, q9, q15
974        vmul.i16 q10, q10, q15
975        vmul.i16 q11, q11, q15
976
977        vshrn.i16 d0, q8, #8
978        vshrn.i16 d1, q9, #8
979        vshrn.i16 d2, q10, #8
980        vshrn.i16 d3, q11, #8
981        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
982        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
983        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
984        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
985        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
986        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
987        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
988        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
989
990        subs r2, r2, #1
991        bne 1b
992
993        vpop            {q4-q7}
994        ldmfd           sp!, {r4, lr}
995        bx              lr
996END(rsdIntrinsicBlendDstIn_K)
997
998
999
1000/*
1001        dst = src * (1.0 - dst.a)
1002
1003        r0 = dst
1004        r1 = src
1005        r2 = length
1006*/
1007ENTRY(rsdIntrinsicBlendSrcOut_K)
1008        .save           {r4, lr}
1009        stmfd           sp!, {r4, lr}
1010        vpush           {q4-q7}
1011
1012        mov r4, #255
1013        vdup.16 q7, r4
1014
1015        mov r4, r0
10161:
1017
1018        /* src */
1019        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
1020        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
1021        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
1022        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
1023        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
1024        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
1025        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
1026        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
1027        vmovl.u8 q12, d0
1028        vmovl.u8 q13, d1
1029        vmovl.u8 q14, d2
1030        vmovl.u8 q15, d3
1031
1032        /* dst */
1033        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
1034        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
1035        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
1036        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
1037        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
1038        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
1039        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
1040        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
1041        //vmovl.u8 q8, d0
1042        //vmovl.u8 q9, d1
1043        //vmovl.u8 q10, d2
1044        vmovl.u8 q11, d3
1045
1046
1047        vsub.i16 q6, q7, q11        // q6 = 1 - dst.a
1048        vmul.i16 q12, q12, q6
1049        vmul.i16 q13, q13, q6
1050        vmul.i16 q14, q14, q6
1051        vmul.i16 q15, q15, q6
1052
1053        vshrn.i16 d0, q12, #8
1054        vshrn.i16 d1, q13, #8
1055        vshrn.i16 d2, q14, #8
1056        vshrn.i16 d3, q15, #8
1057        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
1058        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
1059        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
1060        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
1061        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
1062        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
1063        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
1064        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
1065
1066        subs r2, r2, #1
1067        bne 1b
1068
1069        vpop            {q4-q7}
1070        ldmfd           sp!, {r4, lr}
1071        bx              lr
1072END(rsdIntrinsicBlendSrcOut_K)
1073
1074
1075/*
1076        dst = dst * (1.0 - src.a)
1077
1078        r0 = dst
1079        r1 = src
1080        r2 = length
1081*/
1082ENTRY(rsdIntrinsicBlendDstOut_K)
1083        .save           {r4, lr}
1084        stmfd           sp!, {r4, lr}
1085        vpush           {q4-q7}
1086
1087        mov r4, #255
1088        vdup.16 q7, r4
1089
1090        mov r4, r0
10911:
1092
1093        /* src */
1094        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
1095        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
1096        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
1097        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
1098        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
1099        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
1100        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
1101        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
1102        //vmovl.u8 q12, d0
1103        //vmovl.u8 q13, d1
1104        //vmovl.u8 q14, d2
1105        vmovl.u8 q15, d3
1106
1107        /* dst */
1108        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
1109        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
1110        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
1111        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
1112        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
1113        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
1114        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
1115        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
1116        vmovl.u8 q8, d0
1117        vmovl.u8 q9, d1
1118        vmovl.u8 q10, d2
1119        vmovl.u8 q11, d3
1120
1121
1122        vsub.i16 q6, q7, q15        // q6 = 1 - src.a
1123        vmul.i16 q12, q8, q6
1124        vmul.i16 q13, q9, q6
1125        vmul.i16 q14, q10, q6
1126        vmul.i16 q15, q11, q6
1127
1128        vshrn.i16 d0, q12, #8
1129        vshrn.i16 d1, q13, #8
1130        vshrn.i16 d2, q14, #8
1131        vshrn.i16 d3, q15, #8
1132        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
1133        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
1134        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
1135        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
1136        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
1137        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
1138        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
1139        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
1140
1141        subs r2, r2, #1
1142        bne 1b
1143
1144        vpop            {q4-q7}
1145        ldmfd           sp!, {r4, lr}
1146        bx              lr
1147END(rsdIntrinsicBlendDstOut_K)
1148
1149
1150/*
1151        dst.rgb = src.rgb * dst.a + (1.0 - src.a) * dst.rgb
1152        dst.a = dst.a
1153
1154        r0 = dst
1155        r1 = src
1156        r2 = length
1157*/
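/*
   Fixed-point sketch of this blend: the rgb channels mix src and dst by
   dst.a and (255 - src.a), while the alpha byte is left as dst.a (the code
   below stores the dst alpha it loaded in d3 unchanged):

   static void blend_src_atop_ref(uint8_t *dst, const uint8_t *src) {
       uint16_t inv_sa = 255 - src[3];
       for (int c = 0; c < 3; c++)
           dst[c] = (uint8_t)(((uint16_t)(dst[c] * inv_sa + src[c] * dst[3])) >> 8);
   }
*/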
1158ENTRY(rsdIntrinsicBlendSrcAtop_K)
1159        .save           {r4, lr}
1160        stmfd           sp!, {r4, lr}
1161        vpush           {q4-q7}
1162
1163        mov r4, #255
1164        vdup.16 q7, r4
1165
1166        mov r4, r0
11671:
1168
1169        /* src */
1170        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
1171        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
1172        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
1173        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
1174        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
1175        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
1176        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
1177        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
1178        vmovl.u8 q12, d0
1179        vmovl.u8 q13, d1
1180        vmovl.u8 q14, d2
1181        vmovl.u8 q15, d3
1182
1183        /* dst */
1184        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
1185        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
1186        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
1187        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
1188        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
1189        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
1190        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
1191        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
1192        vmovl.u8 q8, d0
1193        vmovl.u8 q9, d1
1194        vmovl.u8 q10, d2
1195        vmovl.u8 q11, d3
1196
1197
1198        vsub.i16 q6, q7, q15        // q6 = 1 - src.a
1199        vmul.i16 q8, q8, q6
1200        vmul.i16 q9, q9, q6
1201        vmul.i16 q10, q10, q6
1202
1203        vmla.i16 q8, q12, q11
1204        vmla.i16 q9, q13, q11
1205        vmla.i16 q10, q14, q11
1206
1207
1208        vshrn.i16 d0, q8, #8
1209        vshrn.i16 d1, q9, #8
1210        vshrn.i16 d2, q10, #8
1211        //vshrn.i16 d3, q15, #8
1212        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
1213        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
1214        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
1215        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
1216        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
1217        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
1218        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
1219        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
1220
1221        subs r2, r2, #1
1222        bne 1b
1223
1224        vpop            {q4-q7}
1225        ldmfd           sp!, {r4, lr}
1226        bx              lr
1227END(rsdIntrinsicBlendSrcAtop_K)
1228
1229/*
1230        dst.rgb = dst.rgb * src.a + (1.0 - dst.a) * src.rgb
1231        dst.a = src.a
1232
1233        r0 = dst
1234        r1 = src
1235        r2 = length
1236*/
1237ENTRY(rsdIntrinsicBlendDstAtop_K)
1238        .save           {r4, lr}
1239        stmfd           sp!, {r4, lr}
1240        vpush           {q4-q7}
1241
1242        mov r4, #255
1243        vdup.16 q7, r4
1244
1245        mov r4, r0
12461:
1247
1248        /* src */
1249        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
1250        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
1251        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
1252        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
1253        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
1254        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
1255        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
1256        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
1257        vmovl.u8 q12, d0
1258        vmovl.u8 q13, d1
1259        vmovl.u8 q14, d2
1260        vmovl.u8 q15, d3
1261
1262        /* dst */
1263        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
1264        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
1265        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
1266        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
1267        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
1268        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
1269        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
1270        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
1271        vmovl.u8 q8, d0
1272        vmovl.u8 q9, d1
1273        vmovl.u8 q10, d2
1274        vmovl.u8 q11, d3
1275
1276
1277        vsub.i16 q6, q7, q11        // q6 = 1 - dst.a
1278        vmul.i16 q12, q12, q6
1279        vmul.i16 q13, q13, q6
1280        vmul.i16 q14, q14, q6
1281
1282        vmla.i16 q12, q8, q15
1283        vmla.i16 q13, q9, q15
1284        vmla.i16 q14, q10, q15
1285
1286
1287        vshrn.i16 d0, q12, #8
1288        vshrn.i16 d1, q13, #8
1289        vshrn.i16 d2, q14, #8
1290        //vshrn.i16 d3, q15, #8
1291        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
1292        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
1293        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
1294        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
1295        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
1296        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
1297        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
1298        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
1299
1300        subs r2, r2, #1
1301        bne 1b
1302
1303        vpop            {q4-q7}
1304        ldmfd           sp!, {r4, lr}
1305        bx              lr
1306END(rsdIntrinsicBlendDstAtop_K)
1307
1308/*
1309        dst = dst ^ src
1310
1311        r0 = dst
1312        r1 = src
1313        r2 = length
1314*/
1315ENTRY(rsdIntrinsicBlendXor_K)
1316        .save           {r4, lr}
1317        stmfd           sp!, {r4, lr}
1318        vpush           {q4-q7}
1319
1320        mov r4, #255
1321        vdup.16 q7, r4
1322
1323        mov r4, r0
13241:
1325
1326        /* src */
1327        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
1328        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
1329        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
1330        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
1331        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
1332        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
1333        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
1334        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
1335        vmov.u8 d4, d0
1336        vmov.u8 d5, d1
1337        vmov.u8 d6, d2
1338        vmov.u8 d7, d3
1339
1340        /* dst */
1341        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
1342        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
1343        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
1344        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
1345        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
1346        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
1347        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
1348        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
1349
1350        veor d0, d0, d4
1351        veor d1, d1, d5
1352        veor d2, d2, d6
1353        veor d3, d3, d7
1354
1355        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
1356        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
1357        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
1358        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
1359        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
1360        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
1361        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
1362        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
1363
1364        subs r2, r2, #1
1365        bne 1b
1366
1367        vpop            {q4-q7}
1368        ldmfd           sp!, {r4, lr}
1369        bx              lr
1370END(rsdIntrinsicBlendXor_K)
1371
1372/*
1373        dst = dst * src
1374
1375        r0 = dst
1376        r1 = src
1377        r2 = length
1378*/
1379ENTRY(rsdIntrinsicBlendMultiply_K)
1380        .save           {r4, lr}
1381        stmfd           sp!, {r4, lr}
1382        vpush           {q4-q7}
1383
1384        mov r4, #255
1385        vdup.16 q7, r4
1386
1387        mov r4, r0
13881:
1389
1390        /* src */
1391        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
1392        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
1393        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
1394        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
1395        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
1396        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
1397        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
1398        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
1399        vmovl.u8 q12, d0
1400        vmovl.u8 q13, d1
1401        vmovl.u8 q14, d2
1402        vmovl.u8 q15, d3
1403
1404        /* dst */
1405        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
1406        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
1407        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
1408        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
1409        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
1410        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
1411        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
1412        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
1413        vmovl.u8 q8, d0
1414        vmovl.u8 q9, d1
1415        vmovl.u8 q10, d2
1416        vmovl.u8 q11, d3
1417
1418
1419        vmul.i16 q8, q8, q12
1420        vmul.i16 q9, q9, q13
1421        vmul.i16 q10, q10, q14
1422        vmul.i16 q11, q11, q15
1423
1424        vshrn.i16 d0, q8, #8
1425        vshrn.i16 d1, q9, #8
1426        vshrn.i16 d2, q10, #8
1427        vshrn.i16 d3, q11, #8
1428        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
1429        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
1430        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
1431        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
1432        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
1433        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
1434        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
1435        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
1436
1437        subs r2, r2, #1
1438        bne 1b
1439
1440        vpop            {q4-q7}
1441        ldmfd           sp!, {r4, lr}
1442        bx              lr
1443END(rsdIntrinsicBlendMultiply_K)
1444
1445/*
1446        dst = min(src + dst, 1.0)
1447
1448        r0 = dst
1449        r1 = src
1450        r2 = length
1451*/
1452ENTRY(rsdIntrinsicBlendAdd_K)
1453        .save           {r4, lr}
1454        stmfd           sp!, {r4, lr}
1455        vpush           {q4-q7}
1456
1457        mov r4, #255
1458        vdup.16 q7, r4
1459
1460        mov r4, r0
14611:
1462
1463        /* src */
1464        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
1465        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
1466        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
1467        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
1468        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
1469        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
1470        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
1471        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
1472        vmovl.u8 q12, d0
1473        vmovl.u8 q13, d1
1474        vmovl.u8 q14, d2
1475        vmovl.u8 q15, d3
1476
1477        /* dst */
1478        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
1479        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
1480        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
1481        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
1482        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
1483        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
1484        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
1485        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
1486        vmovl.u8 q8, d0
1487        vmovl.u8 q9, d1
1488        vmovl.u8 q10, d2
1489        vmovl.u8 q11, d3
1490
1491
1492        vadd.i16 q8, q8, q12
1493        vadd.i16 q9, q9, q13
1494        vadd.i16 q10, q10, q14
1495        vadd.i16 q11, q11, q15
1496
1497        vqmovun.s16 d0, q8
1498        vqmovun.s16 d1, q9
1499        vqmovun.s16 d2, q10
1500        vqmovun.s16 d3, q11
1501        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
1502        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
1503        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
1504        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
1505        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
1506        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
1507        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
1508        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
1509
1510        subs r2, r2, #1
1511        bne 1b
1512
1513        vpop            {q4-q7}
1514        ldmfd           sp!, {r4, lr}
1515        bx              lr
1516END(rsdIntrinsicBlendAdd_K)
1517
1518
1519/*
1520        dst = max(dst - src, 0.0)
1521
1522        r0 = dst
1523        r1 = src
1524        r2 = length
1525*/
1526ENTRY(rsdIntrinsicBlendSub_K)
1527        .save           {r4, lr}
1528        stmfd           sp!, {r4, lr}
1529        vpush           {q4-q7}
1530
1531        mov r4, #255
1532        vdup.16 q7, r4
1533
1534        mov r4, r0
15351:
1536
1537        /* src */
1538        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
1539        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
1540        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
1541        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
1542        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
1543        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
1544        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
1545        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
1546        vmovl.u8 q12, d0
1547        vmovl.u8 q13, d1
1548        vmovl.u8 q14, d2
1549        vmovl.u8 q15, d3
1550
1551        /* dst */
1552        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
1553        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
1554        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
1555        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
1556        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
1557        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
1558        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
1559        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
1560        vmovl.u8 q8, d0
1561        vmovl.u8 q9, d1
1562        vmovl.u8 q10, d2
1563        vmovl.u8 q11, d3
1564
1565
1566        vsub.i16 q8, q8, q12
1567        vsub.i16 q9, q9, q13
1568        vsub.i16 q10, q10, q14
1569        vsub.i16 q11, q11, q15
1570
1571        vqmovun.s16 d0, q8
1572        vqmovun.s16 d1, q9
1573        vqmovun.s16 d2, q10
1574        vqmovun.s16 d3, q11
1575        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
1576        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
1577        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
1578        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
1579        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
1580        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
1581        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
1582        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
1583
1584        subs r2, r2, #1
1585        bne 1b
1586
1587        vpop            {q4-q7}
1588        ldmfd           sp!, {r4, lr}
1589        bx              lr
1590END(rsdIntrinsicBlendSub_K)
1591
1592
1593/* 3D LUT */
1594
1595/*
1596        r0 = dst
1597        r1 = src
1598        r2 = cube base pointer
1599        r3 = cube Y stride
1600        r4 = cube Z stride
1601        r5 = count
1602        r10 = constants pointer
1603
1604        d0  / q0  = weight 1 p1
1605        d1        = weight 2 p1
1606
1607        d2  / q1  = weight 1 p2
1608        d3        = weight 2 p2
1609
1610        d4  / q2  = src1
1611        d5        = src2
1612
1613        d6  / q3  = baseCoord
1614        d7        = baseCoord
1615
1616        d8  / q4  = coord1 p1
1617        d9        =
1618
1619        d10 / q5  = coord1 p2
1620        d11       =
1621
1622        d12 / q6  =
1623        d13       =
1624
1625        d14 / q7  =
1626        d15       =
1627
1628
1629        d16 / q8  = x0 y0 z0
1630        d17       = x1 y0 z0
1631        d18 / q9  = x0 y1 z0
1632        d19       = x1 y1 z0
1633        d20 / q10 = x0 y0 z1
1634        d21       = x1 y0 z1
1635        d22 / q11 = x0 y1 z1
1636        d23       = x1 y1 z1
1637
1638        d24 / q12 = alpha mask
1639        d25       = current pixel alpha
1640        d26 / q13 = 4, y stride
1641        d27       = z stride, 0
1642        d28 / q14 = 0x8000
1643        d29       = 0x7fff
1644        d30 / q15 = 0, 0, 0, 0xffff
1645
1646
1647        d31 = coordMult
1648*/
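/*
   Conceptually, each source pixel is mapped through a trilinear lookup in the
   colour cube: (r, g, b) scaled by coordMult gives a cell index plus 15-bit
   fractional weights, and the 8 surrounding cube entries are blended. A float
   sketch of the per-channel blend (the kernel below does the same thing in
   fixed point; names are illustrative):

   static inline float lerpf(float a, float b, float t) { return a + (b - a) * t; }

   // c[z][y][x] holds one channel of the 8 cube corners around the sample point
   static float lut3d_channel_ref(const float c[2][2][2], float fx, float fy, float fz) {
       float x00 = lerpf(c[0][0][0], c[0][0][1], fx);
       float x10 = lerpf(c[0][1][0], c[0][1][1], fx);
       float x01 = lerpf(c[1][0][0], c[1][0][1], fx);
       float x11 = lerpf(c[1][1][0], c[1][1][1], fx);
       return lerpf(lerpf(x00, x10, fy), lerpf(x01, x11, fy), fz);
   }

   The source alpha byte is carried through unchanged (the d24/d25 masking at
   the end of the loop).
*/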
1649
1650ENTRY(rsdIntrinsic3DLUT_K)
1651        push        {r4-r8, r10, r11, lr}
1652        vpush       {q4-q7}
1653
1654        /* load Z stride in r4 */
1655        ldr     r4, [sp, #32 + 64]
1656
1657        /* Load count */
1658        ldr     r5, [sp, #36 + 64]
1659
1660        vmov.u16 d28, #0x8000
1661        vmov.u16 d29, #0x7fff
1662        vmov.u32 d24, #0xff000000
1663
1664        /* load constants using r10 */
1665        ldr     r10, [sp, #40 + 64]
1666        vld1.32 {d31}, [r10]!
1667        vld1.32 {d30}, [r10]!
1668
1669        mov r6, #4
1670        vmov d26, r6, r3
1671        mov r6, #0
1672        vmov d27, r4, r6
1673
1674        add r8, r3, r4
1675
1676
1677
16781:
1679        vld1.8 {d4}, [r1]!
1680        vand.u8 d25, d4, d24
1681        vmovl.u8 q2, d4
1682
1683
1684        vmull.u16 q3, d4, d31
1685        vshr.u32 q4, q3, #15       // coord1 p1
1686        vmovn.u32 d1, q3
1687        vand.u16 d1, d29           // weight 2
1688        vsub.u16 d0, d28, d1       // weight 1
1689        vmul.u32 q4, q4, q13           // q4 = x*4, y*ystride, z*zstride, 0
1690
1691        vmull.u16 q3, d5, d31
1692        vshr.u32 q5, q3, #15       // coord1 p2
1693        vmovn.u32 d3, q3
1694        vand.u16 d3, d29           // weight 2
1695        vsub.u16 d2, d28, d3       // weight 1
1696        vmul.u32 q5, q5, q13       // q5 = x*4, y*ystride, z*zstride, 0
1697
1698        vpadd.u32 d8, d8, d9
1699        vpadd.u32 d9, d10, d11
1700        vpadd.u32 d8, d8, d9
1701        vmov r6, r7, d8            // base pointers
1702
1703        add  r6, r6, r2
1704        add  r7, r7, r2
1705
1706        vld1.8 {d16}, [r6]
1707        add r11, r6, r3
1708        vld1.8 {d18}, [r11]
1709        add r11, r6, r4
1710        vld1.8 {d20}, [r11]
1711        add r11, r6, r8
1712        vld1.8 {d22}, [r11]
1713
1714        vmovl.u8 q8, d16
1715        vmovl.u8 q9, d18
1716        vmovl.u8 q10, d20
1717        vmovl.u8 q11, d22
1718
1719        vmull.u16 q6, d16, d0[0]
1720        vmlal.u16 q6, d17, d1[0]
1721        vshrn.u32 d16, q6, #7
1722        vmull.u16 q6, d18, d0[0]
1723        vmlal.u16 q6, d19, d1[0]
1724        vshrn.u32 d18, q6, #7
1725        vmull.u16 q6, d20, d0[0]
1726        vmlal.u16 q6, d21, d1[0]
1727        vshrn.u32 d20, q6, #7
1728        vmull.u16 q6, d22, d0[0]
1729        vmlal.u16 q6, d23, d1[0]
1730        vshrn.u32 d22, q6, #7
1731
1732        vmull.u16 q6, d16, d0[1]
1733        vmlal.u16 q6, d18, d1[1]
1734        vshrn.u32 d16, q6, #15
1735        vmull.u16 q6, d20, d0[1]
1736        vmlal.u16 q6, d22, d1[1]
1737        vshrn.u32 d18, q6, #15
1738
1739        vmull.u16 q6, d16, d0[2]
1740        vmlal.u16 q6, d18, d1[2]
1741        vshrn.u32 d14, q6, #15
1742
1743
1744        vld1.8 {d16}, [r7]
1745        add r11, r7, r3
1746        vld1.8 {d18}, [r11]
1747        add r11, r7, r4
1748        vld1.8 {d20}, [r11]
1749        add r11, r7, r8
1750        vld1.8 {d22}, [r11]
1751        vmovl.u8 q8, d16
1752        vmovl.u8 q9, d18
1753        vmovl.u8 q10, d20
1754        vmovl.u8 q11, d22
1755
1756        vmull.u16 q6, d16, d2[0]
1757        vmlal.u16 q6, d17, d3[0]
1758        vshrn.u32 d16, q6, #7
1759        vmull.u16 q6, d18, d2[0]
1760        vmlal.u16 q6, d19, d3[0]
1761        vshrn.u32 d18, q6, #7
1762        vmull.u16 q6, d20, d2[0]
1763        vmlal.u16 q6, d21, d3[0]
1764        vshrn.u32 d20, q6, #7
1765        vmull.u16 q6, d22, d2[0]
1766        vmlal.u16 q6, d23, d3[0]
1767        vshrn.u32 d22, q6, #7
1768
1769        vmull.u16 q6, d16, d2[1]
1770        vmlal.u16 q6, d18, d3[1]
1771        vshrn.u32 d16, q6, #15
1772        vmull.u16 q6, d20, d2[1]
1773        vmlal.u16 q6, d22, d3[1]
1774        vshrn.u32 d18, q6, #15
1775
1776        vmull.u16 q6, d16, d2[2]
1777        vmlal.u16 q6, d18, d3[2]
1778        vshrn.u32 d15, q6, #15
1779
1780        vrshrn.u16 d14, q7, #8
1781
1782        vbic.u8 d14, d14, d24  // mix in alpha
1783        vorr.u8 d14, d14, d25
1784        vst1.32 {d14}, [r0]!
1785
1786
1787        /* Are we done? */
1788        subs r5, r5, #1
1789        bne 1b
1790
1791        /* Yup, bye */
1792        vpop            {q4-q7}
1793        pop         {r4-r8, r10, r11, lr}
1794        bx          lr
1795
1796END(rsdIntrinsic3DLUT_K)
1797
1798
1799