/* libs/pixelflinger/t32cb16blend.S
**
** Copyright 2006, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
**     http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/


    .text
    .align

    .global scanline_t32cb16blend_arm
    // NOTE(review): .type/.size assume an ELF target; marking the symbol as a
    // function lets the linker handle ARM/Thumb interworking for callers.
    .type   scanline_t32cb16blend_arm, %function


/*
 * .macro pixel
 *
 * Blends one pre-multiplied 32-bit source pixel over one RGB565 destination
 * pixel:  out = min(max_channel, src_channel + ((dst_channel * f) >> 8))
 * where f = 0x100 - (sA + (sA >> 7)), i.e. the inverse source alpha mapped
 * onto 0..0x100.
 *
 * \DREG is a 32-bit register containing *two* original destination RGB565
 *       pixels, with the even one in the low-16 bits, and the odd one in the
 *       high 16 bits.
 *
 * \SRC is a 32-bit 0xAABBGGRR pixel value, with pre-multiplied colors.
 *
 * \FB is a target register that will contain the blended pixel values.
 *     The even (\ODD == 0) variant overwrites all of \FB; the odd variant
 *     only ORs into the upper 16 bits, so for a pixel pair the even variant
 *     must be expanded first.
 *
 * \ODD is either 0 or 1 and indicates if we're blending the lower or
 *      upper 16-bit pixels in DREG into FB
 *
 *
 * clobbered: r6, r7, lr, condition flags
 *
 */

.macro pixel,   DREG, SRC, FB, ODD

    // SRC = 0xAABBGGRR
    mov     r7, \SRC, lsr #24           // sA
    add     r7, r7, r7, lsr #7          // sA + (sA >> 7)
    rsb     r7, r7, #0x100              // r7 = 0x100 - (sA+(sA>>7)), in 0..0x100

.if \ODD

    // red: destination bits 27..31, source bits 3..7
    mov     lr, \DREG, lsr #(16 + 11)
    smulbb  lr, r7, lr
    mov     r6, \SRC, lsr #3
    and     r6, r6, #0x1F
    add     lr, r6, lr, lsr #8
    cmp     lr, #0x1F                   // saturate to 5 bits
    orrhs   \FB, \FB, #(0x1F<<(16 + 11))
    orrlo   \FB, \FB, lr, lsl #(16 + 11)

    // green: destination bits 21..26, source bits 10..15
    and     r6, \DREG, #(0x3F<<(16 + 5))
    smulbt  r6, r7, r6                  // top half of r6 holds green << 5
    mov     lr, \SRC, lsr #(8+2)
    and     lr, lr, #0x3F
    add     r6, lr, r6, lsr #(5+8)      // drop the << 5 and the alpha scale
    cmp     r6, #0x3F                   // saturate to 6 bits
    orrhs   \FB, \FB, #(0x3F<<(16 + 5))
    orrlo   \FB, \FB, r6, lsl #(16 + 5)

    // blue: destination bits 16..20, source bits 19..23
    and     lr, \DREG, #(0x1F << 16)
    smulbt  lr, r7, lr                  // top half of lr holds blue
    mov     r6, \SRC, lsr #(8+8+3)
    and     r6, r6, #0x1F
    add     lr, r6, lr, lsr #8
    cmp     lr, #0x1F                   // saturate to 5 bits
    orrhs   \FB, \FB, #(0x1F << 16)
    orrlo   \FB, \FB, lr, lsl #16

.else

    // red: destination bits 11..15, source bits 3..7
    mov     lr, \DREG, lsr #11
    and     lr, lr, #0x1F
    smulbb  lr, r7, lr
    mov     r6, \SRC, lsr #3
    and     r6, r6, #0x1F
    add     lr, r6, lr, lsr #8
    cmp     lr, #0x1F                   // saturate to 5 bits
    movhs   \FB, #(0x1F<<11)            // mov (not orr): this initialises \FB
    movlo   \FB, lr, lsl #11

    // green: destination bits 5..10, source bits 10..15
    and     r6, \DREG, #(0x3F<<5)
    smulbb  r6, r7, r6                  // bottom half of r6 holds green << 5
    mov     lr, \SRC, lsr #(8+2)
    and     lr, lr, #0x3F
    add     r6, lr, r6, lsr #(5+8)      // drop the << 5 and the alpha scale
    cmp     r6, #0x3F                   // saturate to 6 bits
    orrhs   \FB, \FB, #(0x3F<<5)
    orrlo   \FB, \FB, r6, lsl #5

    // blue: destination bits 0..4, source bits 19..23
    and     lr, \DREG, #0x1F
    smulbb  lr, r7, lr
    mov     r6, \SRC, lsr #(8+8+3)
    and     r6, r6, #0x1F
    add     lr, r6, lr, lsr #8
    cmp     lr, #0x1F                   // saturate to 5 bits
    orrhs   \FB, \FB, #0x1F
    orrlo   \FB, \FB, lr

.endif

    .endm


/*
 * scanline_t32cb16blend_arm
 *
 * Blends a run of 32-bit pre-multiplied source pixels over 16-bit RGB565
 * destination pixels, in place.
 *
 * NOTE(review): presumably declared on the C side as
 *   void scanline_t32cb16blend_arm(uint16_t* dst, uint32_t* src, size_t count)
 * -- confirm against the caller. The count is treated as unsigned here.
 * The destination is assumed to be at least 16-bit aligned.
 *
 * Saves and restores r4-r7 and lr; uses r3 and r12 as scratch.
 */

// r0:  dst ptr
// r1:  src ptr
// r2:  count
// r3:  d
// r4:  s0
// r5:  s1
// r6:  pixel
// r7:  pixel
// r8:  free
// r9:  free
// r10: free
// r11: free
// r12: scratch
// r14: pixel

scanline_t32cb16blend_arm:
    stmfd   sp!, {r4-r7, lr}

    pld     [r0]
    pld     [r1]

    // align DST to 32 bits: if it is not, blend a single leading pixel first
    tst     r0, #0x3
    beq     aligned
    subs    r2, r2, #1
    ldmlofd sp!, {r4-r7, lr}        // count was 0: return
    bxlo    lr

    // blend exactly one pixel (leading unaligned pixel, or trailing odd one)
last:
    ldr     r4, [r1], #4
    ldrh    r3, [r0]
    pixel   r3, r4, r12, 0
    strh    r12, [r0], #2

    // from here on r2 is kept biased by -2 so that a borrow from the
    // subtraction means "fewer than two pixels left"
aligned:
    subs    r2, r2, #2
    blo     9f

    // The main loop is unrolled twice and processes 4 pixels
8:  ldmia   r1!, {r4, r5}
    // stream the source
    pld     [r1, #32]
    add     r0, r0, #4
    // both source pixels are all zero, skip this pair
    orrs    r3, r4, r5
    beq     7f

    // load the destination
    ldr     r3, [r0, #-4]
    // stream the destination
    pld     [r0, #32]
    pixel   r3, r4, r12, 0
    pixel   r3, r5, r12, 1
    // effectively, we're getting write-combining by virtue of the
    // cpu's write-back cache.
    str     r12, [r0, #-4]

    // 2nd iteration of the loop, don't stream anything
    subs    r2, r2, #2
    blo     9f
    ldmia   r1!, {r4, r5}
    add     r0, r0, #4
    orrs    r3, r4, r5
    beq     7f
    ldr     r3, [r0, #-4]
    pixel   r3, r4, r12, 0
    pixel   r3, r5, r12, 1
    str     r12, [r0, #-4]

7:  subs    r2, r2, #2
    bhs     8b

    // r2 is now -2 (nothing left) or -1 (one pixel left); the add carries
    // out only in the latter case
9:  adds    r2, r2, #1
    ldmlofd sp!, {r4-r7, lr}        // return
    bxlo    lr
    b       last                    // `last` reloads the source pixel itself

    .size   scanline_t32cb16blend_arm, . - scanline_t32cb16blend_arm