1 /*
2 * Copyright 2015 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "SkBlitMask.h"
9 #include "SkColor_opts_neon.h"
10
SkBlitLCD16OpaqueRow_neon(SkPMColor dst[],const uint16_t src[],SkColor color,int width,SkPMColor opaqueDst)11 void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
12 SkColor color, int width,
13 SkPMColor opaqueDst) {
14 int colR = SkColorGetR(color);
15 int colG = SkColorGetG(color);
16 int colB = SkColorGetB(color);
17
18 uint8x8_t vcolR, vcolG, vcolB;
19 uint8x8_t vopqDstA, vopqDstR, vopqDstG, vopqDstB;
20
21 if (width >= 8) {
22 vcolR = vdup_n_u8(colR);
23 vcolG = vdup_n_u8(colG);
24 vcolB = vdup_n_u8(colB);
25 vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst));
26 vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst));
27 vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst));
28 vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst));
29 }
30
31 while (width >= 8) {
32 uint8x8x4_t vdst;
33 uint16x8_t vmask;
34 uint16x8_t vmaskR, vmaskG, vmaskB;
35 uint8x8_t vsel_trans, vsel_opq;
36
37 vdst = vld4_u8((uint8_t*)dst);
38 vmask = vld1q_u16(src);
39
40 // Prepare compare masks
41 vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0)));
42 vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF)));
43
44 // Get all the color masks on 5 bits
45 vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
46 vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
47 SK_B16_BITS + SK_R16_BITS + 1);
48 vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
49
50 // Upscale to 0..32
51 vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
52 vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
53 vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
54
55 vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF));
56 vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]);
57
58 vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
59 vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
60 vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);
61
62 vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]);
63 vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]);
64 vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]);
65
66 vst4_u8((uint8_t*)dst, vdst);
67
68 dst += 8;
69 src += 8;
70 width -= 8;
71 }
72
73 // Leftovers
74 for (int i = 0; i < width; i++) {
75 dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i],
76 opaqueDst);
77 }
78 }
79
SkBlitLCD16Row_neon(SkPMColor dst[],const uint16_t src[],SkColor color,int width,SkPMColor)80 void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[],
81 SkColor color, int width, SkPMColor) {
82 int colA = SkColorGetA(color);
83 int colR = SkColorGetR(color);
84 int colG = SkColorGetG(color);
85 int colB = SkColorGetB(color);
86
87 colA = SkAlpha255To256(colA);
88
89 uint8x8_t vcolR, vcolG, vcolB;
90 uint16x8_t vcolA;
91
92 if (width >= 8) {
93 vcolA = vdupq_n_u16(colA);
94 vcolR = vdup_n_u8(colR);
95 vcolG = vdup_n_u8(colG);
96 vcolB = vdup_n_u8(colB);
97 }
98
99 while (width >= 8) {
100 uint8x8x4_t vdst;
101 uint16x8_t vmask;
102 uint16x8_t vmaskR, vmaskG, vmaskB;
103
104 vdst = vld4_u8((uint8_t*)dst);
105 vmask = vld1q_u16(src);
106
107 // Get all the color masks on 5 bits
108 vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
109 vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
110 SK_B16_BITS + SK_R16_BITS + 1);
111 vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
112
113 // Upscale to 0..32
114 vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
115 vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
116 vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
117
118 vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
119 vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
120 vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);
121
122 vdst.val[NEON_A] = vdup_n_u8(0xFF);
123 vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
124 vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
125 vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);
126
127 vst4_u8((uint8_t*)dst, vdst);
128
129 dst += 8;
130 src += 8;
131 width -= 8;
132 }
133
134 for (int i = 0; i < width; i++) {
135 dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]);
136 }
137 }
138
// Lane helpers used by SkRGB16BlitterBlitV_neon. Each one expands to two
// complete statements (the expansion already ends in ';'), which is why the
// invocations are written without a trailing semicolon. Both rely on
// variables of the expansion site (`device` / `dst` and `deviceRB`) in
// addition to their arguments.

// Loads the 16-bit value at `device` into lane `n` of the 128-bit vector
// `reg`, then moves `device` forward by `deviceRB` bytes.
#define LOAD_LANE_16(reg, n) \
    reg = vld1q_lane_u16(device, reg, n); \
    device = (uint16_t*)((char*)device + deviceRB);

// Stores lane `n` of the 64-bit vector `reg` to `dst`, then moves `dst`
// forward by `deviceRB` bytes.
#define STORE_LANE_16(reg, n) \
    vst1_lane_u16(dst, reg, n); \
    dst = (uint16_t*)((char*)dst + deviceRB);
146
SkRGB16BlitterBlitV_neon(uint16_t * device,int height,size_t deviceRB,unsigned scale,uint32_t src32)147 void SkRGB16BlitterBlitV_neon(uint16_t* device,
148 int height,
149 size_t deviceRB,
150 unsigned scale,
151 uint32_t src32) {
152 if (height >= 8)
153 {
154 uint16_t* dst = device;
155
156 // prepare constants
157 uint16x8_t vdev = vdupq_n_u16(0);
158 uint16x8_t vmaskq_g16 = vdupq_n_u16(SK_G16_MASK_IN_PLACE);
159 uint16x8_t vmaskq_ng16 = vdupq_n_u16(~SK_G16_MASK_IN_PLACE);
160 uint32x4_t vsrc32 = vdupq_n_u32(src32);
161 uint32x4_t vscale5 = vdupq_n_u32((uint32_t)scale);
162
163 while (height >= 8){
164 LOAD_LANE_16(vdev, 0)
165 LOAD_LANE_16(vdev, 1)
166 LOAD_LANE_16(vdev, 2)
167 LOAD_LANE_16(vdev, 3)
168 LOAD_LANE_16(vdev, 4)
169 LOAD_LANE_16(vdev, 5)
170 LOAD_LANE_16(vdev, 6)
171 LOAD_LANE_16(vdev, 7)
172
173 // Expand_rgb_16
174 uint16x8x2_t vdst = vzipq_u16((vdev & vmaskq_ng16), (vdev & vmaskq_g16));
175 uint32x4_t vdst32_lo = vmulq_u32(vreinterpretq_u32_u16(vdst.val[0]), vscale5);
176 uint32x4_t vdst32_hi = vmulq_u32(vreinterpretq_u32_u16(vdst.val[1]), vscale5);
177
178 // Compact_rgb_16
179 vdst32_lo = vaddq_u32(vdst32_lo, vsrc32);
180 vdst32_hi = vaddq_u32(vdst32_hi, vsrc32);
181 vdst32_lo = vshrq_n_u32(vdst32_lo, 5);
182 vdst32_hi = vshrq_n_u32(vdst32_hi, 5);
183
184 uint16x4_t vtmp_lo = vmovn_u32(vdst32_lo) & vget_low_u16(vmaskq_ng16);
185 uint16x4_t vtmp_hi = vshrn_n_u32(vdst32_lo, 16) & vget_low_u16(vmaskq_g16);
186 uint16x4_t vdst16_lo = vorr_u16(vtmp_lo, vtmp_hi);
187 vtmp_lo = vmovn_u32(vdst32_hi) & vget_low_u16(vmaskq_ng16);
188 vtmp_hi = vshrn_n_u32(vdst32_hi, 16) & vget_low_u16(vmaskq_g16);
189 uint16x4_t vdst16_hi = vorr_u16(vtmp_lo, vtmp_hi);
190
191 STORE_LANE_16(vdst16_lo, 0)
192 STORE_LANE_16(vdst16_lo, 1)
193 STORE_LANE_16(vdst16_lo, 2)
194 STORE_LANE_16(vdst16_lo, 3)
195 STORE_LANE_16(vdst16_hi, 0)
196 STORE_LANE_16(vdst16_hi, 1)
197 STORE_LANE_16(vdst16_hi, 2)
198 STORE_LANE_16(vdst16_hi, 3)
199 height -= 8;
200 }
201 }
202 while (height != 0){
203 uint32_t dst32 = SkExpand_rgb_16(*device) * scale;
204 *device = SkCompact_rgb_16((src32 + dst32) >> 5);
205 device = (uint16_t*)((char*)device + deviceRB);
206 height--;
207 }
208 }
209
// Drop the lane helper macros so they are not visible to code after this point.
#undef LOAD_LANE_16
#undef STORE_LANE_16
212