1 /*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "SkBlitRow_opts_arm_neon.h"
9
10 #include "SkBlitMask.h"
11 #include "SkBlitRow.h"
12 #include "SkColorPriv.h"
13 #include "SkDither.h"
14 #include "SkMathPriv.h"
15 #include "SkUtils.h"
16
17 #include "SkColor_opts_neon.h"
18 #include <arm_neon.h>
19
20 /* Neon version of S32_Blend_BlitRow32()
21 * portable version is in src/core/SkBlitRow_D32.cpp
22 */
S32_Blend_BlitRow32_neon(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)23 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
24 const SkPMColor* SK_RESTRICT src,
25 int count, U8CPU alpha) {
26 SkASSERT(alpha <= 255);
27
28 if (count <= 0) {
29 return;
30 }
31
32 uint16_t src_scale = SkAlpha255To256(alpha);
33 uint16_t dst_scale = 256 - src_scale;
34
35 while (count >= 2) {
36 uint8x8_t vsrc, vdst, vres;
37 uint16x8_t vsrc_wide, vdst_wide;
38
39 /* These commented prefetches are a big win for count
40 * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
41 * They also hurt a little (<5%) on an A15
42 */
43 //__builtin_prefetch(src+32);
44 //__builtin_prefetch(dst+32);
45
46 // Load
47 vsrc = vreinterpret_u8_u32(vld1_u32(src));
48 vdst = vreinterpret_u8_u32(vld1_u32(dst));
49
50 // Process src
51 vsrc_wide = vmovl_u8(vsrc);
52 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
53
54 // Process dst
55 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
56
57 // Combine
58 vdst_wide += vsrc_wide;
59 vres = vshrn_n_u16(vdst_wide, 8);
60
61 // Store
62 vst1_u32(dst, vreinterpret_u32_u8(vres));
63
64 src += 2;
65 dst += 2;
66 count -= 2;
67 }
68
69 if (count == 1) {
70 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
71 uint16x8_t vsrc_wide, vdst_wide;
72
73 // Load
74 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
75 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
76
77 // Process
78 vsrc_wide = vmovl_u8(vsrc);
79 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
80 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
81 vdst_wide += vsrc_wide;
82 vres = vshrn_n_u16(vdst_wide, 8);
83
84 // Store
85 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
86 }
87 }
88
89 #ifdef SK_CPU_ARM32
S32A_Blend_BlitRow32_neon(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)90 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
91 const SkPMColor* SK_RESTRICT src,
92 int count, U8CPU alpha) {
93
94 SkASSERT(255 > alpha);
95
96 if (count <= 0) {
97 return;
98 }
99
100 unsigned alpha256 = SkAlpha255To256(alpha);
101
102 // First deal with odd counts
103 if (count & 1) {
104 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
105 uint16x8_t vdst_wide, vsrc_wide;
106 unsigned dst_scale;
107
108 // Load
109 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
110 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
111
112 // Calc dst_scale
113 dst_scale = vget_lane_u8(vsrc, 3);
114 dst_scale = SkAlphaMulInv256(dst_scale, alpha256);
115
116 // Process src
117 vsrc_wide = vmovl_u8(vsrc);
118 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
119
120 // Process dst
121 vdst_wide = vmovl_u8(vdst);
122 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
123
124 // Combine
125 vdst_wide += vsrc_wide;
126 vres = vshrn_n_u16(vdst_wide, 8);
127
128 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
129 dst++;
130 src++;
131 count--;
132 }
133
134 if (count) {
135 uint8x8_t alpha_mask;
136 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
137 alpha_mask = vld1_u8(alpha_mask_setup);
138
139 do {
140
141 uint8x8_t vsrc, vdst, vres, vsrc_alphas;
142 uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
143
144 __builtin_prefetch(src+32);
145 __builtin_prefetch(dst+32);
146
147 // Load
148 vsrc = vreinterpret_u8_u32(vld1_u32(src));
149 vdst = vreinterpret_u8_u32(vld1_u32(dst));
150
151 // Prepare src_scale
152 vsrc_scale = vdupq_n_u16(alpha256);
153
154 // Calc dst_scale
155 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
156 vdst_scale = vmovl_u8(vsrc_alphas);
157 // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
158 // A 16-bit lane would overflow if we used 0xFFFF here,
159 // so use an approximation with 0xFF00 that is off by 1,
160 // and add back 1 after to get the correct value.
161 // This is valid if alpha256 <= 255.
162 vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
163 vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
164 vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);
165
166 // Process src
167 vsrc_wide = vmovl_u8(vsrc);
168 vsrc_wide *= vsrc_scale;
169
170 // Process dst
171 vdst_wide = vmovl_u8(vdst);
172 vdst_wide *= vdst_scale;
173
174 // Combine
175 vdst_wide += vsrc_wide;
176 vres = vshrn_n_u16(vdst_wide, 8);
177
178 vst1_u32(dst, vreinterpret_u32_u8(vres));
179
180 src += 2;
181 dst += 2;
182 count -= 2;
183 } while(count);
184 }
185 }
186
187 ///////////////////////////////////////////////////////////////////////////////
188
189 #endif // #ifdef SK_CPU_ARM32
190
191 ///////////////////////////////////////////////////////////////////////////////
192
193 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
194 nullptr, // S32_Opaque,
195 S32_Blend_BlitRow32_neon, // S32_Blend,
196 nullptr, // Ported to SkOpts
197 #ifdef SK_CPU_ARM32
198 S32A_Blend_BlitRow32_neon // S32A_Blend
199 #else
200 nullptr
201 #endif
202 };
203