1 /*
2 * Copyright (c) 2020-2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #ifndef GRAPHIC_LITE_GRAPHIC_NEON_UTILS_H
17 #define GRAPHIC_LITE_GRAPHIC_NEON_UTILS_H
18
19 #include "graphic_config.h"
20 #ifdef ARM_NEON_OPT
21 #include <arm_neon.h>
22 #include "gfx_utils/color.h"
23 #include "gfx_utils/graphic_math.h"
24 #include "gfx_utils/graphic_types.h"
25
26 namespace OHOS {
// Loop step sizes (elements processed per NEON iteration) used by the
// NEON-optimized drawing routines.
#define NEON_STEP_4 4
#define NEON_STEP_8 8
#define NEON_STEP_32 32
// Channel lane indices into the uint8x8x4_t produced by vld4_u8 on an
// ARGB8888 buffer: memory byte order is B, G, R, A.
#define NEON_A 3
#define NEON_R 2
#define NEON_G 1
#define NEON_B 0
34
// Fast approximation of vIn / 255 per 16-bit lane:
//   (x + ((x + 257) >> 8)) >> 8, where 257 = 2^8 + 1.
// Exact for every x that is a product of two bytes (0..255 * 0..255).
static inline uint8x8_t NeonFastDiv255(uint16x8_t vIn)
{
    const uint16x8_t bias = vdupq_n_u16(257);
    const uint16x8_t correction = vshrq_n_u16(vaddq_u16(vIn, bias), 8);
    return vmovn_u16(vshrq_n_u16(vaddq_u16(vIn, correction), 8));
}
41
// Per-lane a * b / 255 for 8-bit channels (e.g. color * alpha).
static inline uint8x8_t NeonMulDiv255(uint8x8_t a, uint8x8_t b)
{
    const uint16x8_t product = vmull_u8(a, b);
    return NeonFastDiv255(product);
}
47
// Per-lane floating-point division a / b.
// NEON has no float divide instruction, so start from the hardware
// reciprocal estimate of b and refine it with two Newton-Raphson steps
// (vrecpsq_f32 computes the step factor 2 - b * est).
static inline float32x4_t NeonDiv(float32x4_t a, float32x4_t b)
{
    float32x4_t est = vrecpeq_f32(b);
    est = vmulq_f32(vrecpsq_f32(b, est), est);
    est = vmulq_f32(vrecpsq_f32(b, est), est);
    return vmulq_f32(a, est);
}
57
// Per-lane integer division a / b.
// a holds 16-bit numerators, b holds 8-bit denominators; the quotients are
// computed in floating point (via NeonDiv) and narrowed back to 8 bits.
static inline uint8x8_t NeonDivInt(uint16x8_t a, uint8x8_t b)
{
    const uint16x8_t wideB = vmovl_u8(b);
    const float32x4_t lowQuot = NeonDiv(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a))),
                                        vcvtq_f32_u32(vmovl_u16(vget_low_u16(wideB))));
    const float32x4_t highQuot = NeonDiv(vcvtq_f32_u32(vmovl_u16(vget_high_u16(a))),
                                         vcvtq_f32_u32(vmovl_u16(vget_high_u16(wideB))));
    const uint16x4_t lowHalf = vmovn_u32(vcvtq_u32_f32(lowQuot));
    const uint16x4_t highHalf = vmovn_u32(vcvtq_u32_f32(highQuot));
    return vmovn_u16(vcombine_u16(lowHalf, highHalf));
}
68
// Copies MATH_MIN(dstSize, srcSize) bytes from src to dst using NEON
// load/store-multiple instructions, 64 bytes per iteration. The unaligned
// remainder (size % 64) is copied first with memcpy_s; on memcpy_s failure
// the function returns without copying the rest.
static inline void NeonMemcpy(void* dst, int32_t dstSize, const void* src, int32_t srcSize)
{
    int32_t sz = MATH_MIN(dstSize, srcSize);
    // 64: bytes moved per NEON iteration (eight 8-byte d-registers), so the
    // asm loop below must only ever see a multiple of 64.
    int32_t mod = sz % 64;
    if (mod) {
        if (memcpy_s(dst, mod, src, mod) != EOK) {
            return;
        }
        sz -= mod;
        if (sz == 0) {
            return;
        }
        dst = (uint8_t*)dst + mod;
        src = (uint8_t*)src + mod;
    }

    // Use a local numeric label ("1:" / "1b") instead of a named label: a
    // named label is emitted verbatim each time the compiler expands this
    // function (it can be inlined at several call sites in one translation
    // unit), which makes the assembler fail on a duplicate label definition.
    asm volatile (
        "1: \n"
        "    PLD [%[src], #0xC0] \n" // prefetch 192 bytes ahead of src
        "    VLDM %[src]!, {d0-d7} \n"
        "    VSTM %[dst]!, {d0-d7} \n"
        "    SUBS %[sz], %[sz], #0x40 \n"
        "    BGT 1b \n"
        : [dst]"+r"(dst), [src]"+r"(src), [sz]"+r"(sz) : : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory");
}
95
// Source-over blend of 8 straight-alpha pixels: (r2, g2, b2, a2) on top of
// (r1, g1, b1, a1). Results are written back through the reference arguments.
static inline void NeonBlendRGBA(uint8x8_t& r1, uint8x8_t& g1, uint8x8_t& b1, uint8x8_t& a1,
    uint8x8_t r2, uint8x8_t g2, uint8x8_t b2, uint8x8_t a2)
{
    // Destination weight a1 * (255 - a2) / 255, taken from the incoming a1
    // before it is overwritten below.
    uint8x8_t dstWeight = NeonMulDiv255(a1, vsub_u8(vdup_n_u8(OPA_OPAQUE), a2));
    // Result alpha: a1 - a1 * a2 / 255 + a2.
    a1 = vadd_u8(vsub_u8(a1, NeonMulDiv255(a2, a1)), a2);
    // 16-bit weighted channel sums, divided by the result alpha.
    uint16x8_t sumR = vaddq_u16(vmull_u8(r2, a2), vmull_u8(r1, dstWeight));
    uint16x8_t sumG = vaddq_u16(vmull_u8(g2, a2), vmull_u8(g1, dstWeight));
    uint16x8_t sumB = vaddq_u16(vmull_u8(b2, a2), vmull_u8(b1, dstWeight));
    r1 = NeonDivInt(sumR, a1);
    g1 = NeonDivInt(sumG, a1);
    b1 = NeonDivInt(sumB, a1);
}
108
// Alpha-weighted average per channel: c1 = (c2 * a2 + c1 * (255 - a2)) / 255.
// a1 is left untouched; it is only present so the signature matches the
// other blend helpers.
static inline void NeonBlendRGB(uint8x8_t& r1, uint8x8_t& g1, uint8x8_t& b1, uint8x8_t& a1,
    uint8x8_t r2, uint8x8_t g2, uint8x8_t b2, uint8x8_t a2)
{
    uint8x8_t invA2 = vsub_u8(vdup_n_u8(OPA_OPAQUE), a2);
    r1 = vadd_u8(NeonMulDiv255(r2, a2), NeonMulDiv255(r1, invA2));
    g1 = vadd_u8(NeonMulDiv255(g2, a2), NeonMulDiv255(g1, invA2));
    b1 = vadd_u8(NeonMulDiv255(b2, a2), NeonMulDiv255(b1, invA2));
}
117
// De-interleaves 8 ARGB8888 pixels from buf into per-channel vectors.
static inline void LoadBuf_ARGB8888(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a)
{
    const uint8x8x4_t pixels = vld4_u8(buf);
    a = pixels.val[NEON_A];
    r = pixels.val[NEON_R];
    g = pixels.val[NEON_G];
    b = pixels.val[NEON_B];
}
126
// De-interleaves 8 RGB888 pixels from buf into per-channel vectors.
// The format carries no alpha, so 'a' is deliberately left unmodified.
static inline void LoadBuf_RGB888(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a)
{
    const uint8x8x3_t pixels = vld3_u8(buf);
    r = pixels.val[NEON_R];
    g = pixels.val[NEON_G];
    b = pixels.val[NEON_B];
}
134
// Unpacks 8 RGB565 pixels from buf; each channel is left-aligned into the
// top bits of a byte (the low bits stay zero). 'a' is left unmodified —
// the format has no alpha.
static inline void LoadBuf_RGB565(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a)
{
    const uint16x8_t pixels = vld1q_u16(reinterpret_cast<uint16_t*>(buf));
    // 11, 3: RRRRRGGG|GGGBBBBB -> isolate the 5 red bits, left-align in a byte.
    r = vmovn_u16(vshlq_n_u16(vshrq_n_u16(pixels, 11), 3));
    // 5, 2: RRRRRGGG|GGGBBBBB -> drop blue, narrow, left-align the 6 green bits.
    g = vshl_n_u8(vshrn_n_u16(pixels, 5), 2);
    // 3: RRRRRGGG|GGGBBBBB -> shift blue to the byte top; narrowing keeps the low byte.
    b = vmovn_u16(vshlq_n_u16(pixels, 3));
}
145
// De-interleaves 8 ARGB8888 pixels and scales the stored alpha by the
// extra opacity opa: a = bufAlpha * opa / 255.
static inline void LoadBufA_ARGB8888(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a, uint8_t opa)
{
    const uint8x8x4_t pixels = vld4_u8(buf);
    a = NeonMulDiv255(pixels.val[NEON_A], vdup_n_u8(opa));
    r = pixels.val[NEON_R];
    g = pixels.val[NEON_G];
    b = pixels.val[NEON_B];
}
154
// De-interleaves 8 RGB888 pixels; the format has no alpha of its own, so
// every lane of 'a' is filled with the caller-supplied opacity opa.
static inline void LoadBufA_RGB888(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a, uint8_t opa)
{
    const uint8x8x3_t pixels = vld3_u8(buf);
    a = vdup_n_u8(opa);
    r = pixels.val[NEON_R];
    g = pixels.val[NEON_G];
    b = pixels.val[NEON_B];
}
163
// Unpacks 8 RGB565 pixels (channels left-aligned into bytes, low bits zero)
// and fills every lane of 'a' with the caller-supplied opacity opa.
static inline void LoadBufA_RGB565(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a, uint8_t opa)
{
    const uint16x8_t pixels = vld1q_u16(reinterpret_cast<uint16_t*>(buf));
    a = vdup_n_u8(opa);
    // 11, 3: RRRRRGGG|GGGBBBBB -> isolate the 5 red bits, left-align in a byte.
    r = vmovn_u16(vshlq_n_u16(vshrq_n_u16(pixels, 11), 3));
    // 5, 2: RRRRRGGG|GGGBBBBB -> drop blue, narrow, left-align the 6 green bits.
    g = vshl_n_u8(vshrn_n_u16(pixels, 5), 2);
    // 3: RRRRRGGG|GGGBBBBB -> shift blue to the byte top; narrowing keeps the low byte.
    b = vmovn_u16(vshlq_n_u16(pixels, 3));
}
175
// Interleaves the four channel vectors back into 8 ARGB8888 pixels at buf.
static inline void StoreBuf_ARGB8888(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a)
{
    uint8x8x4_t pixels;
    pixels.val[NEON_A] = a;
    pixels.val[NEON_R] = r;
    pixels.val[NEON_G] = g;
    pixels.val[NEON_B] = b;
    vst4_u8(buf, pixels);
}
185
// Interleaves the three color vectors back into 8 RGB888 pixels at buf.
// The alpha argument is unused — the format stores no alpha.
static inline void StoreBuf_RGB888(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a)
{
    uint8x8x3_t pixels;
    pixels.val[NEON_R] = r;
    pixels.val[NEON_G] = g;
    pixels.val[NEON_B] = b;
    vst3_u8(buf, pixels);
}
194
// Packs the channel vectors (each left-aligned in a byte) into 8 RGB565
// pixels at buf. Each channel is widened into the top byte of a 16-bit lane
// and merged with shift-right-insert, which keeps the already-placed high
// bits and inserts the next channel below them.
static inline void StoreBuf_RGB565(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a)
{
    uint16x8_t packed = vshll_n_u8(r, 8);               // RRRRRXXX|XXXXXXXX
    // 5: keep the top 5 bits of packed, insert green below them.
    packed = vsriq_n_u16(packed, vshll_n_u8(g, 8), 5);  // RRRRRGGG|GGGXXXXX
    // 11: keep the top 11 bits of packed, insert blue below them.
    packed = vsriq_n_u16(packed, vshll_n_u8(b, 8), 11); // RRRRRGGG|GGGBBBBB
    vst1q_u16(reinterpret_cast<uint16_t*>(buf), packed);
}
211 }
212 #endif
213 #endif