• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2020-2021 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef GRAPHIC_LITE_GRAPHIC_NEON_UTILS_H
17 #define GRAPHIC_LITE_GRAPHIC_NEON_UTILS_H
18 
19 #include "graphic_config.h"
20 #ifdef ARM_NEON_OPT
21 #include <arm_neon.h>
22 #include "gfx_utils/color.h"
23 #include "gfx_utils/graphic_math.h"
24 #include "gfx_utils/graphic_types.h"
25 
namespace OHOS {
// Number of elements processed per NEON iteration at the given lane width.
#define NEON_STEP_4       4
#define NEON_STEP_8       8
#define NEON_STEP_32      32
// val[] indices of each colour channel after vld4_u8 de-interleaves an
// ARGB8888 buffer: blue is the first byte in memory, alpha the last.
#define NEON_A            3
#define NEON_R            2
#define NEON_G            1
#define NEON_B            0
// Per-lane approximation of vIn / 255, narrowed from 16 to 8 bits.
// Uses x / 255 ~= (x + ((x + 257) >> 8)) >> 8, which is exact for any
// product of two 8-bit values (the only inputs the callers produce).
static inline uint8x8_t NeonFastDiv255(uint16x8_t vIn)
{
    const uint16x8_t bias = vdupq_n_u16(257);                           // 257: 2^8 + 1
    uint16x8_t rounded = vaddq_u16(vIn, vshrq_n_u16(vaddq_u16(vIn, bias), 8)); // 8: shift count
    return vmovn_u16(vshrq_n_u16(rounded, 8));
}
41 
// Per-lane a * b / 255 (e.g. alpha-scaling a colour channel).
static inline uint8x8_t NeonMulDiv255(uint8x8_t a, uint8x8_t b)
{
    const uint16x8_t product = vmull_u8(a, b); // widening 8x8 -> 16-bit multiply
    return NeonFastDiv255(product);
}
47 
// Per-lane floating-point a / b.
// NEON has no float division instruction, so the quotient is computed as
// a * (1 / b) using a reciprocal estimate refined by Newton-Raphson.
static inline float32x4_t NeonDiv(float32x4_t a, float32x4_t b)
{
    float32x4_t recip = vrecpeq_f32(b); // initial estimate, ~8 bits of precision
    for (int step = 0; step < 2; ++step) { // 2: refinement iterations
        recip = vmulq_f32(vrecpsq_f32(b, recip), recip);
    }
    return vmulq_f32(a, recip);
}
57 
// Per-lane integer a / b via float reciprocal division.
// a holds 16-bit numerators, b 8-bit denominators; the 8-bit quotient is
// computed by widening both halves to float, dividing, and narrowing back.
static inline uint8x8_t NeonDivInt(uint16x8_t a, uint8x8_t b)
{
    const uint16x8_t wideB = vmovl_u8(b);
    float32x4_t numLow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(a)));
    float32x4_t numHigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(a)));
    float32x4_t denLow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(wideB)));
    float32x4_t denHigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(wideB)));
    uint16x4_t quotLow = vmovn_u32(vcvtq_u32_f32(NeonDiv(numLow, denLow)));
    uint16x4_t quotHigh = vmovn_u32(vcvtq_u32_f32(NeonDiv(numHigh, denHigh)));
    return vmovn_u16(vcombine_u16(quotLow, quotHigh));
}
68 
NeonMemcpy(void * dst,int32_t dstSize,const void * src,int32_t srcSize)69 static void NeonMemcpy(void* dst, int32_t dstSize, const void* src, int32_t srcSize)
70 {
71     int32_t sz = MATH_MIN(dstSize, srcSize);
72     // 64-bytes aligned
73     int32_t mod = sz % 64;
74     if (mod) {
75         if (memcpy_s(dst, mod, src, mod) != EOK) {
76             return;
77         }
78         sz -= mod;
79         if (sz == 0) {
80             return;
81         }
82         dst = (uint8_t*)dst + mod;
83         src = (uint8_t*)src + mod;
84     }
85 
86     asm volatile (
87     "NEONCopyPLD: \n"
88             " PLD [%[src], #0xC0] \n"
89             " VLDM %[src]!, {d0-d7} \n"
90             " VSTM %[dst]!, {d0-d7} \n"
91             " SUBS %[sz], %[sz], #0x40 \n"
92             " BGT NEONCopyPLD \n"
93     : [dst]"+r"(dst), [src]"+r"(src), [sz]"+r"(sz) : : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory");
94 }
95 
// In-place "source over" blend of 8 pixels with per-pixel alpha.
// (r1, g1, b1, a1) is the destination and receives the result;
// (r2, g2, b2, a2) is the source. Channels appear to be non-premultiplied,
// since the result is divided by the output alpha below.
static inline void NeonBlendRGBA(uint8x8_t& r1, uint8x8_t& g1, uint8x8_t& b1, uint8x8_t& a1,
                                 uint8x8_t r2, uint8x8_t g2, uint8x8_t b2, uint8x8_t a2)
{
    // Destination weight: a1 * (255 - a2) / 255.
    uint8x8_t da = NeonMulDiv255(a1, vdup_n_u8(OPA_OPAQUE) - a2);
    // Output alpha: a2 + a1 * (255 - a2) / 255, rearranged so every
    // intermediate stays within 8 bits.
    a1 = a1 - NeonMulDiv255(a2, a1) + a2;
    // Weighted channel sums in 16 bits: c2 * a2 + c1 * da.
    uint16x8_t r = vmull_u8(r2, a2) + vmull_u8(r1, da);
    uint16x8_t g = vmull_u8(g2, a2) + vmull_u8(g1, da);
    uint16x8_t b = vmull_u8(b2, a2) + vmull_u8(b1, da);
    // Un-premultiply by the output alpha.
    // NOTE(review): lanes where both alphas are 0 divide by zero inside
    // NeonDivInt (reciprocal estimate of 0) — the channel result for fully
    // transparent pixels is unspecified; confirm callers ignore such lanes.
    r1 = NeonDivInt(r, a1);
    g1 = NeonDivInt(g, a1);
    b1 = NeonDivInt(b, a1);
}
108 
// In-place blend of 8 pixels onto a destination whose alpha is ignored:
// each channel becomes c2 * a2 / 255 + c1 * (255 - a2) / 255.
// a1 is accepted only so all blend helpers share one signature; it is unused.
static inline void NeonBlendRGB(uint8x8_t& r1, uint8x8_t& g1, uint8x8_t& b1, uint8x8_t& a1,
                                uint8x8_t r2, uint8x8_t g2, uint8x8_t b2, uint8x8_t a2)
{
    const uint8x8_t dstWeight = vdup_n_u8(OPA_OPAQUE) - a2;
    b1 = NeonMulDiv255(b2, a2) + NeonMulDiv255(b1, dstWeight);
    g1 = NeonMulDiv255(g2, a2) + NeonMulDiv255(g1, dstWeight);
    r1 = NeonMulDiv255(r2, a2) + NeonMulDiv255(r1, dstWeight);
}
117 
// De-interleave 8 ARGB8888 pixels from buf into per-channel vectors.
static inline void LoadBuf_ARGB8888(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a)
{
    const uint8x8x4_t pixels = vld4_u8(buf);
    a = pixels.val[NEON_A];
    r = pixels.val[NEON_R];
    g = pixels.val[NEON_G];
    b = pixels.val[NEON_B];
}
126 
// De-interleave 8 RGB888 pixels from buf into per-channel vectors.
// a is deliberately left untouched — presumably the caller has already
// initialized it (there is no alpha in the source format); confirm at call sites.
static inline void LoadBuf_RGB888(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a)
{
    const uint8x8x3_t pixels = vld3_u8(buf);
    b = pixels.val[NEON_B];
    g = pixels.val[NEON_G];
    r = pixels.val[NEON_R];
}
134 
// Unpack 8 RGB565 pixels (RRRRRGGG|GGGBBBBB per 16-bit word) into 8-bit
// channel vectors, with each channel's bits left-aligned in the byte.
// a is deliberately left untouched (the format carries no alpha).
static inline void LoadBuf_RGB565(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a)
{
    const uint16x8_t pixels = vld1q_u16(reinterpret_cast<uint16_t*>(buf));
    // 11, 3: isolate the top 5 bits, then left-align them -> RRRRR000.
    r = vmovn_u16(vshlq_n_u16(vshrq_n_u16(pixels, 11), 3));
    // 5, 2: narrow away blue, then left-align the 6 green bits -> GGGGGG00.
    g = vshl_n_u8(vshrn_n_u16(pixels, 5), 2);
    // 3: shift blue to the top of the low byte; the narrow keeps it -> BBBBB000.
    b = vmovn_u16(vshlq_n_u16(pixels, 3));
}
145 
// De-interleave 8 ARGB8888 pixels and pre-scale their alpha by the global
// opacity opa: a = pixelAlpha * opa / 255.
static inline void LoadBufA_ARGB8888(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a, uint8_t opa)
{
    const uint8x8x4_t pixels = vld4_u8(buf);
    b = pixels.val[NEON_B];
    g = pixels.val[NEON_G];
    r = pixels.val[NEON_R];
    a = NeonMulDiv255(pixels.val[NEON_A], vdup_n_u8(opa));
}
154 
// De-interleave 8 RGB888 pixels; the format has no alpha, so every lane of
// a is set to the global opacity opa.
static inline void LoadBufA_RGB888(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a, uint8_t opa)
{
    const uint8x8x3_t pixels = vld3_u8(buf);
    b = pixels.val[NEON_B];
    g = pixels.val[NEON_G];
    r = pixels.val[NEON_R];
    a = vdup_n_u8(opa);
}
163 
// Unpack 8 RGB565 pixels (RRRRRGGG|GGGBBBBB per word) into left-aligned
// 8-bit channels; the format has no alpha, so a is filled with opa.
static inline void LoadBufA_RGB565(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a, uint8_t opa)
{
    const uint16x8_t pixels = vld1q_u16(reinterpret_cast<uint16_t*>(buf));
    a = vdup_n_u8(opa);
    // 11, 3: isolate the top 5 bits, then left-align them -> RRRRR000.
    r = vmovn_u16(vshlq_n_u16(vshrq_n_u16(pixels, 11), 3));
    // 5, 2: narrow away blue, then left-align the 6 green bits -> GGGGGG00.
    g = vshl_n_u8(vshrn_n_u16(pixels, 5), 2);
    // 3: shift blue to the top of the low byte; the narrow keeps it -> BBBBB000.
    b = vmovn_u16(vshlq_n_u16(pixels, 3));
}
175 
// Interleave the four channel vectors and store 8 ARGB8888 pixels to buf.
static inline void StoreBuf_ARGB8888(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a)
{
    uint8x8x4_t pixels;
    pixels.val[NEON_A] = a;
    pixels.val[NEON_R] = r;
    pixels.val[NEON_G] = g;
    pixels.val[NEON_B] = b;
    vst4_u8(buf, pixels);
}
185 
// Interleave the three colour vectors and store 8 RGB888 pixels to buf
// (alpha is dropped — the format has none).
static inline void StoreBuf_RGB888(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a)
{
    uint8x8x3_t pixels;
    pixels.val[NEON_B] = b;
    pixels.val[NEON_G] = g;
    pixels.val[NEON_R] = r;
    vst3_u8(buf, pixels);
}
194 
// Pack the channel vectors into 8 RGB565 pixels (RRRRRGGG|GGGBBBBB) and
// store them to buf. Each channel arrives left-aligned in its byte; the
// widen-then-shift-right-insert sequence keeps only the top bits of each.
static inline void StoreBuf_RGB565(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a)
{
    uint16x8_t pixels = vshll_n_u8(r, 8);              // 8: RRRRRXXX|XXXXXXXX
    pixels = vsriq_n_u16(pixels, vshll_n_u8(g, 8), 5); // 5: keep top 5 bits -> RRRRRGGG|GGGXXXXX
    pixels = vsriq_n_u16(pixels, vshll_n_u8(b, 8), 11); // 11: keep top 11 bits -> RRRRRGGG|GGGBBBBB
    vst1q_u16(reinterpret_cast<uint16_t*>(buf), pixels);
}
211 }
212 #endif
213 #endif