1 /*
2 * By downloading, copying, installing or using the software you agree to this license.
3 * If you do not agree to this license, do not download, install,
4 * copy or use the software.
5 *
6 *
7 * License Agreement
8 * For Open Source Computer Vision Library
9 * (3-clause BSD License)
10 *
11 * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
12 * Third party copyrights are property of their respective owners.
13 *
14 * Redistribution and use in source and binary forms, with or without modification,
15 * are permitted provided that the following conditions are met:
16 *
17 * * Redistributions of source code must retain the above copyright notice,
18 * this list of conditions and the following disclaimer.
19 *
20 * * Redistributions in binary form must reproduce the above copyright notice,
21 * this list of conditions and the following disclaimer in the documentation
22 * and/or other materials provided with the distribution.
23 *
24 * * Neither the names of the copyright holders nor the names of the contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * This software is provided by the copyright holders and contributors "as is" and
29 * any express or implied warranties, including, but not limited to, the implied
30 * warranties of merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall copyright holders or contributors be liable for any direct,
32 * indirect, incidental, special, exemplary, or consequential damages
33 * (including, but not limited to, procurement of substitute goods or services;
34 * loss of use, data, or profits; or business interruption) however caused
35 * and on any theory of liability, whether in contract, strict liability,
36 * or tort (including negligence or otherwise) arising in any way out of
37 * the use of this software, even if advised of the possibility of such damage.
38 */
39
40 #include "common.hpp"
41 #include "vtransform.hpp"
42
43 namespace CAROTENE_NS {
44
45 #ifdef CAROTENE_NEON
46
47 namespace {
48
vnst(u8 * dst,uint8x16_t v1,uint8x16_t v2)49 inline void vnst(u8* dst, uint8x16_t v1, uint8x16_t v2) { vst1q_u8(dst, v1); vst1q_u8(dst+16, v2); }
vnst(u8 * dst,uint16x8_t v1,uint16x8_t v2)50 inline void vnst(u8* dst, uint16x8_t v1, uint16x8_t v2) { vst1q_u8(dst, vcombine_u8(vmovn_u16(v1), vmovn_u16(v2))); }
vnst(u8 * dst,uint32x4_t v1,uint32x4_t v2)51 inline void vnst(u8* dst, uint32x4_t v1, uint32x4_t v2) { vst1_u8(dst, vmovn_u16(vcombine_u16(vmovn_u32(v1), vmovn_u32(v2)))); }
52
53 template <typename Op, int elsize> struct vtail
54 {
compareCAROTENE_NS::__anon819574a80111::vtail55 static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
56 u8 * dst, const Op & op,
57 size_t &x, size_t width)
58 {
59 //do nothing since there couldn't be enough data
60 (void)src0;
61 (void)src1;
62 (void)dst;
63 (void)op;
64 (void)x;
65 (void)width;
66 }
67 };
68 template <typename Op> struct vtail<Op, 2>
69 {
compareCAROTENE_NS::__anon819574a80111::vtail70 static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
71 u8 * dst, const Op & op,
72 size_t &x, size_t width)
73 {
74 typedef typename Op::type type;
75 typedef typename internal::VecTraits<type>::vec128 vec128;
76 typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;
77 //There no more than 15 elements in the tail, so we could handle 8 element vector only once
78 if( x + 8 < width)
79 {
80 vec128 v_src0, v_src1;
81 uvec128 v_dst;
82
83 v_src0 = internal::vld1q(src0 + x);
84 v_src1 = internal::vld1q(src1 + x);
85 op(v_src0, v_src1, v_dst);
86 internal::vst1(dst + x, internal::vmovn(v_dst));
87 x+=8;
88 }
89 }
90 };
91 template <typename Op> struct vtail<Op, 1>
92 {
compareCAROTENE_NS::__anon819574a80111::vtail93 static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
94 u8 * dst, const Op & op,
95 size_t &x, size_t width)
96 {
97 typedef typename Op::type type;
98 typedef typename internal::VecTraits<type>::vec128 vec128;
99 typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;
100 typedef typename internal::VecTraits<type>::vec64 vec64;
101 typedef typename internal::VecTraits<type>::unsign::vec64 uvec64;
102 //There no more than 31 elements in the tail, so we could handle once 16+8 or 16 or 8 elements
103 if( x + 16 < width)
104 {
105 vec128 v_src0, v_src1;
106 uvec128 v_dst;
107
108 v_src0 = internal::vld1q(src0 + x);
109 v_src1 = internal::vld1q(src1 + x);
110 op(v_src0, v_src1, v_dst);
111 internal::vst1q(dst + x, v_dst);
112 x+=16;
113 }
114 if( x + 8 < width)
115 {
116 vec64 v_src0, v_src1;
117 uvec64 v_dst;
118
119 v_src0 = internal::vld1(src0 + x);
120 v_src1 = internal::vld1(src1 + x);
121 op(v_src0, v_src1, v_dst);
122 internal::vst1(dst + x, v_dst);
123 x+=8;
124 }
125 }
126 };
127
128 template <typename Op>
vcompare(Size2D size,const typename Op::type * src0Base,ptrdiff_t src0Stride,const typename Op::type * src1Base,ptrdiff_t src1Stride,u8 * dstBase,ptrdiff_t dstStride,const Op & op)129 void vcompare(Size2D size,
130 const typename Op::type * src0Base, ptrdiff_t src0Stride,
131 const typename Op::type * src1Base, ptrdiff_t src1Stride,
132 u8 * dstBase, ptrdiff_t dstStride, const Op & op)
133 {
134 typedef typename Op::type type;
135 typedef typename internal::VecTraits<type>::vec128 vec128;
136 typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;
137
138 if (src0Stride == src1Stride && src0Stride == dstStride &&
139 src0Stride == (ptrdiff_t)(size.width * sizeof(type)))
140 {
141 size.width *= size.height;
142 size.height = 1;
143 }
144
145 const u32 step_base = 32 / sizeof(type);
146 size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
147
148 for (size_t y = 0; y < size.height; ++y)
149 {
150 const type * src0 = internal::getRowPtr(src0Base, src0Stride, y);
151 const type * src1 = internal::getRowPtr(src1Base, src1Stride, y);
152 u8 * dst = internal::getRowPtr(dstBase, dstStride, y);
153 size_t x = 0;
154
155 for( ; x < roiw_base; x += step_base )
156 {
157 internal::prefetch(src0 + x);
158 internal::prefetch(src1 + x);
159
160 vec128 v_src00 = internal::vld1q(src0 + x), v_src01 = internal::vld1q(src0 + x + 16 / sizeof(type));
161 vec128 v_src10 = internal::vld1q(src1 + x), v_src11 = internal::vld1q(src1 + x + 16 / sizeof(type));
162 uvec128 v_dst0;
163 uvec128 v_dst1;
164
165 op(v_src00, v_src10, v_dst0);
166 op(v_src01, v_src11, v_dst1);
167
168 vnst(dst + x, v_dst0, v_dst1);
169 }
170
171 vtail<Op, sizeof(type)>::compare(src0, src1, dst, op, x, size.width);
172
173 for (; x < size.width; ++x)
174 {
175 op(src0 + x, src1 + x, dst + x);
176 }
177 }
178 }
179
180 template<typename T>
181 struct OpCmpEQ
182 {
183 typedef T type;
184
operator ()CAROTENE_NS::__anon819574a80111::OpCmpEQ185 void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
186 typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
187 {
188 v_dst = internal::vceqq(v_src0, v_src1);
189 }
190
operator ()CAROTENE_NS::__anon819574a80111::OpCmpEQ191 void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
192 typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
193 {
194 v_dst = internal::vceq(v_src0, v_src1);
195 }
196
operator ()CAROTENE_NS::__anon819574a80111::OpCmpEQ197 void operator() (const T * src0, const T * src1, u8 * dst) const
198 {
199 dst[0] = src0[0] == src1[0] ? 255 : 0;
200 }
201 };
202
203 template<typename T>
204 struct OpCmpNE
205 {
206 typedef T type;
207
operator ()CAROTENE_NS::__anon819574a80111::OpCmpNE208 void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
209 typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
210 {
211 v_dst = internal::vmvnq(internal::vceqq(v_src0, v_src1));
212 }
213
operator ()CAROTENE_NS::__anon819574a80111::OpCmpNE214 void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
215 typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
216 {
217 v_dst = internal::vmvn(internal::vceq(v_src0, v_src1));
218 }
219
operator ()CAROTENE_NS::__anon819574a80111::OpCmpNE220 void operator() (const T * src0, const T * src1, u8 * dst) const
221 {
222 dst[0] = src0[0] == src1[0] ? 0 : 255;
223 }
224 };
225
226 template<typename T>
227 struct OpCmpGT
228 {
229 typedef T type;
230
operator ()CAROTENE_NS::__anon819574a80111::OpCmpGT231 void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
232 typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
233 {
234 v_dst = internal::vcgtq(v_src0, v_src1);
235 }
236
operator ()CAROTENE_NS::__anon819574a80111::OpCmpGT237 void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
238 typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
239 {
240 v_dst = internal::vcgt(v_src0, v_src1);
241 }
242
operator ()CAROTENE_NS::__anon819574a80111::OpCmpGT243 void operator() (const T * src0, const T * src1, u8 * dst) const
244 {
245 dst[0] = src0[0] > src1[0] ? 255 : 0;
246 }
247 };
248
249 template<typename T>
250 struct OpCmpGE
251 {
252 typedef T type;
253
operator ()CAROTENE_NS::__anon819574a80111::OpCmpGE254 void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
255 typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
256 {
257 v_dst = internal::vcgeq(v_src0, v_src1);
258 }
259
operator ()CAROTENE_NS::__anon819574a80111::OpCmpGE260 void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
261 typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
262 {
263 v_dst = internal::vcge(v_src0, v_src1);
264 }
265
operator ()CAROTENE_NS::__anon819574a80111::OpCmpGE266 void operator() (const T * src0, const T * src1, u8 * dst) const
267 {
268 dst[0] = src0[0] >= src1[0] ? 255 : 0;
269 }
270 };
271
272 }
273
// NEON build: defines the function cmp<op>() for one element type.
// The generated function calls internal::assertSupportedConfiguration() and then
// runs vcompare over the given buffers with an OpCmp<op><type> functor.
#define IMPL_CMPOP(op, type) \
void cmp##op(const Size2D &size, \
             const type * src0Base, ptrdiff_t src0Stride, \
             const type * src1Base, ptrdiff_t src1Stride, \
             u8 *dstBase, ptrdiff_t dstStride) \
{ \
    internal::assertSupportedConfiguration(); \
    OpCmp##op<type> cmpFunctor; \
    vcompare(size, \
             src0Base, src0Stride, \
             src1Base, src1Stride, \
             dstBase, dstStride, \
             cmpFunctor); \
}
287
288 #else
289
// Build without NEON: defines cmp<op>() for one element type as a stub.
// The generated function only calls internal::assertSupportedConfiguration();
// none of its arguments is used, hence the unnamed parameters.
#define IMPL_CMPOP(op, type) \
void cmp##op(const Size2D &, \
             const type *, ptrdiff_t, \
             const type *, ptrdiff_t, \
             u8 *, ptrdiff_t) \
{ \
    internal::assertSupportedConfiguration(); \
}
305
306 #endif
307
308 IMPL_CMPOP(EQ, u8)
309 IMPL_CMPOP(EQ, s8)
310 IMPL_CMPOP(EQ, u16)
311 IMPL_CMPOP(EQ, s16)
312 IMPL_CMPOP(EQ, u32)
313 IMPL_CMPOP(EQ, s32)
314 IMPL_CMPOP(EQ, f32)
315
316 IMPL_CMPOP(NE, u8)
317 IMPL_CMPOP(NE, s8)
318 IMPL_CMPOP(NE, u16)
319 IMPL_CMPOP(NE, s16)
320 IMPL_CMPOP(NE, u32)
321 IMPL_CMPOP(NE, s32)
322 IMPL_CMPOP(NE, f32)
323
324 IMPL_CMPOP(GT, u8)
325 IMPL_CMPOP(GT, s8)
326 IMPL_CMPOP(GT, u16)
327 IMPL_CMPOP(GT, s16)
328 IMPL_CMPOP(GT, u32)
329 IMPL_CMPOP(GT, s32)
330 IMPL_CMPOP(GT, f32)
331
332 IMPL_CMPOP(GE, u8)
333 IMPL_CMPOP(GE, s8)
334 IMPL_CMPOP(GE, u16)
335 IMPL_CMPOP(GE, s16)
336 IMPL_CMPOP(GE, u32)
337 IMPL_CMPOP(GE, s32)
338 IMPL_CMPOP(GE, f32)
339
340 } // namespace CAROTENE_NS
341