• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * By downloading, copying, installing or using the software you agree to this license.
3  * If you do not agree to this license, do not download, install,
4  * copy or use the software.
5  *
6  *
7  *                           License Agreement
8  *                For Open Source Computer Vision Library
9  *                        (3-clause BSD License)
10  *
11  * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
12  * Third party copyrights are property of their respective owners.
13  *
14  * Redistribution and use in source and binary forms, with or without modification,
15  * are permitted provided that the following conditions are met:
16  *
17  *   * Redistributions of source code must retain the above copyright notice,
18  *     this list of conditions and the following disclaimer.
19  *
20  *   * Redistributions in binary form must reproduce the above copyright notice,
21  *     this list of conditions and the following disclaimer in the documentation
22  *     and/or other materials provided with the distribution.
23  *
24  *   * Neither the names of the copyright holders nor the names of the contributors
25  *     may be used to endorse or promote products derived from this software
26  *     without specific prior written permission.
27  *
28  * This software is provided by the copyright holders and contributors "as is" and
29  * any express or implied warranties, including, but not limited to, the implied
30  * warranties of merchantability and fitness for a particular purpose are disclaimed.
31  * In no event shall copyright holders or contributors be liable for any direct,
32  * indirect, incidental, special, exemplary, or consequential damages
33  * (including, but not limited to, procurement of substitute goods or services;
34  * loss of use, data, or profits; or business interruption) however caused
35  * and on any theory of liability, whether in contract, strict liability,
36  * or tort (including negligence or otherwise) arising in any way out of
37  * the use of this software, even if advised of the possibility of such damage.
38  */
39 
40 #ifndef CAROTENE_INTRINSICS_HPP
41 #define CAROTENE_INTRINSICS_HPP
42 
43 #include <carotene/definitions.hpp>
44 
45 #include <arm_neon.h>
46 
47 namespace CAROTENE_NS { namespace internal {
48 
49 /////////////// Custom NEON intrinsics ///////////////////
50 
51 // calculate reciprocal value
52 
// Per-lane reciprocal of a 4 x float32 vector.
// Starts from the vrecpeq_f32 estimate and applies two vrecpsq_f32
// (Newton-Raphson) refinement steps to it.
inline float32x4_t vrecpq_f32(float32x4_t val)
{
    const float32x4_t estimate = vrecpeq_f32(val);

    // each step multiplies the current approximation by the correction factor
    const float32x4_t refined = vmulq_f32(vrecpsq_f32(val, estimate), estimate);
    return vmulq_f32(vrecpsq_f32(val, refined), refined);
}
60 
// Per-lane reciprocal of a 2 x float32 vector.
// Starts from the vrecpe_f32 estimate and applies two vrecps_f32
// (Newton-Raphson) refinement steps to it.
inline float32x2_t vrecp_f32(float32x2_t val)
{
    const float32x2_t estimate = vrecpe_f32(val);

    // each step multiplies the current approximation by the correction factor
    const float32x2_t refined = vmul_f32(vrecps_f32(val, estimate), estimate);
    return vmul_f32(vrecps_f32(val, refined), refined);
}
68 
69 // calculate reciprocal sqrt and sqrt values
70 
// Per-lane reciprocal square root of a 4 x float32 vector.
// Starts from the vrsqrteq_f32 estimate and refines it twice; each step
// feeds the squared approximation and val into vrsqrtsq_f32 and scales the
// approximation by the returned factor.
inline float32x4_t vrsqrtq_f32(float32x4_t val)
{
    const float32x4_t x0 = vrsqrteq_f32(val);

    const float32x4_t x0_squared = vmulq_f32(x0, x0);
    const float32x4_t x1 = vmulq_f32(vrsqrtsq_f32(x0_squared, val), x0);

    const float32x4_t x1_squared = vmulq_f32(x1, x1);
    return vmulq_f32(vrsqrtsq_f32(x1_squared, val), x1);
}
78 
// Per-lane reciprocal square root of a 2 x float32 vector.
// Starts from the vrsqrte_f32 estimate and refines it twice; each step
// feeds the squared approximation and val into vrsqrts_f32 and scales the
// approximation by the returned factor.
inline float32x2_t vrsqrt_f32(float32x2_t val)
{
    const float32x2_t x0 = vrsqrte_f32(val);

    const float32x2_t x0_squared = vmul_f32(x0, x0);
    const float32x2_t x1 = vmul_f32(vrsqrts_f32(x0_squared, val), x0);

    const float32x2_t x1_squared = vmul_f32(x1, x1);
    return vmul_f32(vrsqrts_f32(x1_squared, val), x1);
}
86 
// Per-lane square root of a 4 x float32 vector, obtained as the reciprocal
// (vrecpq_f32) of the reciprocal square root (vrsqrtq_f32).
inline float32x4_t vsqrtq_f32(float32x4_t val)
{
    const float32x4_t inv_root = vrsqrtq_f32(val);
    return vrecpq_f32(inv_root);
}
91 
// Per-lane square root of a 2 x float32 vector, obtained as the reciprocal
// (vrecp_f32) of the reciprocal square root (vrsqrt_f32).
inline float32x2_t vsqrt_f32(float32x2_t val)
{
    const float32x2_t inv_root = vrsqrt_f32(val);
    return vrecp_f32(inv_root);
}
96 
97 // table lookup with the table in a 128-bit register
98 
// Byte table lookup: `a` is the 16-byte table held in one 128-bit register,
// `b` holds the eight lane indices; returns the eight selected bytes.
inline uint8x8_t vqtbl1_u8 (uint8x16_t a, uint8x8_t b)
{
#ifdef __aarch64__
    // AArch64 supports this natively
    return ::vqtbl1_u8(a, b);
#else
    // Split the 128-bit table into its low/high 64-bit halves explicitly.
    // Reinterpreting it through a union would read an inactive member,
    // which is undefined behaviour in C++.
    uint8x8x2_t table;
    table.val[0] = vget_low_u8(a);
    table.val[1] = vget_high_u8(a);
    return vtbl2_u8(table, b);
#endif
}
109 
110 } }
111 
112 #endif
113