• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */
39 
40 #include "common.hpp"
41 
42 namespace CAROTENE_NS {
43 
integral(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,u32 * sumBase,ptrdiff_t sumStride)44 void integral(const Size2D &size,
45               const u8 * srcBase, ptrdiff_t srcStride,
46               u32 * sumBase, ptrdiff_t sumStride)
47 {
48     internal::assertSupportedConfiguration();
49 #ifdef CAROTENE_NEON
50     uint32x4_t v_zero = vmovq_n_u32(0u);
51 
52     // the first iteration
53     const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
54     u32 * sum = internal::getRowPtr(sumBase, sumStride, 0);
55 
56     uint32x4_t prev = v_zero;
57     size_t j = 0u;
58 
59     for ( ; j + 7 < size.width; j += 8)
60     {
61         internal::prefetch(sum + j);
62         internal::prefetch(src + j);
63 
64         uint8x8_t el8shr0 = vld1_u8(src + j);
65         uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
66         uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
67         uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));
68 
69         uint16x8_t el8shr12 =  vaddl_u8(el8shr1, el8shr2);
70         uint16x8_t el8shr03 =  vaddl_u8(el8shr0, el8shr3);
71 
72         uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
73         uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));
74 
75         uint32x4_t vsuml = vaddw_u16(prev, vget_low_u16(el8));
76         uint32x4_t vsumh = vaddw_u16(prev, el4h);
77 
78         vst1q_u32(sum + j, vsuml);
79         vst1q_u32(sum + j + 4, vsumh);
80 
81         prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
82     }
83 
84     for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
85         sum[j] = (v += src[j]);
86 
87     // the others
88     for (size_t i = 1; i < size.height ; ++i)
89     {
90         src = internal::getRowPtr(srcBase, srcStride, i);
91         u32 * prevSum = internal::getRowPtr(sumBase, sumStride, i - 1);
92         sum = internal::getRowPtr(sumBase, sumStride, i);
93 
94         prev = v_zero;
95         j = 0u;
96 
97         for ( ; j + 7 < size.width; j += 8)
98         {
99             internal::prefetch(sum + j);
100             internal::prefetch(src + j);
101 
102             uint32x4_t vsuml = vld1q_u32(prevSum + j);
103             uint32x4_t vsumh = vld1q_u32(prevSum + j + 4);
104 
105             uint8x8_t el8shr0 = vld1_u8(src + j);
106             uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
107             uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
108             uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));
109 
110             vsuml = vaddq_u32(vsuml, prev);
111             vsumh = vaddq_u32(vsumh, prev);
112 
113             uint16x8_t el8shr12 =  vaddl_u8(el8shr1, el8shr2);
114             uint16x8_t el8shr03 =  vaddl_u8(el8shr0, el8shr3);
115 
116             uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
117             uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));
118 
119             vsuml = vaddw_u16(vsuml, vget_low_u16(el8));
120             vsumh = vaddw_u16(vsumh, el4h);
121 
122             vst1q_u32(sum + j, vsuml);
123             vst1q_u32(sum + j + 4, vsumh);
124 
125             prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
126         }
127 
128         for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
129             sum[j] = (v += src[j]) + prevSum[j];
130     }
131 #else
132     (void)size;
133     (void)srcBase;
134     (void)srcStride;
135     (void)sumBase;
136     (void)sumStride;
137 #endif
138 }
139 
sqrIntegral(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,f64 * sqsumBase,ptrdiff_t sqsumStride)140 void sqrIntegral(const Size2D &size,
141                  const u8 * srcBase, ptrdiff_t srcStride,
142                  f64 * sqsumBase, ptrdiff_t sqsumStride)
143 {
144     internal::assertSupportedConfiguration();
145 #ifdef CAROTENE_NEON
146     uint16x8_t v_zero8 = vmovq_n_u16(0u);
147 
148     // the first iteration
149     const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
150     f64 * sqsum = internal::getRowPtr(sqsumBase, sqsumStride, 0);
151 
152     double prev = 0.;
153     size_t j = 0u;
154 
155     for ( ; j + 7 < size.width; j += 8)
156     {
157         internal::prefetch(sqsum + j);
158         internal::prefetch(src + j);
159 
160         uint8x8_t vsrc = vld1_u8(src + j);
161 
162         uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
163         uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);
164 
165         uint32x4_t el8shr01l =  vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
166         uint32x4_t el8shr01h =  vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));
167 
168         uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);
169 
170         uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
171         uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
172         uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));
173 
174         u32 buf[8];
175         vst1_u32(buf, vget_low_u32(el8shr01l));
176         vst1_u32(buf+2, el2l);
177         vst1_u32(buf+4, el2hl);
178         vst1_u32(buf+6, el2hh);
179         for(u32 k=0; k < 8; k++)
180             sqsum[j+k] = prev + buf[k];
181         prev += buf[7];
182     }
183 
184     for (; j < size.width; ++j)
185         sqsum[j] = (prev += src[j]*src[j]);
186 
187     // the others
188     for (size_t i = 1; i < size.height ; ++i)
189     {
190         src = internal::getRowPtr(srcBase, srcStride, i);
191         f64 * prevSqSum = internal::getRowPtr(sqsumBase, sqsumStride, i - 1);
192         sqsum = internal::getRowPtr(sqsumBase, sqsumStride, i);
193 
194         prev = 0.;
195         j = 0u;
196 
197         for ( ; j + 7 < size.width; j += 8)
198         {
199             internal::prefetch(sqsum + j);
200             internal::prefetch(src + j);
201 
202             uint8x8_t vsrc = vld1_u8(src + j);
203 
204             uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
205             uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);
206 
207             uint32x4_t el8shr01l =  vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
208             uint32x4_t el8shr01h =  vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));
209 
210             uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);
211 
212             uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
213             uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
214             uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));
215 
216             u32 buf[8];
217             vst1_u32(buf, vget_low_u32(el8shr01l));
218             vst1_u32(buf+2, el2l);
219             vst1_u32(buf+4, el2hl);
220             vst1_u32(buf+6, el2hh);
221             for(u32 k=0; k < 8; k++)
222                 sqsum[j+k] = prev + prevSqSum[j+k] + buf[k];
223             prev += buf[7];
224         }
225 
226         for (; j < size.width; ++j)
227             sqsum[j] = (prev += src[j]*src[j]) + prevSqSum[j];
228     }
229 #else
230     (void)size;
231     (void)srcBase;
232     (void)srcStride;
233     (void)sqsumBase;
234     (void)sqsumStride;
235 #endif
236 }
237 
238 } // namespace CAROTENE_NS
239