1 /*
2 * By downloading, copying, installing or using the software you agree to this license.
3 * If you do not agree to this license, do not download, install,
4 * copy or use the software.
5 *
6 *
7 * License Agreement
8 * For Open Source Computer Vision Library
9 * (3-clause BSD License)
10 *
11 * Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved.
12 * Third party copyrights are property of their respective owners.
13 *
14 * Redistribution and use in source and binary forms, with or without modification,
15 * are permitted provided that the following conditions are met:
16 *
17 * * Redistributions of source code must retain the above copyright notice,
18 * this list of conditions and the following disclaimer.
19 *
20 * * Redistributions in binary form must reproduce the above copyright notice,
21 * this list of conditions and the following disclaimer in the documentation
22 * and/or other materials provided with the distribution.
23 *
24 * * Neither the names of the copyright holders nor the names of the contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * This software is provided by the copyright holders and contributors "as is" and
29 * any express or implied warranties, including, but not limited to, the implied
30 * warranties of merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall copyright holders or contributors be liable for any direct,
32 * indirect, incidental, special, exemplary, or consequential damages
33 * (including, but not limited to, procurement of substitute goods or services;
34 * loss of use, data, or profits; or business interruption) however caused
35 * and on any theory of liability, whether in contract, strict liability,
36 * or tort (including negligence or otherwise) arising in any way out of
37 * the use of this software, even if advised of the possibility of such damage.
38 */
39
40 #include "common.hpp"
41
42 namespace CAROTENE_NS {
43
integral(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,u32 * sumBase,ptrdiff_t sumStride)44 void integral(const Size2D &size,
45 const u8 * srcBase, ptrdiff_t srcStride,
46 u32 * sumBase, ptrdiff_t sumStride)
47 {
48 internal::assertSupportedConfiguration();
49 #ifdef CAROTENE_NEON
50 uint32x4_t v_zero = vmovq_n_u32(0u);
51
52 // the first iteration
53 const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
54 u32 * sum = internal::getRowPtr(sumBase, sumStride, 0);
55
56 uint32x4_t prev = v_zero;
57 size_t j = 0u;
58
59 for ( ; j + 7 < size.width; j += 8)
60 {
61 internal::prefetch(sum + j);
62 internal::prefetch(src + j);
63
64 uint8x8_t el8shr0 = vld1_u8(src + j);
65 uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
66 uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
67 uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));
68
69 uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
70 uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);
71
72 uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
73 uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));
74
75 uint32x4_t vsuml = vaddw_u16(prev, vget_low_u16(el8));
76 uint32x4_t vsumh = vaddw_u16(prev, el4h);
77
78 vst1q_u32(sum + j, vsuml);
79 vst1q_u32(sum + j + 4, vsumh);
80
81 prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
82 }
83
84 for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
85 sum[j] = (v += src[j]);
86
87 // the others
88 for (size_t i = 1; i < size.height ; ++i)
89 {
90 src = internal::getRowPtr(srcBase, srcStride, i);
91 u32 * prevSum = internal::getRowPtr(sumBase, sumStride, i - 1);
92 sum = internal::getRowPtr(sumBase, sumStride, i);
93
94 prev = v_zero;
95 j = 0u;
96
97 for ( ; j + 7 < size.width; j += 8)
98 {
99 internal::prefetch(sum + j);
100 internal::prefetch(src + j);
101
102 uint32x4_t vsuml = vld1q_u32(prevSum + j);
103 uint32x4_t vsumh = vld1q_u32(prevSum + j + 4);
104
105 uint8x8_t el8shr0 = vld1_u8(src + j);
106 uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
107 uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
108 uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));
109
110 vsuml = vaddq_u32(vsuml, prev);
111 vsumh = vaddq_u32(vsumh, prev);
112
113 uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
114 uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);
115
116 uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
117 uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));
118
119 vsuml = vaddw_u16(vsuml, vget_low_u16(el8));
120 vsumh = vaddw_u16(vsumh, el4h);
121
122 vst1q_u32(sum + j, vsuml);
123 vst1q_u32(sum + j + 4, vsumh);
124
125 prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
126 }
127
128 for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
129 sum[j] = (v += src[j]) + prevSum[j];
130 }
131 #else
132 (void)size;
133 (void)srcBase;
134 (void)srcStride;
135 (void)sumBase;
136 (void)sumStride;
137 #endif
138 }
139
sqrIntegral(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,f64 * sqsumBase,ptrdiff_t sqsumStride)140 void sqrIntegral(const Size2D &size,
141 const u8 * srcBase, ptrdiff_t srcStride,
142 f64 * sqsumBase, ptrdiff_t sqsumStride)
143 {
144 internal::assertSupportedConfiguration();
145 #ifdef CAROTENE_NEON
146 uint16x8_t v_zero8 = vmovq_n_u16(0u);
147
148 // the first iteration
149 const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
150 f64 * sqsum = internal::getRowPtr(sqsumBase, sqsumStride, 0);
151
152 double prev = 0.;
153 size_t j = 0u;
154
155 for ( ; j + 7 < size.width; j += 8)
156 {
157 internal::prefetch(sqsum + j);
158 internal::prefetch(src + j);
159
160 uint8x8_t vsrc = vld1_u8(src + j);
161
162 uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
163 uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);
164
165 uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
166 uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));
167
168 uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);
169
170 uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
171 uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
172 uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));
173
174 u32 buf[8];
175 vst1_u32(buf, vget_low_u32(el8shr01l));
176 vst1_u32(buf+2, el2l);
177 vst1_u32(buf+4, el2hl);
178 vst1_u32(buf+6, el2hh);
179 for(u32 k=0; k < 8; k++)
180 sqsum[j+k] = prev + buf[k];
181 prev += buf[7];
182 }
183
184 for (; j < size.width; ++j)
185 sqsum[j] = (prev += src[j]*src[j]);
186
187 // the others
188 for (size_t i = 1; i < size.height ; ++i)
189 {
190 src = internal::getRowPtr(srcBase, srcStride, i);
191 f64 * prevSqSum = internal::getRowPtr(sqsumBase, sqsumStride, i - 1);
192 sqsum = internal::getRowPtr(sqsumBase, sqsumStride, i);
193
194 prev = 0.;
195 j = 0u;
196
197 for ( ; j + 7 < size.width; j += 8)
198 {
199 internal::prefetch(sqsum + j);
200 internal::prefetch(src + j);
201
202 uint8x8_t vsrc = vld1_u8(src + j);
203
204 uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
205 uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);
206
207 uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
208 uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));
209
210 uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);
211
212 uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
213 uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
214 uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));
215
216 u32 buf[8];
217 vst1_u32(buf, vget_low_u32(el8shr01l));
218 vst1_u32(buf+2, el2l);
219 vst1_u32(buf+4, el2hl);
220 vst1_u32(buf+6, el2hh);
221 for(u32 k=0; k < 8; k++)
222 sqsum[j+k] = prev + prevSqSum[j+k] + buf[k];
223 prev += buf[7];
224 }
225
226 for (; j < size.width; ++j)
227 sqsum[j] = (prev += src[j]*src[j]) + prevSqSum[j];
228 }
229 #else
230 (void)size;
231 (void)srcBase;
232 (void)srcStride;
233 (void)sqsumBase;
234 (void)sqsumStride;
235 #endif
236 }
237
238 } // namespace CAROTENE_NS
239