/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */

#include "common.hpp"

namespace CAROTENE_NS {

#ifdef CAROTENE_NEON

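/*
 * CVT_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) generates one convert()
 * overload from element type T1 to element type T2. For example, the
 * CVT_FUNC(u8, s8, 16, ...) instantiation below produces
 * convert(const Size2D&, const u8*, ptrdiff_t, s8*, ptrdiff_t).
 *
 *  - If source and destination strides both equal the row width, the whole
 *    image is processed as a single long row.
 *  - SIMD_SIZE is the number of elements handled per vectorized iteration;
 *    the row width is rounded down to a multiple of it for the SIMD loop
 *    (w), and the remaining tail elements go through internal::saturate_cast.
 *  - CVTINIT declares any constants needed by the row loop; CVTROW is the
 *    per-row SIMD loop body and sees _src, _dst and w.
 *
 * Without CAROTENE_NEON the macro expands to a stub that only asserts that
 * a supported configuration is present.
 */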
#define CVT_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \
void convert(const Size2D &_size, \
             const T1 * srcBase, ptrdiff_t srcStride, \
             T2 * dstBase, ptrdiff_t dstStride) \
{ \
    internal::assertSupportedConfiguration(); \
    Size2D size(_size); \
    if (srcStride == dstStride && \
        srcStride == (ptrdiff_t)(size.width)) \
    { \
        size.width *= size.height; \
        size.height = 1; \
    } \
    const ptrdiff_t sstep = srcStride / sizeof(T1); \
    const ptrdiff_t dstep = dstStride / sizeof(T2); \
    const size_t w = size.width & ~(SIMD_SIZE-1); \
    if (size.width >= SIMD_SIZE) \
    { \
        const T1* _src = srcBase; \
        T2* _dst = dstBase; \
        CVTINIT \
        for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \
            CVTROW \
    } \
    if(w < size.width) \
    { \
        const T1* _src = srcBase; \
        T2* _dst = dstBase; \
        for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \
            for(size_t i = w; i < size.width; i++ ) \
                _dst[i] = internal::saturate_cast<T2>(_src[i]); \
    } \
}

#else

#define CVT_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \
void convert(const Size2D &, \
             const T1 *, ptrdiff_t, \
             T2 *, ptrdiff_t) \
{ \
    internal::assertSupportedConfiguration(); \
}

#endif

CVT_FUNC(u8, s8, 16,
    uint8x16_t v127 = vdupq_n_u8(127);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        uint8x16_t vu8 = vld1q_u8(_src + i);
        int8x16_t vu1 = vreinterpretq_s8_u8(vminq_u8(vu8, v127));
        vst1q_s8(_dst + i, vu1);
    }
})

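/*
 * The preprocessor checks below select hand-written inline assembly on
 * 32-bit ARM with old GCC 4.x (pre-4.6/4.7, non-clang builds); presumably
 * this works around weaker code generation for the corresponding intrinsics
 * in those compilers. All other configurations take the #else branches,
 * which use plain NEON intrinsics.
 */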
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVT_FUNC(u8, u16, 16,
    register uint8x16_t zero0 asm ("q1") = vmovq_n_u8(0);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d0-d1}, [%[src]] \n\t"
            "vst2.8 {d0,d2}, [%[dst1]] \n\t"
            "vst2.8 {d1,d3}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 8),
              "w" (zero0)
            : "d0","d1"
        );
    }
})
#else
CVT_FUNC(u8, u16, 16,
    uint8x16x2_t vline;
    vline.val[1] = vmovq_n_u8(0);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        vline.val[0] = vld1q_u8(_src + i);
        vst2q_u8((uint8_t*)(_dst + i), vline);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVT_FUNC(u8, s32, 16,
    register uint8x16_t zero0 asm ("q1") = vmovq_n_u8(0);
    register uint8x16_t zero1 asm ("q2") = vmovq_n_u8(0);
    register uint8x16_t zero2 asm ("q3") = vmovq_n_u8(0);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d0-d1}, [%[src]] \n\t"
            "vst4.8 {d0,d2,d4,d6}, [%[dst1]] \n\t"
            "vst4.8 {d1,d3,d5,d7}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 8),
              "w" (zero0), "w" (zero1), "w" (zero2)
            : "d0","d1"
        );
    }
})
#else
CVT_FUNC(u8, s32, 16,
    uint8x16x4_t vline;
    vline.val[1] = vmovq_n_u8(0);
    vline.val[2] = vmovq_n_u8(0);
    vline.val[3] = vmovq_n_u8(0);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        vline.val[0] = vld1q_u8(_src + i);
        vst4q_u8((uint8_t*)(_dst + i), vline);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
CVT_FUNC(u8, f32, 16,
    ,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d0-d1}, [%[src]] \n\t"
            "vmovl.u8 q1, d0 \n\t"
            "vmovl.u8 q2, d1 \n\t"
            "vmovl.u16 q3, d2 \n\t"
            "vmovl.u16 q4, d3 \n\t"
            "vmovl.u16 q5, d4 \n\t"
            "vmovl.u16 q6, d5 \n\t"
            "vcvt.f32.u32 q7, q3 \n\t"
            "vcvt.f32.u32 q8, q4 \n\t"
            "vcvt.f32.u32 q9, q5 \n\t"
            "vcvt.f32.u32 q10, q6 \n\t"
            "vst1.32 {d14-d15}, [%[dst1]] \n\t"
            "vst1.32 {d16-d17}, [%[dst2]] \n\t"
            "vst1.32 {d18-d19}, [%[dst3]] \n\t"
            "vst1.32 {d20-d21}, [%[dst4]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              [dst3] "r" (_dst + i + 8),
              [dst4] "r" (_dst + i + 12)
            : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21"
        );
    }
})
#else
CVT_FUNC(u8, f32, 16,
    ,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        uint8x16_t vline_u8 = vld1q_u8(_src + i);

        uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8(vline_u8));
        uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline_u8));

        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16(vline1_u16));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
        uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16(vline2_u16));
        uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));

        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
        float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);

        vst1q_f32(_dst + i, vline1_f32);
        vst1q_f32(_dst + i + 4, vline2_f32);
        vst1q_f32(_dst + i + 8, vline3_f32);
        vst1q_f32(_dst + i + 12, vline4_f32);
    }
})
#endif

CVT_FUNC(s8, u8, 16,
    int8x16_t vZero = vdupq_n_s8(0);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        int8x16_t vu8 = vld1q_s8(_src + i);
        uint8x16_t vu1 = vreinterpretq_u8_s8(vmaxq_s8(vu8, vZero));
        vst1q_u8(_dst + i, vu1);
    }
})

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVT_FUNC(s8, u16, 16,
    register uint8x16_t zero0 asm ("q1") = vmovq_n_u8(0);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d0-d1}, [%[src]] \n\t"
            "vmax.s8 q0, q1 \n\t"
            "vst2.8 {d0,d2}, [%[dst1]] \n\t"
            "vst2.8 {d1,d3}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 8),
              "w" (zero0)
            : "d0","d1"
        );
    }
})
#else
CVT_FUNC(s8, u16, 16,
    int8x16x2_t vline_s8;
    vline_s8.val[1] = vmovq_n_s8(0);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        vline_s8.val[0] = vld1q_s8(_src + i);
        vline_s8.val[0] = vmaxq_s8(vline_s8.val[0], vline_s8.val[1]);
        vst2q_s8((int8_t*)(_dst + i), vline_s8);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
CVT_FUNC(s8, s16, 16,
    ,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d0-d1}, [%[src]] \n\t"
            "vmovl.s8 q1, d0 \n\t"
            "vmovl.s8 q2, d1 \n\t"
            "vst1.16 {d2-d3}, [%[dst1]] \n\t"
            "vst1.16 {d4-d5}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 8)
            : "d0","d1","d2","d3","d4","d5"
        );
    }
})
#else
CVT_FUNC(s8, s16, 16,
    ,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        int8x16_t vline_s8 = vld1q_s8(_src + i);

        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline_s8));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline_s8));

        vst1q_s16(_dst + i, vline1_s16);
        vst1q_s16(_dst + i + 8, vline2_s16);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVT_FUNC(s8, s32, 16,
    ,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d0-d1}, [%[src]] \n\t"
            "vmovl.s8 q1, d0 \n\t"
            "vmovl.s8 q2, d1 \n\t"
            "vmovl.s16 q3, d2 \n\t"
            "vmovl.s16 q4, d3 \n\t"
            "vmovl.s16 q5, d4 \n\t"
            "vmovl.s16 q6, d5 \n\t"
            "vst1.32 {d6-d7}, [%[dst1]] \n\t"
            "vst1.32 {d8-d9}, [%[dst2]] \n\t"
            "vst1.32 {d10-d11}, [%[dst3]] \n\t"
            "vst1.32 {d12-d13}, [%[dst4]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              [dst3] "r" (_dst + i + 8),
              [dst4] "r" (_dst + i + 12)
            : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
        );
    }
})
#else
CVT_FUNC(s8, s32, 16,
    ,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        int8x16_t vline_s8 = vld1q_s8(_src + i);

        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline_s8));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline_s8));

        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16(vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));

        vst1q_s32(_dst + i, vline1_s32);
        vst1q_s32(_dst + i + 4, vline2_s32);
        vst1q_s32(_dst + i + 8, vline3_s32);
        vst1q_s32(_dst + i + 12, vline4_s32);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
CVT_FUNC(s8, f32, 16,
    ,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d0-d1}, [%[src]] \n\t"
            "vmovl.s8 q1, d0 \n\t"
            "vmovl.s8 q2, d1 \n\t"
            "vmovl.s16 q3, d2 \n\t"
            "vmovl.s16 q4, d3 \n\t"
            "vmovl.s16 q5, d4 \n\t"
            "vmovl.s16 q6, d5 \n\t"
            "vcvt.f32.s32 q7, q3 \n\t"
            "vcvt.f32.s32 q8, q4 \n\t"
            "vcvt.f32.s32 q9, q5 \n\t"
            "vcvt.f32.s32 q10, q6 \n\t"
            "vst1.32 {d14-d15}, [%[dst1]] \n\t"
            "vst1.32 {d16-d17}, [%[dst2]] \n\t"
            "vst1.32 {d18-d19}, [%[dst3]] \n\t"
            "vst1.32 {d20-d21}, [%[dst4]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              [dst3] "r" (_dst + i + 8),
              [dst4] "r" (_dst + i + 12)
            : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21"
        );
    }
})
#else
CVT_FUNC(s8, f32, 16,
    ,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        int8x16_t vline_s8 = vld1q_s8(_src + i);

        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline_s8));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline_s8));

        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16(vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));

        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);

        vst1q_f32(_dst + i, vline1_f32);
        vst1q_f32(_dst + i + 4, vline2_f32);
        vst1q_f32(_dst + i + 8, vline3_f32);
        vst1q_f32(_dst + i + 12, vline4_f32);
    }
})
#endif

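/*
 * The narrowing integer conversions below rely on the saturating narrow
 * instructions (vqmovn/vqmovun), which clamp to the destination range and
 * so match the internal::saturate_cast behavior used for the tail elements.
 */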
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
CVT_FUNC(u16, u8, 16,
    ,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d0-d1}, [%[src1]] \n\t"
            "vqmovn.u16 d4, q0 \n\t"
            "vld1.8 {d2-d3}, [%[src2]] \n\t"
            "vqmovn.u16 d5, q1 \n\t"
            "vst1.8 {d4-d5}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [src2] "r" (_src + i + 8),
              [dst] "r" (_dst + i + 0)
            : "d0","d1","d2","d3","d4","d5"
        );
    }
})
#else
CVT_FUNC(u16, u8, 16,
    ,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        uint16x8_t vline1_u16 = vld1q_u16(_src + i);
        uint16x8_t vline2_u16 = vld1q_u16(_src + i + 8);

        uint8x8_t vline1_u8 = vqmovn_u16(vline1_u16);
        uint8x8_t vline2_u8 = vqmovn_u16(vline2_u16);

        vst1q_u8(_dst + i, vcombine_u8(vline1_u8, vline2_u8));
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
CVT_FUNC(u16, s8, 16,
    register uint8x16_t v127 asm ("q4") = vmovq_n_u8(127);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d0-d1}, [%[src1]] \n\t"
            "vqmovn.u16 d4, q0 \n\t"
            "vld1.8 {d2-d3}, [%[src2]] \n\t"
            "vqmovn.u16 d5, q1 \n\t"
            "vmin.u8 q3, q2, q4 \n\t"
            "vst1.8 {d6-d7}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [src2] "r" (_src + i + 8),
              [dst] "r" (_dst + i + 0),
              "w" (v127)
            : "d0","d1","d2","d3","d4","d5","d6","d7"
        );
    }
})
#else
CVT_FUNC(u16, s8, 16,
    uint8x8_t v127 = vmov_n_u8(127);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        uint16x8_t vline1_u16 = vld1q_u16(_src + i);
        uint16x8_t vline2_u16 = vld1q_u16(_src + i + 8);

        uint8x8_t vline1_u8 = vqmovn_u16(vline1_u16);
        uint8x8_t vline2_u8 = vqmovn_u16(vline2_u16);
        vline1_u8 = vmin_u8(vline1_u8, v127);
        vline2_u8 = vmin_u8(vline2_u8, v127);

        vst1q_s8(_dst + i, vcombine_s8(vreinterpret_s8_u8(vline1_u8), vreinterpret_s8_u8(vline2_u8)));
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVT_FUNC(u16, s16, 8,
    register uint16x8_t v32767 asm ("q4") = vmovq_n_u16(0x7FFF);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d0-d1}, [%[src]] \n\t"
            "vmin.u16 q1, q0, q4 \n\t"
            "vst1.16 {d2-d3}, [%[dst]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (v32767)
            : "d0","d1","d2","d3"
        );
    }
})
#else
CVT_FUNC(u16, s16, 8,
    uint16x8_t v32767 = vmovq_n_u16(0x7FFF);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        uint16x8_t vline_u16 = vld1q_u16(_src + i);
        vline_u16 = vminq_u16(vline_u16, v32767);
        vst1q_s16((_dst + i), vreinterpretq_s16_u16(vline_u16));
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVT_FUNC(u16, s32, 8,
    register uint16x8_t zero0 asm ("q1") = vmovq_n_u16(0);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d0-d1}, [%[src]] \n\t"
            "vst2.16 {d0,d2}, [%[dst1]] \n\t"
            "vst2.16 {d1,d3}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i),
              [dst2] "r" (_dst + i + 4),
              "w" (zero0)
            : "d0","d1"//,"d2","d3"//,"d4","d5","d6","d7"
        );
    }
})
#else
CVT_FUNC(u16, s32, 8,
    uint16x8x2_t vline;
    vline.val[1] = vmovq_n_u16(0);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        vline.val[0] = vld1q_u16(_src + i);
        vst2q_u16((uint16_t*)(_dst + i), vline);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
CVT_FUNC(u16, f32, 8,
    ,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d0-d1}, [%[src]] \n\t"
            "vmovl.u16 q1, d0 \n\t"
            "vmovl.u16 q2, d1 \n\t"
            "vcvt.f32.u32 q3, q1 \n\t"
            "vcvt.f32.u32 q4, q2 \n\t"
            "vst1.32 {d6-d7}, [%[dst1]] \n\t"
            "vst1.32 {d8-d9}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4)
            : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"
        );
    }
})
#else
CVT_FUNC(u16, f32, 8,
    ,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        uint16x8_t vline_u16 = vld1q_u16(_src + i);

        uint32x4_t vline_u32_lo = vmovl_u16(vget_low_u16(vline_u16));
        uint32x4_t vline_u32_hi = vmovl_u16(vget_high_u16(vline_u16));

        float32x4_t vline_f32_lo = vcvtq_f32_u32(vline_u32_lo);
        float32x4_t vline_f32_hi = vcvtq_f32_u32(vline_u32_hi);

        vst1q_f32(_dst + i, vline_f32_lo);
        vst1q_f32(_dst + i + 4, vline_f32_hi);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
CVT_FUNC(s16, u8, 16,
    ,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d0-d1}, [%[src1]] \n\t"
            "vld1.8 {d2-d3}, [%[src2]] \n\t"
            "vqmovun.s16 d4, q0 \n\t"
            "vqmovun.s16 d5, q1 \n\t"
            "vst1.8 {d4-d5}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [src2] "r" (_src + i + 8),
              [dst] "r" (_dst + i + 0)
            : "d0","d1","d2","d3","d4","d5"
        );
    }
})
#else
CVT_FUNC(s16, u8, 16,
    ,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        int16x8_t vline1_s16 = vld1q_s16(_src + i);
        int16x8_t vline2_s16 = vld1q_s16(_src + i + 8);

        uint8x8_t vline1_u8 = vqmovun_s16(vline1_s16);
        uint8x8_t vline2_u8 = vqmovun_s16(vline2_s16);

        vst1q_u8(_dst + i, vcombine_u8(vline1_u8, vline2_u8));
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
CVT_FUNC(s16, s8, 16,
    ,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d0-d1}, [%[src1]] \n\t"
            "vld1.8 {d2-d3}, [%[src2]] \n\t"
            "vqmovn.s16 d4, q0 \n\t"
            "vqmovn.s16 d5, q1 \n\t"
            "vst1.8 {d4-d5}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [src2] "r" (_src + i + 8),
              [dst] "r" (_dst + i + 0)
            : "d0","d1","d2","d3","d4","d5"
        );
    }
})
#else
CVT_FUNC(s16, s8, 16,
    ,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        int16x8_t vline1_s16 = vld1q_s16(_src + i);
        int16x8_t vline2_s16 = vld1q_s16(_src + i + 8);

        int8x8_t vline1_s8 = vqmovn_s16(vline1_s16);
        int8x8_t vline2_s8 = vqmovn_s16(vline2_s16);

        vst1q_s8(_dst + i, vcombine_s8(vline1_s8, vline2_s8));
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVT_FUNC(s16, u16, 8,
    register int16x8_t vZero asm ("q4") = vmovq_n_s16(0);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d0-d1}, [%[src]] \n\t"
            "vmax.s16 q1, q0, q4 \n\t"
            "vst1.16 {d2-d3}, [%[dst]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vZero)
            : "d0","d1","d2","d3"
        );
    }
})
#else
CVT_FUNC(s16, u16, 8,
    int16x4_t vZero = vmov_n_s16(0);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int16x8_t vline_s16 = vld1q_s16(_src + i);

        int16x4_t vline_s16_lo = vmax_s16(vget_low_s16(vline_s16), vZero);
        int16x4_t vline_s16_hi = vmax_s16(vget_high_s16(vline_s16), vZero);

        vst1q_u16(_dst + i, vcombine_u16(vreinterpret_u16_s16(vline_s16_lo), vreinterpret_u16_s16(vline_s16_hi)));
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
CVT_FUNC(s16, s32, 8,
    ,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d0-d1}, [%[src]] \n\t"
            "vmovl.s16 q1, d0 \n\t"
            "vmovl.s16 q2, d1 \n\t"
            "vst1.32 {d2-d3}, [%[dst1]] \n\t"
            "vst1.32 {d4-d5}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4)
            : "d0","d1","d2","d3","d4","d5"
        );
    }
})
#else
CVT_FUNC(s16, s32, 8,
    ,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int16x8_t vline_s16 = vld1q_s16(_src + i);

        int32x4_t vline_s32_lo = vmovl_s16(vget_low_s16(vline_s16));
        int32x4_t vline_s32_hi = vmovl_s16(vget_high_s16(vline_s16));

        vst1q_s32(_dst + i, vline_s32_lo);
        vst1q_s32(_dst + i + 4, vline_s32_hi);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
CVT_FUNC(s16, f32, 8,
    ,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d0-d1}, [%[src]] \n\t"
            "vmovl.s16 q1, d0 \n\t"
            "vmovl.s16 q2, d1 \n\t"
            "vcvt.f32.s32 q3, q1 \n\t"
            "vcvt.f32.s32 q4, q2 \n\t"
            "vst1.32 {d6-d7}, [%[dst1]] \n\t"
            "vst1.32 {d8-d9}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4)
            : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"
        );
    }
})
#else
CVT_FUNC(s16, f32, 8,
    ,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int16x8_t vline_s16 = vld1q_s16(_src + i);

        int32x4_t vline_s32_lo = vmovl_s16(vget_low_s16(vline_s16));
        int32x4_t vline_s32_hi = vmovl_s16(vget_high_s16(vline_s16));
        float32x4_t vline_f32_lo = vcvtq_f32_s32(vline_s32_lo);
        float32x4_t vline_f32_hi = vcvtq_f32_s32(vline_s32_hi);

        vst1q_f32(_dst + i, vline_f32_lo);
        vst1q_f32(_dst + i + 4, vline_f32_hi);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
CVT_FUNC(s32, u8, 8,
    ,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d0-d1}, [%[src1]] \n\t"
            "vld1.32 {d2-d3}, [%[src2]] \n\t"
            "vqmovun.s32 d4, q0 \n\t"
            "vqmovun.s32 d5, q1 \n\t"
            "vqmovn.u16 d6, q2 \n\t"
            "vst1.8 {d6}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i)
            : "d0","d1","d2","d3","d4","d5","d6"
        );
    }
})
#else
CVT_FUNC(s32, u8, 8,
    ,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int32x4_t vline1_s32 = vld1q_s32(_src + i);
        int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);

        uint16x4_t vline1_u16 = vqmovun_s32(vline1_s32);
        uint16x4_t vline2_u16 = vqmovun_s32(vline2_s32);
        uint8x8_t vline_u8 = vqmovn_u16(vcombine_u16(vline1_u16, vline2_u16));

        vst1_u8(_dst + i, vline_u8);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
CVT_FUNC(s32, s8, 8,
    ,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d0-d1}, [%[src1]] \n\t"
            "vld1.32 {d2-d3}, [%[src2]] \n\t"
            "vqmovn.s32 d4, q0 \n\t"
            "vqmovn.s32 d5, q1 \n\t"
            "vqmovn.s16 d6, q2 \n\t"
            "vst1.8 {d6}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i)
            : "d0","d1","d2","d3","d4","d5","d6"
        );
    }
})
#else
CVT_FUNC(s32, s8, 8,
    ,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int32x4_t vline1_s32 = vld1q_s32(_src + i);
        int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);

        int16x4_t vline1_s16 = vqmovn_s32(vline1_s32);
        int16x4_t vline2_s16 = vqmovn_s32(vline2_s32);
        int8x8_t vline_s8 = vqmovn_s16(vcombine_s16(vline1_s16, vline2_s16));

        vst1_s8(_dst + i, vline_s8);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
CVT_FUNC(s32, u16, 8,
    ,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d0-d1}, [%[src1]] \n\t"
            "vld1.32 {d2-d3}, [%[src2]] \n\t"
            "vqmovun.s32 d4, q0 \n\t"
            "vqmovun.s32 d5, q1 \n\t"
            "vst1.16 {d4-d5}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i)
            : "d0","d1","d2","d3","d4","d5"
        );
    }
})
#else
CVT_FUNC(s32, u16, 8,
    ,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int32x4_t vline1_s32 = vld1q_s32(_src + i);
        int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);

        uint16x4_t vline1_u16 = vqmovun_s32(vline1_s32);
        uint16x4_t vline2_u16 = vqmovun_s32(vline2_s32);

        vst1q_u16(_dst + i, vcombine_u16(vline1_u16, vline2_u16));
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
CVT_FUNC(s32, s16, 8,
    ,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d0-d1}, [%[src1]] \n\t"
            "vld1.32 {d2-d3}, [%[src2]] \n\t"
            "vqmovn.s32 d4, q0 \n\t"
            "vqmovn.s32 d5, q1 \n\t"
            "vst1.8 {d4-d5}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i)
            : "d0","d1","d2","d3","d4","d5"
        );
    }
})
#else
CVT_FUNC(s32, s16, 8,
    ,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int32x4_t vline1_s32 = vld1q_s32(_src + i);
        int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);

        int16x4_t vline1_s16 = vqmovn_s32(vline1_s32);
        int16x4_t vline2_s16 = vqmovn_s32(vline2_s32);

        vst1q_s16(_dst + i, vcombine_s16(vline1_s16, vline2_s16));
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
CVT_FUNC(s32, f32, 8,
    ,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d0-d1}, [%[src]] \n\t"
            "vcvt.f32.s32 q1, q0 \n\t"
            "vst1.32 {d2-d3}, [%[dst]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i)
            : "d0","d1","d2","d3"//,"d4","d5"
        );
        __asm__ (
            "vld1.32 {d0-d1}, [%[src]] \n\t"
            "vcvt.f32.s32 q1, q0 \n\t"
            "vst1.32 {d2-d3}, [%[dst]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i + 4),
              [dst] "r" (_dst + i + 4)
            : "d0","d1","d2","d3"//,"d4","d5"
        );
    }
})
#else
CVT_FUNC(s32, f32, 8,
    ,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int32x4_t vline_s32 = vld1q_s32(_src + i);
        float32x4_t vline_f32 = vcvtq_f32_s32(vline_s32);
        vst1q_f32(_dst + i, vline_f32);

        vline_s32 = vld1q_s32(_src + i + 4);
        vline_f32 = vcvtq_f32_s32(vline_s32);
        vst1q_f32(_dst + i + 4, vline_f32);
    }
})
#endif

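/*
 * f32 -> u8: the input is scaled by 2^16 and converted to u32; the
 * vbic/vshr/vqsub sequence subtracts 1 when bit 16 of the scaled value is
 * clear, so that the subsequent rounding narrowing shift (vqrshrn by 16)
 * effectively rounds to nearest with ties to even, before the final
 * saturating narrowing to u8.
 */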
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
CVT_FUNC(f32, u8, 8,
    register float32x4_t vmult asm ("q0") = vdupq_n_f32((float)(1 << 16));
    register uint32x4_t vmask asm ("q1") = vdupq_n_u32(1<<16);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vmul.f32 q4, q2, q0 \n\t"
            "vmul.f32 q5, q3, q0 \n\t"
            "vcvt.u32.f32 q6, q4 \n\t"
            "vcvt.u32.f32 q7, q5 \n\t"
            "vbic q8, q1, q6 \n\t"
            "vbic q9, q1, q7 \n\t"
            "vshr.u32 q10, q8, #16 \n\t"
            "vshr.u32 q11, q9, #16 \n\t"
            "vqsub.u32 q12, q6, q10 \n\t"
            "vqsub.u32 q13, q7, q11 \n\t"
            "vqrshrn.u32 d28, q12, #16 \n\t"
            "vqrshrn.u32 d29, q13, #16 \n\t"
            "vqmovn.u16 d30, q14 \n\t"
            "vst1.8 {d30}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vmult), "w" (vmask)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30"
        );
    }
})
#else
CVT_FUNC(f32, u8, 8,
    float32x4_t vmult = vdupq_n_f32((float)(1 << 16));
    uint32x4_t vmask = vdupq_n_u32(1<<16);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t vline1_f32 = vld1q_f32(_src + i);
        float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);

        float32x4_t vline1w_f32 = vmulq_f32(vline1_f32, vmult);
        float32x4_t vline2w_f32 = vmulq_f32(vline2_f32, vmult);

        uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1w_f32);
        uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2w_f32);

        uint32x4_t vl1_masked = vbicq_u32(vmask, vline1_u32);
        uint32x4_t vl2_masked = vbicq_u32(vmask, vline2_u32);
        uint32x4_t vl1_masked2 = vshrq_n_u32(vl1_masked, 16);
        uint32x4_t vl2_masked2 = vshrq_n_u32(vl2_masked, 16);
        uint32x4_t vline1r_u32 = vqsubq_u32(vline1_u32, vl1_masked2);
        uint32x4_t vline2r_u32 = vqsubq_u32(vline2_u32, vl2_masked2);

        uint16x4_t vline1_u16 = vqrshrn_n_u32(vline1r_u32, 16);
        uint16x4_t vline2_u16 = vqrshrn_n_u32(vline2r_u32, 16);

        uint8x8_t vline_u8 = vqmovn_u16(vcombine_u16(vline1_u16, vline2_u16));
        vst1_u8(_dst + i, vline_u8);
    }
})
#endif

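/*
 * The remaining f32 conversions add 0.5 before the truncating vcvt, which
 * rounds non-negative values to nearest; results are then narrowed with
 * saturation where the destination type is smaller than s32.
 */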
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
CVT_FUNC(f32, s8, 8,
    register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d2-d3}, [%[src1]] \n\t"
            "vld1.32 {d4-d5}, [%[src2]] \n\t"
            "vadd.f32 q3, q1, q0 \n\t"
            "vadd.f32 q4, q2, q0 \n\t"
            "vcvt.s32.f32 q5, q3 \n\t"
            "vcvt.s32.f32 q6, q4 \n\t"
            "vqmovn.s32 d14, q5 \n\t"
            "vqmovn.s32 d15, q6 \n\t"
            "vqmovn.s16 d16, q7 \n\t"
            "vst1.8 {d16}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vhalf)
            : "d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17"
        );
    }
})
#else
CVT_FUNC(f32, s8, 8,
    float32x4_t vhalf = vdupq_n_f32(0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t vline1_f32 = vld1q_f32(_src + i);
        float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);

        vline1_f32 = vaddq_f32(vline1_f32, vhalf);
        vline2_f32 = vaddq_f32(vline2_f32, vhalf);

        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vline1_s16 = vqmovn_s32(vline1_s32);
        int16x4_t vline2_s16 = vqmovn_s32(vline2_s32);

        int8x8_t vline_s8 = vqmovn_s16(vcombine_s16(vline1_s16, vline2_s16));

        vst1_s8(_dst + i, vline_s8);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
CVT_FUNC(f32, u16, 8,
    register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d2-d3}, [%[src]] \n\t"
            "vadd.f32 q2, q1, q0 \n\t"
            "vcvt.u32.f32 q3, q2 \n\t"
            "vqmovn.u32 d8, q3 \n\t"
            "vst1.16 {d8}, [%[dst]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i),
              "w" (vhalf)
            : "d2","d3","d4","d5","d6","d7","d8"
        );
        __asm__ (
            "vld1.32 {d2-d3}, [%[src]] \n\t"
            "vadd.f32 q2, q1, q0 \n\t"
            "vcvt.u32.f32 q3, q2 \n\t"
            "vqmovn.u32 d8, q3 \n\t"
            "vst1.16 {d8}, [%[dst]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i + 4),
              [dst] "r" (_dst + i + 4),
              "w" (vhalf)
            : "d2","d3","d4","d5","d6","d7","d8"
        );
    }
})
#else
CVT_FUNC(f32, u16, 8,
    float32x4_t vhalf = vdupq_n_f32(0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t vline_f32 = vld1q_f32(_src + i);

        vline_f32 = vaddq_f32(vline_f32, vhalf);
        uint32x4_t vline_u32 = vcvtq_u32_f32(vline_f32);
        uint16x4_t vline_u16 = vqmovn_u32(vline_u32);

        vst1_u16(_dst + i, vline_u16);

        vline_f32 = vld1q_f32(_src + i + 4);

        vline_f32 = vaddq_f32(vline_f32, vhalf);
        vline_u32 = vcvtq_u32_f32(vline_f32);
        vline_u16 = vqmovn_u32(vline_u32);

        vst1_u16(_dst + i + 4, vline_u16);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
CVT_FUNC(f32, s16, 8,
    register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d2-d3}, [%[src]] \n\t"
            "vadd.f32 q2, q1, q0 \n\t"
            "vcvt.s32.f32 q3, q2 \n\t"
            "vqmovn.s32 d8, q3 \n\t"
            "vst1.16 {d8}, [%[dst]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i),
              "w" (vhalf)
            : "d2","d3","d4","d5","d6","d7","d8"
        );
        __asm__ (
            "vld1.32 {d2-d3}, [%[src]] \n\t"
            "vadd.f32 q2, q1, q0 \n\t"
            "vcvt.s32.f32 q3, q2 \n\t"
            "vqmovn.s32 d8, q3 \n\t"
            "vst1.16 {d8}, [%[dst]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i + 4),
              [dst] "r" (_dst + i + 4),
              "w" (vhalf)
            : "d2","d3","d4","d5","d6","d7","d8"
        );
    }
})
#else
CVT_FUNC(f32, s16, 8,
    float32x4_t vhalf = vdupq_n_f32(0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t vline_f32 = vld1q_f32(_src + i);

        vline_f32 = vaddq_f32(vline_f32, vhalf);
        int32x4_t vline_s32 = vcvtq_s32_f32(vline_f32);
        int16x4_t vline_s16 = vqmovn_s32(vline_s32);

        vst1_s16(_dst + i, vline_s16);

        vline_f32 = vld1q_f32(_src + i + 4);

        vline_f32 = vaddq_f32(vline_f32, vhalf);
        vline_s32 = vcvtq_s32_f32(vline_f32);
        vline_s16 = vqmovn_s32(vline_s32);

        vst1_s16(_dst + i + 4, vline_s16);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
CVT_FUNC(f32, s32, 8,
    register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d2-d3}, [%[src1]] \n\t"
            "vld1.32 {d4-d5}, [%[src2]] \n\t"
            "vadd.f32 q3, q1, q0 \n\t"
            "vadd.f32 q4, q2, q0 \n\t"
            "vcvt.s32.f32 q5, q3 \n\t"
            "vcvt.s32.f32 q6, q4 \n\t"
            "vst1.32 {q5}, [%[dst1]] \n\t"
            "vst1.32 {q6}, [%[dst2]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i),
              [dst2] "r" (_dst + i + 4),
              "w" (vhalf)
            : "d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
        );
    }
})
#else
CVT_FUNC(f32, s32, 8,
    float32x4_t vhalf = vdupq_n_f32(0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t vline_f32 = vld1q_f32(_src + i);

        vline_f32 = vaddq_f32(vline_f32, vhalf);
        int32x4_t vline_s32 = vcvtq_s32_f32(vline_f32);

        vst1q_s32(_dst + i, vline_s32);

        vline_f32 = vld1q_f32(_src + i + 4);

        vline_f32 = vaddq_f32(vline_f32, vhalf);
        vline_s32 = vcvtq_s32_f32(vline_f32);

        vst1q_s32(_dst + i + 4, vline_s32);
    }
})
#endif

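/*
 * u8 -> s16 reuses the u8 -> u16 kernel: u8 values lie in 0..255, which is
 * representable in both u16 and s16, so the stored bit patterns are identical.
 */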
void convert(const Size2D &_size,
             const u8 * srcBase, ptrdiff_t srcStride,
             s16 * dstBase, ptrdiff_t dstStride)
{
    convert(_size, srcBase, srcStride, (u16*)dstBase, dstStride);
}

} // namespace CAROTENE_NS