• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * By downloading, copying, installing or using the software you agree to this license.
3  * If you do not agree to this license, do not download, install,
4  * copy or use the software.
5  *
6  *
7  *                           License Agreement
8  *                For Open Source Computer Vision Library
9  *                        (3-clause BSD License)
10  *
11  * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12  * Third party copyrights are property of their respective owners.
13  *
14  * Redistribution and use in source and binary forms, with or without modification,
15  * are permitted provided that the following conditions are met:
16  *
17  *   * Redistributions of source code must retain the above copyright notice,
18  *     this list of conditions and the following disclaimer.
19  *
20  *   * Redistributions in binary form must reproduce the above copyright notice,
21  *     this list of conditions and the following disclaimer in the documentation
22  *     and/or other materials provided with the distribution.
23  *
24  *   * Neither the names of the copyright holders nor the names of the contributors
25  *     may be used to endorse or promote products derived from this software
26  *     without specific prior written permission.
27  *
28  * This software is provided by the copyright holders and contributors "as is" and
29  * any express or implied warranties, including, but not limited to, the implied
30  * warranties of merchantability and fitness for a particular purpose are disclaimed.
31  * In no event shall copyright holders or contributors be liable for any direct,
32  * indirect, incidental, special, exemplary, or consequential damages
33  * (including, but not limited to, procurement of substitute goods or services;
34  * loss of use, data, or profits; or business interruption) however caused
35  * and on any theory of liability, whether in contract, strict liability,
36  * or tort (including negligence or otherwise) arising in any way out of
37  * the use of this software, even if advised of the possibility of such damage.
38  */
39 
40 #include "common.hpp"
41 
42 namespace CAROTENE_NS {
43 
44 #ifdef CAROTENE_NEON
45 
/*
 * CVT_FUNC expands to a convert() overload from element type T1 to T2.
 *   CVTINIT - per-call SIMD setup (constants / scratch vectors), in scope
 *             for CVTROW.
 *   CVTROW  - vectorized loop over the first w columns of each row, where
 *             w is size.width rounded down to a multiple of SIMD_SIZE.
 * A scalar tail loop finishes columns [w, size.width) of every row via
 * internal::saturate_cast, so the two loops together cover the whole image.
 * NOTE(review): the row-collapse shortcut compares the byte strides against
 * size.width (an element count), so it can only trigger when both element
 * types are 1 byte wide and the data is contiguous; it is harmless (just
 * ineffective) for wider types -- confirm this is intentional.
 */
46 #define CVT_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW)                            \
47     void convert(const Size2D &_size,                                           \
48                  const T1 * srcBase, ptrdiff_t srcStride,                       \
49                  T2 * dstBase, ptrdiff_t dstStride)                             \
50     {                                                                           \
51         internal::assertSupportedConfiguration();                               \
52         Size2D size(_size);                                                     \
53         if (srcStride == dstStride &&                                           \
54             srcStride == (ptrdiff_t)(size.width))                               \
55         {                                                                       \
56             size.width *= size.height;                                          \
57             size.height = 1;                                                    \
58         }                                                                       \
59         const ptrdiff_t sstep = srcStride / sizeof(T1);                         \
60         const ptrdiff_t dstep = dstStride / sizeof(T2);                         \
61         const size_t w = size.width & ~(SIMD_SIZE-1);                           \
62         if (size.width >= SIMD_SIZE)                                            \
63         {                                                                       \
64             const T1* _src = srcBase;                                           \
65             T2* _dst = dstBase;                                                 \
66             CVTINIT                                                             \
67             for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \
68                 CVTROW                                                          \
69         }                                                                       \
70         if(w < size.width)                                                      \
71         {                                                                       \
72             const T1* _src = srcBase;                                           \
73             T2* _dst = dstBase;                                                 \
74             for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \
75                 for(size_t i = w; i < size.width; i++ )                         \
76                     _dst[i] = internal::saturate_cast<T2>(_src[i]);             \
77         }                                                                       \
78     }
79 
80 #else
81 
/* Builds without NEON: no conversion is performed; the parameters are
 * unnamed and assertSupportedConfiguration() is presumably expected to
 * report/abort on the unsupported configuration. */
82 #define CVT_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW)                            \
83     void convert(const Size2D &,                                                \
84                  const T1 *, ptrdiff_t,                                         \
85                  T2 *, ptrdiff_t)                                               \
86     {                                                                           \
87         internal::assertSupportedConfiguration();                               \
88     }
89 
90 #endif
91 
92 CVT_FUNC(u8, s8, 16,
93      uint8x16_t v127 = vdupq_n_u8(127);,
94 {
95      for (size_t i = 0; i < w; i += 16)
96      {
97          internal::prefetch(_src + i);
98          uint8x16_t vu8 = vld1q_u8(_src + i);
99          int8x16_t vu1 = vreinterpretq_s8_u8(vminq_u8(vu8, v127));
100          vst1q_s8(_dst + i, vu1);
101      }
102 })
103 
/* u8 -> u16: zero-extend 16 bytes per iteration by 2-way interleaving the
 * source with a zero vector (vst2), emitting byte,0,byte,0,... which forms
 * the u16 values (assumes little-endian lane order -- standard here).
 * First branch: inline-asm workaround for 32-bit GCC 4.x (< 4.7, not clang);
 * the zero vector is pinned to q1 so it supplies d2/d3 for the vst2. */
104 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
105 CVT_FUNC(u8, u16, 16,
106      register uint8x16_t zero0 asm ("q1") = vmovq_n_u8(0);,
107 {
108      for (size_t i = 0; i < w; i += 16)
109      {
110          internal::prefetch(_src + i);
111          __asm__ (
112              "vld1.8 {d0-d1}, [%[src]]                              \n\t"
113              "vst2.8 {d0,d2}, [%[dst1]]                             \n\t"
114              "vst2.8 {d1,d3}, [%[dst2]]                             \n\t"
115              : /*no output*/
116              : [src] "r" (_src + i),
117                [dst1] "r" (_dst + i + 0),
118                [dst2] "r" (_dst + i + 8),
119                "w" (zero0)
120              : "d0","d1"
121          );
122      }
123 })
124 #else
/* Portable intrinsic version of the same zero-interleave trick. */
125 CVT_FUNC(u8, u16, 16,
126      uint8x16x2_t vline;
127      vline.val[1] = vmovq_n_u8(0);,
128 {
129      for (size_t i = 0; i < w; i += 16)
130      {
131          internal::prefetch(_src + i);
132          vline.val[0] = vld1q_u8(_src + i);
133          vst2q_u8((uint8_t*)(_dst + i), vline);
134      }
135 })
136 #endif
137 
/* u8 -> s32: zero-extend 16 bytes per iteration by 4-way interleaving the
 * source with three zero vectors (vst4), emitting byte,0,0,0,... which
 * forms each 32-bit value (little-endian lane order).  First branch:
 * inline-asm workaround for 32-bit GCC 4.x (< 4.7, not clang) with the
 * zeros pinned to q1-q3 so they supply d2-d7 for the vst4. */
138 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
139 CVT_FUNC(u8, s32, 16,
140      register uint8x16_t zero0 asm ("q1") = vmovq_n_u8(0);
141      register uint8x16_t zero1 asm ("q2") = vmovq_n_u8(0);
142      register uint8x16_t zero2 asm ("q3") = vmovq_n_u8(0);,
143 {
144      for (size_t i = 0; i < w; i += 16)
145      {
146          internal::prefetch(_src + i);
147          __asm__ (
148              "vld1.8 {d0-d1}, [%[src]]                              \n\t"
149              "vst4.8 {d0,d2,d4,d6}, [%[dst1]]                       \n\t"
150              "vst4.8 {d1,d3,d5,d7}, [%[dst2]]                       \n\t"
151              : /*no output*/
152              : [src] "r" (_src + i),
153                [dst1] "r" (_dst + i + 0),
154                [dst2] "r" (_dst + i + 8),
155                "w" (zero0), "w" (zero1), "w" (zero2)
156              : "d0","d1"
157          );
158      }
159 })
160 #else
/* Portable intrinsic version: same 4-way zero-interleave via vst4q_u8. */
161 CVT_FUNC(u8, s32, 16,
162      uint8x16x4_t vline;
163      vline.val[1] = vmovq_n_u8(0);
164      vline.val[2] = vmovq_n_u8(0);
165      vline.val[3] = vmovq_n_u8(0);,
166 {
167      for (size_t i = 0; i < w; i += 16)
168      {
169          internal::prefetch(_src + i);
170         vline.val[0] = vld1q_u8(_src + i);
171         vst4q_u8((uint8_t*)(_dst + i), vline);
172      }
173 })
174 #endif
175 
/* u8 -> f32: widen each byte u8 -> u16 -> u32 (vmovl), then convert to
 * float (vcvt.f32.u32); 16 elements per iteration.  First branch is an
 * inline-asm workaround for 32-bit GCC 4.x (< 4.6, not clang). */
176 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 6 && !defined(__clang__)
177 CVT_FUNC(u8, f32, 16,
178 ,
179 {
180      for (size_t i = 0; i < w; i += 16)
181      {
182          internal::prefetch(_src + i);
183          __asm__ (
184              "vld1.8 {d0-d1}, [%[src]]                              \n\t"
185              "vmovl.u8 q1, d0                                       \n\t"
186              "vmovl.u8 q2, d1                                       \n\t"
187              "vmovl.u16 q3, d2                                      \n\t"
188              "vmovl.u16 q4, d3                                      \n\t"
189              "vmovl.u16 q5, d4                                      \n\t"
190              "vmovl.u16 q6, d5                                      \n\t"
191              "vcvt.f32.u32 q7, q3                                   \n\t"
192              "vcvt.f32.u32 q8, q4                                   \n\t"
193              "vcvt.f32.u32 q9, q5                                   \n\t"
194              "vcvt.f32.u32 q10, q6                                  \n\t"
195              "vst1.32 {d14-d15}, [%[dst1]]                          \n\t"
196              "vst1.32 {d16-d17}, [%[dst2]]                          \n\t"
197              "vst1.32 {d18-d19}, [%[dst3]]                          \n\t"
198              "vst1.32 {d20-d21}, [%[dst4]]                          \n\t"
199              : /*no output*/
200              : [src] "r" (_src + i),
201                [dst1] "r" (_dst + i + 0),
202                [dst2] "r" (_dst + i + 4),
203                [dst3] "r" (_dst + i + 8),
204                [dst4] "r" (_dst + i + 12)
205              : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21"
206          );
207      }
208 })
209 #else
/* Portable intrinsic version: the same two-step widening and conversion. */
210 CVT_FUNC(u8, f32, 16,
211 ,
212 {
213      for (size_t i = 0; i < w; i += 16)
214      {
215          internal::prefetch(_src + i);
216          uint8x16_t vline_u8 = vld1q_u8(_src + i);
217 
218          uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8(vline_u8));
219          uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline_u8));
220 
221          uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16(vline1_u16));
222          uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
223          uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16(vline2_u16));
224          uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
225 
226          float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
227          float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
228          float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
229          float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
230 
231          vst1q_f32(_dst + i, vline1_f32);
232          vst1q_f32(_dst + i + 4, vline2_f32);
233          vst1q_f32(_dst + i + 8, vline3_f32);
234          vst1q_f32(_dst + i + 12, vline4_f32);
235      }
236 })
237 #endif
238 
239 CVT_FUNC(s8, u8, 16,
240      int8x16_t vZero = vdupq_n_s8(0);,
241 {
242      for (size_t i = 0; i < w; i += 16)
243      {
244          internal::prefetch(_src + i);
245          int8x16_t vu8 = vld1q_s8(_src + i);
246          uint8x16_t vu1 = vreinterpretq_u8_s8(vmaxq_s8(vu8, vZero));
247          vst1q_u8(_dst + i, vu1);
248      }
249 })
250 
/* s8 -> u16: clamp negative values to zero (vmax with a zero vector), then
 * zero-extend via a 2-way interleaved store with zeros (vst2), 16 elements
 * per iteration.  First branch: inline-asm workaround for 32-bit GCC 4.x
 * (< 4.7, not clang) with the zero pinned to q1 (used both as the vmax
 * operand and as d2/d3 for the vst2). */
251 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
252 CVT_FUNC(s8, u16, 16,
253      register uint8x16_t zero0 asm ("q1") = vmovq_n_u8(0);,
254 {
255      for (size_t i = 0; i < w; i += 16)
256      {
257          internal::prefetch(_src + i);
258          __asm__ (
259              "vld1.8 {d0-d1}, [%[src]]                              \n\t"
260              "vmax.s8 q0, q1                                        \n\t"
261              "vst2.8 {d0,d2}, [%[dst1]]                             \n\t"
262              "vst2.8 {d1,d3}, [%[dst2]]                             \n\t"
263              : /*no output*/
264              : [src] "r" (_src + i),
265                [dst1] "r" (_dst + i + 0),
266                [dst2] "r" (_dst + i + 8),
267                "w" (zero0)
268              : "d0","d1"
269          );
270      }
271 })
272 #else
/* Portable intrinsic version: clamp with vmaxq_s8, then interleave with
 * the zero lane of the pair via vst2q_s8. */
273 CVT_FUNC(s8, u16, 16,
274      int8x16x2_t vline_s8;
275      vline_s8.val[1] = vmovq_n_s8(0);,
276 {
277      for (size_t i = 0; i < w; i += 16)
278      {
279          internal::prefetch(_src + i);
280          vline_s8.val[0] = vld1q_s8(_src + i);
281          vline_s8.val[0] = vmaxq_s8(vline_s8.val[0], vline_s8.val[1]);
282          vst2q_s8((int8_t*)(_dst + i), vline_s8);
283      }
284 })
285 #endif
286 
/* s8 -> s16: sign-extend 16 bytes per iteration with vmovl.s8.  First
 * branch is an inline-asm workaround for 32-bit GCC 4.x (< 4.6, not
 * clang). */
287 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 6 && !defined(__clang__)
288 CVT_FUNC(s8, s16, 16,
289 ,
290 {
291      for (size_t i = 0; i < w; i += 16)
292      {
293          internal::prefetch(_src + i);
294          __asm__ (
295              "vld1.8 {d0-d1}, [%[src]]                              \n\t"
296              "vmovl.s8 q1, d0                                       \n\t"
297              "vmovl.s8 q2, d1                                       \n\t"
298              "vst1.16 {d2-d3}, [%[dst1]]                            \n\t"
299              "vst1.16 {d4-d5}, [%[dst2]]                            \n\t"
300              : /*no output*/
301              : [src] "r" (_src + i),
302                [dst1] "r" (_dst + i + 0),
303                [dst2] "r" (_dst + i + 8)
304              : "d0","d1","d2","d3","d4","d5"
305          );
306      }
307 })
308 #else
/* Portable intrinsic version: sign-extend each half with vmovl_s8. */
309 CVT_FUNC(s8, s16, 16,
310 ,
311 {
312      for (size_t i = 0; i < w; i += 16)
313      {
314          internal::prefetch(_src + i);
315          int8x16_t vline_s8 = vld1q_s8(_src + i);
316 
317          int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline_s8));
318          int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline_s8));
319 
320          vst1q_s16(_dst + i, vline1_s16);
321          vst1q_s16(_dst + i + 8, vline2_s16);
322      }
323 })
324 #endif
325 
/* s8 -> s32: two-step sign extension (s8 -> s16 -> s32 via vmovl), 16
 * elements per iteration.  First branch is an inline-asm workaround for
 * 32-bit GCC 4.x (< 4.7, not clang). */
326 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
327 CVT_FUNC(s8, s32, 16,
328 ,
329 {
330      for (size_t i = 0; i < w; i += 16)
331      {
332          internal::prefetch(_src + i);
333          __asm__ (
334              "vld1.8 {d0-d1}, [%[src]]                              \n\t"
335              "vmovl.s8 q1, d0                                       \n\t"
336              "vmovl.s8 q2, d1                                       \n\t"
337              "vmovl.s16 q3, d2                                      \n\t"
338              "vmovl.s16 q4, d3                                      \n\t"
339              "vmovl.s16 q5, d4                                      \n\t"
340              "vmovl.s16 q6, d5                                      \n\t"
341              "vst1.32 {d6-d7}, [%[dst1]]                            \n\t"
342              "vst1.32 {d8-d9}, [%[dst2]]                            \n\t"
343              "vst1.32 {d10-d11}, [%[dst3]]                          \n\t"
344              "vst1.32 {d12-d13}, [%[dst4]]                          \n\t"
345              : /*no output*/
346              : [src] "r" (_src + i),
347                [dst1] "r" (_dst + i + 0),
348                [dst2] "r" (_dst + i + 4),
349                [dst3] "r" (_dst + i + 8),
350                [dst4] "r" (_dst + i + 12)
351              : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
352          );
353      }
354 })
355 #else
/* Portable intrinsic version: same two-step sign extension. */
356 CVT_FUNC(s8, s32, 16,
357 ,
358 {
359      for (size_t i = 0; i < w; i += 16)
360      {
361          internal::prefetch(_src + i);
362          int8x16_t vline_s8 = vld1q_s8(_src + i);
363 
364          int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline_s8));
365          int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline_s8));
366 
367          int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline1_s16));
368          int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
369          int32x4_t vline3_s32 = vmovl_s16(vget_low_s16(vline2_s16));
370          int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
371 
372          vst1q_s32(_dst + i, vline1_s32);
373          vst1q_s32(_dst + i + 4, vline2_s32);
374          vst1q_s32(_dst + i + 8, vline3_s32);
375          vst1q_s32(_dst + i + 12, vline4_s32);
376      }
377 })
378 #endif
379 
/* s8 -> f32: sign-extend s8 -> s16 -> s32 (vmovl), then convert to float
 * (vcvt.f32.s32); 16 elements per iteration.  First branch is an
 * inline-asm workaround for 32-bit GCC 4.x (< 4.6, not clang). */
380 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 6 && !defined(__clang__)
381 CVT_FUNC(s8, f32, 16,
382 ,
383 {
384      for (size_t i = 0; i < w; i += 16)
385      {
386          internal::prefetch(_src + i);
387          __asm__ (
388              "vld1.8 {d0-d1}, [%[src]]                              \n\t"
389              "vmovl.s8 q1, d0                                       \n\t"
390              "vmovl.s8 q2, d1                                       \n\t"
391              "vmovl.s16 q3, d2                                      \n\t"
392              "vmovl.s16 q4, d3                                      \n\t"
393              "vmovl.s16 q5, d4                                      \n\t"
394              "vmovl.s16 q6, d5                                      \n\t"
395              "vcvt.f32.s32 q7, q3                                   \n\t"
396              "vcvt.f32.s32 q8, q4                                   \n\t"
397              "vcvt.f32.s32 q9, q5                                   \n\t"
398              "vcvt.f32.s32 q10, q6                                  \n\t"
399              "vst1.32 {d14-d15}, [%[dst1]]                          \n\t"
400              "vst1.32 {d16-d17}, [%[dst2]]                          \n\t"
401              "vst1.32 {d18-d19}, [%[dst3]]                          \n\t"
402              "vst1.32 {d20-d21}, [%[dst4]]                          \n\t"
403              : /*no output*/
404              : [src] "r" (_src + i),
405                [dst1] "r" (_dst + i + 0),
406                [dst2] "r" (_dst + i + 4),
407                [dst3] "r" (_dst + i + 8),
408                [dst4] "r" (_dst + i + 12)
409              : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21"
410          );
411      }
412 })
413 #else
/* Portable intrinsic version: widen, then vcvtq_f32_s32 each quad. */
414 CVT_FUNC(s8, f32, 16,
415 ,
416 {
417      for (size_t i = 0; i < w; i += 16)
418      {
419          internal::prefetch(_src + i);
420          int8x16_t vline_s8 = vld1q_s8(_src + i);
421 
422          int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline_s8));
423          int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline_s8));
424 
425          int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline1_s16));
426          int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
427          int32x4_t vline3_s32 = vmovl_s16(vget_low_s16(vline2_s16));
428          int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
429 
430          float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
431          float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
432          float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
433          float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
434 
435          vst1q_f32(_dst + i, vline1_f32);
436          vst1q_f32(_dst + i + 4, vline2_f32);
437          vst1q_f32(_dst + i + 8, vline3_f32);
438          vst1q_f32(_dst + i + 12, vline4_f32);
439      }
440 })
441 #endif
442 
/* u16 -> u8: saturating narrow with vqmovn.u16 (values > 255 clamp to
 * 255); 16 elements per iteration.  First branch is an inline-asm
 * workaround for 32-bit GCC 4.x (< 4.6, not clang). */
443 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 6 && !defined(__clang__)
444 CVT_FUNC(u16, u8, 16,
445 ,
446 {
447      for (size_t i = 0; i < w; i += 16)
448      {
449          internal::prefetch(_src + i);
450          __asm__ (
451              "vld1.8 {d0-d1}, [%[src1]]                             \n\t"
452              "vqmovn.u16 d4, q0                                     \n\t"
453              "vld1.8 {d2-d3}, [%[src2]]                             \n\t"
454              "vqmovn.u16 d5, q1                                     \n\t"
455              "vst1.8 {d4-d5}, [%[dst]]                              \n\t"
456              : /*no output*/
457              : [src1] "r" (_src + i),
458                [src2] "r" (_src + i + 8),
459                [dst] "r" (_dst + i + 0)
460              : "d0","d1","d2","d3","d4","d5"
461          );
462      }
463 })
464 #else
/* Portable intrinsic version: two vqmovn_u16 halves recombined. */
465 CVT_FUNC(u16, u8, 16,
466 ,
467 {
468      for (size_t i = 0; i < w; i += 16)
469      {
470          internal::prefetch(_src + i);
471          uint16x8_t vline1_u16 = vld1q_u16(_src + i);
472          uint16x8_t vline2_u16 = vld1q_u16(_src + i + 8);
473 
474          uint8x8_t vline1_u8 = vqmovn_u16(vline1_u16);
475          uint8x8_t vline2_u8 = vqmovn_u16(vline2_u16);
476 
477          vst1q_u8(_dst + i, vcombine_u8(vline1_u8, vline2_u8));
478      }
479 })
480 #endif
481 
/* u16 -> s8: saturating narrow to u8 (vqmovn.u16), then clamp to 127 with
 * an unsigned min and reinterpret as signed; 16 elements per iteration.
 * First branch is an inline-asm workaround for 32-bit GCC 4.x (< 4.6,
 * not clang), with the 127-constant pinned to q4. */
482 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 6 && !defined(__clang__)
483 CVT_FUNC(u16, s8, 16,
484     register uint8x16_t v127 asm ("q4") = vmovq_n_u8(127);,
485 {
486     for (size_t i = 0; i < w; i += 16)
487     {
488         internal::prefetch(_src + i);
489         __asm__ (
490             "vld1.8 {d0-d1}, [%[src1]]                             \n\t"
491             "vqmovn.u16 d4, q0                                     \n\t"
492             "vld1.8 {d2-d3}, [%[src2]]                             \n\t"
493             "vqmovn.u16 d5, q1                                     \n\t"
494             "vmin.u8 q3, q2, q4                                    \n\t"
495             "vst1.8 {d6-d7}, [%[dst]]                              \n\t"
496             : /*no output*/
497             : [src1] "r" (_src + i),
498               [src2] "r" (_src + i + 8),
499               [dst] "r" (_dst + i + 0),
500               "w" (v127)
501             : "d0","d1","d2","d3","d4","d5","d6","d7"
502          );
503     }
504 })
505 #else
/* Portable intrinsic version: narrow, min against 127, reinterpret. */
506 CVT_FUNC(u16, s8, 16,
507     uint8x8_t v127 = vmov_n_u8(127);,
508 {
509     for (size_t i = 0; i < w; i += 16)
510     {
511         internal::prefetch(_src + i);
512         uint16x8_t vline1_u16 = vld1q_u16(_src + i);
513         uint16x8_t vline2_u16 = vld1q_u16(_src + i + 8);
514 
515         uint8x8_t vline1_u8 = vqmovn_u16(vline1_u16);
516         uint8x8_t vline2_u8 = vqmovn_u16(vline2_u16);
517         vline1_u8 = vmin_u8(vline1_u8, v127);
518         vline2_u8 = vmin_u8(vline2_u8, v127);
519 
520         vst1q_s8(_dst + i, vcombine_s8(vreinterpret_s8_u8(vline1_u8), vreinterpret_s8_u8(vline2_u8)));
521     }
522 })
523 #endif
524 
/* u16 -> s16: clamp to 0x7FFF with an unsigned min and reinterpret as
 * signed (values <= 32767 are identical in both representations);
 * 8 elements per iteration.  First branch is an inline-asm workaround
 * for 32-bit GCC 4.x (< 4.7, not clang), constant pinned to q4. */
525 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
526 CVT_FUNC(u16, s16, 8,
527      register uint16x8_t v32767 asm ("q4") = vmovq_n_u16(0x7FFF);,
528 {
529      for (size_t i = 0; i < w; i += 8)
530      {
531          internal::prefetch(_src + i);
532          __asm__ (
533              "vld1.16 {d0-d1}, [%[src]]                              \n\t"
534              "vmin.u16 q1, q0, q4                                    \n\t"
535              "vst1.16 {d2-d3}, [%[dst]]                              \n\t"
536              : /*no output*/
537              : [src] "r" (_src + i),
538                [dst] "r" (_dst + i + 0),
539                "w" (v32767)
540              : "d0","d1","d2","d3"
541          );
542      }
543 })
544 #else
/* Portable intrinsic version: vminq_u16 against 0x7FFF, then reinterpret. */
545 CVT_FUNC(u16, s16, 8,
546      uint16x8_t v32767 = vmovq_n_u16(0x7FFF);,
547 {
548      for (size_t i = 0; i < w; i += 8)
549      {
550          internal::prefetch(_src + i);
551          uint16x8_t vline_u16 = vld1q_u16(_src + i);
552          vline_u16 = vminq_u16(vline_u16, v32767);
553          vst1q_s16((_dst + i), vreinterpretq_s16_u16(vline_u16));
554      }
555 })
556 #endif
557 
/* u16 -> s32: zero-extend 8 elements per iteration by 2-way interleaving
 * the u16 source with a zero vector (vst2.16), emitting value,0 pairs
 * that form the 32-bit results (little-endian lane order); all u16 values
 * fit in s32.  First branch: inline-asm workaround for 32-bit GCC 4.x
 * (< 4.7, not clang), zero pinned to q1 so it supplies d2/d3. */
558 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
559 CVT_FUNC(u16, s32, 8,
560      register uint16x8_t zero0 asm ("q1") = vmovq_n_u16(0);,
561 {
562      for (size_t i = 0; i < w; i += 8)
563      {
564          internal::prefetch(_src + i);
565          __asm__ (
566              "vld1.16 {d0-d1}, [%[src]]                        \n\t"
567              "vst2.16 {d0,d2}, [%[dst1]]                       \n\t"
568              "vst2.16 {d1,d3}, [%[dst2]]                       \n\t"
569              : /*no output*/
570              : [src] "r" (_src + i),
571                [dst1] "r" (_dst + i),
572                [dst2] "r" (_dst + i + 4),
573                "w" (zero0)
574              : "d0","d1"//,"d2","d3"//,"d4","d5","d6","d7"
575          );
576      }
577 })
578 #else
/* Portable intrinsic version of the same zero-interleave trick. */
579 CVT_FUNC(u16, s32, 8,
580      uint16x8x2_t vline;
581      vline.val[1] = vmovq_n_u16(0);,
582 {
583      for (size_t i = 0; i < w; i += 8)
584      {
585          internal::prefetch(_src + i);
586          vline.val[0] = vld1q_u16(_src + i);
587          vst2q_u16((uint16_t*)(_dst + i), vline);
588      }
589 })
590 #endif
591 
/* u16 -> f32: zero-extend u16 -> u32 (vmovl.u16), then convert to float
 * (vcvt.f32.u32); 8 elements per iteration.  First branch is an
 * inline-asm workaround for 32-bit GCC 4.x (< 4.6, not clang). */
592 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 6 && !defined(__clang__)
593 CVT_FUNC(u16, f32, 8,
594 ,
595 {
596      for (size_t i = 0; i < w; i += 8)
597      {
598          internal::prefetch(_src + i);
599          __asm__ (
600              "vld1.16 {d0-d1}, [%[src]]                              \n\t"
601              "vmovl.u16 q1, d0                                       \n\t"
602              "vmovl.u16 q2, d1                                       \n\t"
603              "vcvt.f32.u32 q3, q1                                    \n\t"
604              "vcvt.f32.u32 q4, q2                                    \n\t"
605              "vst1.32 {d6-d7}, [%[dst1]]                             \n\t"
606              "vst1.32 {d8-d9}, [%[dst2]]                             \n\t"
607              : /*no output*/
608              : [src] "r" (_src + i),
609                [dst1] "r" (_dst + i + 0),
610                [dst2] "r" (_dst + i + 4)
611              : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"
612          );
613      }
614 })
615 #else
/* Portable intrinsic version: widen halves, then vcvtq_f32_u32. */
616 CVT_FUNC(u16, f32, 8,
617 ,
618 {
619      for (size_t i = 0; i < w; i += 8)
620      {
621          internal::prefetch(_src + i);
622          uint16x8_t vline_u16 = vld1q_u16(_src + i);
623 
624          uint32x4_t vline_u32_lo = vmovl_u16(vget_low_u16(vline_u16));
625          uint32x4_t vline_u32_hi = vmovl_u16(vget_high_u16(vline_u16));
626 
627          float32x4_t vline_f32_lo = vcvtq_f32_u32(vline_u32_lo);
628          float32x4_t vline_f32_hi = vcvtq_f32_u32(vline_u32_hi);
629 
630          vst1q_f32(_dst + i, vline_f32_lo);
631          vst1q_f32(_dst + i + 4, vline_f32_hi);
632      }
633 })
634 #endif
635 
/* s16 -> u8: saturating signed-to-unsigned narrow with vqmovun.s16
 * (negatives clamp to 0, values > 255 clamp to 255); 16 elements per
 * iteration.  First branch is an inline-asm workaround for 32-bit
 * GCC 4.x (< 4.6, not clang). */
636 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 6 && !defined(__clang__)
637 CVT_FUNC(s16, u8, 16,
638 ,
639 {
640      for (size_t i = 0; i < w; i += 16)
641      {
642          internal::prefetch(_src + i);
643          __asm__ (
644              "vld1.8 {d0-d1}, [%[src1]]                             \n\t"
645              "vld1.8 {d2-d3}, [%[src2]]                             \n\t"
646              "vqmovun.s16 d4, q0                                    \n\t"
647              "vqmovun.s16 d5, q1                                    \n\t"
648              "vst1.8 {d4-d5}, [%[dst]]                              \n\t"
649              : /*no output*/
650              : [src1] "r" (_src + i),
651                [src2] "r" (_src + i + 8),
652                [dst] "r" (_dst + i + 0)
653              : "d0","d1","d2","d3","d4","d5"
654          );
655      }
656 })
657 #else
/* Portable intrinsic version: two vqmovun_s16 halves recombined. */
658 CVT_FUNC(s16, u8, 16,
659 ,
660 {
661      for (size_t i = 0; i < w; i += 16)
662      {
663          internal::prefetch(_src + i);
664          int16x8_t vline1_s16 = vld1q_s16(_src + i);
665          int16x8_t vline2_s16 = vld1q_s16(_src + i + 8);
666 
667          uint8x8_t vline1_u8 = vqmovun_s16(vline1_s16);
668          uint8x8_t vline2_u8 = vqmovun_s16(vline2_s16);
669 
670          vst1q_u8(_dst + i, vcombine_u8(vline1_u8, vline2_u8));
671      }
672 })
673 #endif
674 
/* s16 -> s8: saturating signed narrow with vqmovn.s16 (clamps to
 * [-128, 127]); 16 elements per iteration.  First branch is an
 * inline-asm workaround for 32-bit GCC 4.x (< 4.6, not clang). */
675 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 6 && !defined(__clang__)
676 CVT_FUNC(s16, s8, 16,
677 ,
678 {
679      for (size_t i = 0; i < w; i += 16)
680      {
681          internal::prefetch(_src + i);
682          __asm__ (
683              "vld1.8 {d0-d1}, [%[src1]]                             \n\t"
684              "vld1.8 {d2-d3}, [%[src2]]                             \n\t"
685              "vqmovn.s16 d4, q0                                     \n\t"
686              "vqmovn.s16 d5, q1                                     \n\t"
687              "vst1.8 {d4-d5}, [%[dst]]                              \n\t"
688              : /*no output*/
689              : [src1] "r" (_src + i),
690                [src2] "r" (_src + i + 8),
691                [dst] "r" (_dst + i + 0)
692              : "d0","d1","d2","d3","d4","d5"
693          );
694      }
695 })
696 #else
/* Portable intrinsic version: two vqmovn_s16 halves recombined. */
697 CVT_FUNC(s16, s8, 16,
698 ,
699 {
700      for (size_t i = 0; i < w; i += 16)
701      {
702          internal::prefetch(_src + i);
703          int16x8_t vline1_s16 = vld1q_s16(_src + i);
704          int16x8_t vline2_s16 = vld1q_s16(_src + i + 8);
705 
706          int8x8_t vline1_s8 = vqmovn_s16(vline1_s16);
707          int8x8_t vline2_s8 = vqmovn_s16(vline2_s16);
708 
709          vst1q_s8(_dst + i, vcombine_s8(vline1_s8, vline2_s8));
710      }
711 })
712 #endif
713 
/* s16 -> u16: clamp negative values to zero with a signed max, then
 * reinterpret as unsigned; 8 elements per iteration.  First branch is
 * an inline-asm workaround for 32-bit GCC 4.x (< 4.7, not clang) with
 * the zero vector pinned to q4. */
714 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
715 CVT_FUNC(s16, u16, 8,
716      register int16x8_t vZero asm ("q4") = vmovq_n_s16(0);,
717 {
718      for (size_t i = 0; i < w; i += 8)
719      {
720          internal::prefetch(_src + i);
721          __asm__ (
722              "vld1.16 {d0-d1}, [%[src]]                              \n\t"
723              "vmax.s16 q1, q0, q4                                    \n\t"
724              "vst1.16 {d2-d3}, [%[dst]]                              \n\t"
725              : /*no output*/
726              : [src] "r" (_src + i),
727                [dst] "r" (_dst + i + 0),
728                "w" (vZero)
729              : "d0","d1","d2","d3"
730          );
731      }
732 })
733 #else
/* Portable intrinsic version: vmax_s16 on each 64-bit half, recombined. */
734 CVT_FUNC(s16, u16, 8,
735      int16x4_t vZero = vmov_n_s16(0);,
736 {
737      for (size_t i = 0; i < w; i += 8)
738      {
739          internal::prefetch(_src + i);
740          int16x8_t vline_s16 = vld1q_s16(_src + i);
741 
742          int16x4_t vline_s16_lo = vmax_s16(vget_low_s16(vline_s16), vZero);
743          int16x4_t vline_s16_hi = vmax_s16(vget_high_s16(vline_s16), vZero);
744 
745          vst1q_u16(_dst + i, vcombine_u16(vreinterpret_u16_s16(vline_s16_lo), vreinterpret_u16_s16(vline_s16_hi)));
746      }
747 })
748 #endif
749 
750 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 6 && !defined(__clang__)
// s16 -> s32 widening conversion, 8 lanes per iteration.
// Inline-asm variant, used only for 32-bit ARM with GCC 4.x older than 4.6
// (see the surrounding #if) -- presumably a workaround for NEON intrinsics
// codegen issues in those compilers; TODO confirm. The #else branch holds the
// equivalent intrinsics implementation.
CVT_FUNC(s16, s32, 8,
,
{
     for (size_t i = 0; i < w; i += 8)
     {
         internal::prefetch(_src + i);
         __asm__ (
             "vld1.16 {d0-d1}, [%[src]]                              \n\t"  // load 8 x s16
             "vmovl.s16 q1, d0                                       \n\t"  // sign-extend low 4 lanes
             "vmovl.s16 q2, d1                                       \n\t"  // sign-extend high 4 lanes
             "vst1.32 {d2-d3}, [%[dst1]]                             \n\t"
             "vst1.32 {d4-d5}, [%[dst2]]                             \n\t"
             : /*no output*/
             : [src] "r" (_src + i),
               [dst1] "r" (_dst + i + 0),
               [dst2] "r" (_dst + i + 4)
             : "d0","d1","d2","d3","d4","d5"
         );
     }
})
771 #else
// s16 -> s32 widening conversion, 8 lanes per iteration (intrinsics path).
// Each 8-lane s16 vector is sign-extended half at a time into two s32 vectors.
CVT_FUNC(s16, s32, 8,
,
{
     for (size_t i = 0; i < w; i += 8)
     {
         internal::prefetch(_src + i);
         int16x8_t v = vld1q_s16(_src + i);

         // Widen and store each half directly.
         vst1q_s32(_dst + i,     vmovl_s16(vget_low_s16(v)));
         vst1q_s32(_dst + i + 4, vmovl_s16(vget_high_s16(v)));
     }
})
787 #endif
788 
789 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 6 && !defined(__clang__)
// s16 -> f32 conversion, 8 lanes per iteration.
// Inline-asm variant, used only for 32-bit ARM with GCC 4.x older than 4.6
// (see the surrounding #if) -- presumably a workaround for NEON intrinsics
// codegen issues in those compilers; TODO confirm. Sign-extends to s32 first,
// then converts each half to f32 (exact: all s16 values are representable).
CVT_FUNC(s16, f32, 8,
,
{
     for (size_t i = 0; i < w; i += 8)
     {
         internal::prefetch(_src + i);
         __asm__ (
             "vld1.16 {d0-d1}, [%[src]]                              \n\t"  // load 8 x s16
             "vmovl.s16 q1, d0                                       \n\t"  // sign-extend low half
             "vmovl.s16 q2, d1                                       \n\t"  // sign-extend high half
             "vcvt.f32.s32 q3, q1                                    \n\t"
             "vcvt.f32.s32 q4, q2                                    \n\t"
             "vst1.32 {d6-d7}, [%[dst1]]                             \n\t"
             "vst1.32 {d8-d9}, [%[dst2]]                             \n\t"
             : /*no output*/
             : [src] "r" (_src + i),
               [dst1] "r" (_dst + i + 0),
               [dst2] "r" (_dst + i + 4)
             : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"
         );
     }
})
812 #else
// s16 -> f32 conversion, 8 lanes per iteration (intrinsics path).
// Widens to s32 half-by-half, then converts; exact for all s16 inputs.
CVT_FUNC(s16, f32, 8,
,
{
     for (size_t i = 0; i < w; i += 8)
     {
         internal::prefetch(_src + i);
         int16x8_t v = vld1q_s16(_src + i);

         float32x4_t lo = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v)));
         float32x4_t hi = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v)));

         vst1q_f32(_dst + i,     lo);
         vst1q_f32(_dst + i + 4, hi);
     }
})
830 #endif
831 
832 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 6 && !defined(__clang__)
// s32 -> u8 saturating narrow, 8 lanes per iteration.
// Inline-asm variant, used only for 32-bit ARM with GCC 4.x older than 4.6
// (see the surrounding #if) -- presumably a workaround for NEON intrinsics
// codegen issues in those compilers; TODO confirm. Two-stage saturation:
// vqmovun (s32 -> u16, clamping negatives to 0) then vqmovn (u16 -> u8).
CVT_FUNC(s32, u8, 8,
,
{
     for (size_t i = 0; i < w; i += 8)
     {
         internal::prefetch(_src + i);
         __asm__ (
             "vld1.32 {d0-d1}, [%[src1]]                              \n\t"
             "vld1.32 {d2-d3}, [%[src2]]                              \n\t"
             "vqmovun.s32 d4, q0                                      \n\t"  // saturate low 4 lanes to u16
             "vqmovun.s32 d5, q1                                      \n\t"  // saturate high 4 lanes to u16
             "vqmovn.u16  d6, q2                                      \n\t"  // saturate 8 x u16 to u8
             "vst1.8 {d6}, [%[dst]]                                   \n\t"
             : /*no output*/
             : [src1] "r" (_src + i + 0),
               [src2] "r" (_src + i + 4),
               [dst] "r" (_dst + i)
             : "d0","d1","d2","d3","d4","d5","d6"
         );
     }
})
854 #else
// s32 -> u8 saturating narrow, 8 lanes per iteration (intrinsics path).
// Saturates in two steps: s32 -> u16 (vqmovun, negatives clamp to 0), then
// u16 -> u8 (vqmovn).
CVT_FUNC(s32, u8, 8,
,
{
     for (size_t i = 0; i < w; i += 8)
     {
         internal::prefetch(_src + i);
         int32x4_t lo = vld1q_s32(_src + i);
         int32x4_t hi = vld1q_s32(_src + i + 4);

         uint16x8_t mid16 = vcombine_u16(vqmovun_s32(lo), vqmovun_s32(hi));

         vst1_u8(_dst + i, vqmovn_u16(mid16));
     }
})
871 #endif
872 
873 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 6 && !defined(__clang__)
// s32 -> s8 saturating narrow, 8 lanes per iteration.
// Inline-asm variant, used only for 32-bit ARM with GCC 4.x older than 4.6
// (see the surrounding #if) -- presumably a workaround for NEON intrinsics
// codegen issues in those compilers; TODO confirm. Two-stage signed
// saturation: vqmovn.s32 (s32 -> s16), then vqmovn.s16 (s16 -> s8).
CVT_FUNC(s32, s8, 8,
,
{
     for (size_t i = 0; i < w; i += 8)
     {
         internal::prefetch(_src + i);
         __asm__ (
             "vld1.32 {d0-d1}, [%[src1]]                              \n\t"
             "vld1.32 {d2-d3}, [%[src2]]                              \n\t"
             "vqmovn.s32 d4, q0                                       \n\t"  // saturate low 4 lanes to s16
             "vqmovn.s32 d5, q1                                       \n\t"  // saturate high 4 lanes to s16
             "vqmovn.s16  d6, q2                                      \n\t"  // saturate 8 x s16 to s8
             "vst1.8 {d6}, [%[dst]]                                   \n\t"
             : /*no output*/
             : [src1] "r" (_src + i + 0),
               [src2] "r" (_src + i + 4),
               [dst] "r" (_dst + i)
             : "d0","d1","d2","d3","d4","d5","d6"
         );
     }
})
895 #else
// s32 -> s8 saturating narrow, 8 lanes per iteration (intrinsics path).
// Two-stage signed saturation: s32 -> s16 then s16 -> s8.
CVT_FUNC(s32, s8, 8,
,
{
     for (size_t i = 0; i < w; i += 8)
     {
         internal::prefetch(_src + i);
         int32x4_t lo = vld1q_s32(_src + i);
         int32x4_t hi = vld1q_s32(_src + i + 4);

         int16x8_t mid16 = vcombine_s16(vqmovn_s32(lo), vqmovn_s32(hi));

         vst1_s8(_dst + i, vqmovn_s16(mid16));
     }
})
912 #endif
913 
914 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 6 && !defined(__clang__)
// s32 -> u16 saturating narrow, 8 lanes per iteration.
// Inline-asm variant, used only for 32-bit ARM with GCC 4.x older than 4.6
// (see the surrounding #if) -- presumably a workaround for NEON intrinsics
// codegen issues in those compilers; TODO confirm. vqmovun.s32 clamps
// negatives to 0 and values above 65535 to 65535.
CVT_FUNC(s32, u16, 8,
,
{
     for (size_t i = 0; i < w; i += 8)
     {
         internal::prefetch(_src + i);
         __asm__ (
             "vld1.32 {d0-d1}, [%[src1]]                              \n\t"
             "vld1.32 {d2-d3}, [%[src2]]                              \n\t"
             "vqmovun.s32 d4, q0                                      \n\t"  // saturate low 4 lanes
             "vqmovun.s32 d5, q1                                      \n\t"  // saturate high 4 lanes
             "vst1.16 {d4-d5}, [%[dst]]                               \n\t"  // store 8 x u16
             : /*no output*/
             : [src1] "r" (_src + i + 0),
               [src2] "r" (_src + i + 4),
               [dst] "r" (_dst + i)
             : "d0","d1","d2","d3","d4","d5"
         );
     }
})
935 #else
// s32 -> u16 saturating narrow, 8 lanes per iteration (intrinsics path).
// vqmovun_s32 clamps negatives to 0 and values above 65535 to 65535.
CVT_FUNC(s32, u16, 8,
,
{
     for (size_t i = 0; i < w; i += 8)
     {
         internal::prefetch(_src + i);
         int32x4_t lo = vld1q_s32(_src + i);
         int32x4_t hi = vld1q_s32(_src + i + 4);

         vst1q_u16(_dst + i, vcombine_u16(vqmovun_s32(lo), vqmovun_s32(hi)));
     }
})
951 #endif
952 
953 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 6 && !defined(__clang__)
// s32 -> s16 saturating narrow, 8 lanes per iteration.
// Inline-asm variant, used only for 32-bit ARM with GCC 4.x older than 4.6
// (see the surrounding #if). vqmovn.s32 saturates each s32 lane into s16.
// Fix: the final store previously used "vst1.8" although d4-d5 hold 8 x 16-bit
// lanes; the element size must be 16, as in the s32 -> u16 twin above. On
// little-endian ARM both encodings store identical bytes, but vst1.16 is the
// correct lane annotation and is required for big-endian correctness.
CVT_FUNC(s32, s16, 8,
,
{
     for (size_t i = 0; i < w; i += 8)
     {
         internal::prefetch(_src + i);
         __asm__ (
             "vld1.32 {d0-d1}, [%[src1]]                              \n\t"
             "vld1.32 {d2-d3}, [%[src2]]                              \n\t"
             "vqmovn.s32 d4, q0                                       \n\t"  // saturate low 4 lanes
             "vqmovn.s32 d5, q1                                       \n\t"  // saturate high 4 lanes
             "vst1.16 {d4-d5}, [%[dst]]                               \n\t"  // store 8 x s16
             : /*no output*/
             : [src1] "r" (_src + i + 0),
               [src2] "r" (_src + i + 4),
               [dst] "r" (_dst + i)
             : "d0","d1","d2","d3","d4","d5"
         );
     }
})
974 #else
// s32 -> s16 saturating narrow, 8 lanes per iteration (intrinsics path).
// vqmovn_s32 clamps each lane to the [-32768, 32767] range of s16.
CVT_FUNC(s32, s16, 8,
,
{
     for (size_t i = 0; i < w; i += 8)
     {
         internal::prefetch(_src + i);
         int32x4_t lo = vld1q_s32(_src + i);
         int32x4_t hi = vld1q_s32(_src + i + 4);

         vst1q_s16(_dst + i, vcombine_s16(vqmovn_s32(lo), vqmovn_s32(hi)));
     }
})
990 #endif
991 
992 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 6 && !defined(__clang__)
// s32 -> f32 conversion, 8 lanes per iteration.
// Inline-asm variant, used only for 32-bit ARM with GCC 4.x older than 4.6
// (see the surrounding #if) -- presumably a workaround for NEON intrinsics
// codegen issues in those compilers; TODO confirm. The 8 lanes are handled as
// two independent 4-lane asm statements (low half, then high half).
CVT_FUNC(s32, f32, 8,
,
{
     for (size_t i = 0; i < w; i += 8)
     {
         internal::prefetch(_src + i);
         // Lanes 0..3
         __asm__ (
             "vld1.32 {d0-d1}, [%[src]]                              \n\t"
             "vcvt.f32.s32 q1, q0                                    \n\t"
             "vst1.32 {d2-d3}, [%[dst]]                              \n\t"
             : /*no output*/
             : [src] "r" (_src + i),
               [dst] "r" (_dst + i)
             : "d0","d1","d2","d3"//,"d4","d5"
         );
         // Lanes 4..7
         __asm__ (
             "vld1.32 {d0-d1}, [%[src]]                              \n\t"
             "vcvt.f32.s32 q1, q0                                    \n\t"
             "vst1.32 {d2-d3}, [%[dst]]                              \n\t"
             : /*no output*/
             : [src] "r" (_src + i + 4),
               [dst] "r" (_dst + i + 4)
             : "d0","d1","d2","d3"//,"d4","d5"
         );
     }
})
1019 #else
// s32 -> f32 conversion, 8 lanes per iteration (intrinsics path).
// Handled as two independent 4-lane halves.
CVT_FUNC(s32, f32, 8,
,
{
     for (size_t i = 0; i < w; i += 8)
     {
         internal::prefetch(_src + i);
         // Lanes 0..3
         vst1q_f32(_dst + i, vcvtq_f32_s32(vld1q_s32(_src + i)));
         // Lanes 4..7
         vst1q_f32(_dst + i + 4, vcvtq_f32_s32(vld1q_s32(_src + i + 4)));
     }
})
1035 #endif
1036 
1037 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 6 && !defined(__clang__)
// f32 -> u8 conversion with rounding, 8 lanes per iteration.
// Inline-asm variant, used only for 32-bit ARM with GCC 4.x older than 4.6
// (see the surrounding #if).
// Scheme: scale by 2^16 and truncate to u32 fixed point (bit 16 = bit 0 of
// the integer part, bits 15..0 = fraction). The vbic/vshr/vqsub sequence
// subtracts 1 exactly when the integer part is even, so the final rounding
// narrow (vqrshrn #16, which rounds half up) breaks ties to even --
// i.e. round-to-nearest-even. vqmovn.u16 then saturates to u8.
CVT_FUNC(f32, u8, 8,
    register float32x4_t vmult asm ("q0") = vdupq_n_f32((float)(1 << 16));  // fixed-point scale 2^16
    register uint32x4_t  vmask asm ("q1") = vdupq_n_u32(1<<16);,            // mask for bit 0 of the integer part
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"
            "vmul.f32 q4, q2, q0                                     \n\t"  // x * 2^16
            "vmul.f32 q5, q3, q0                                     \n\t"
            "vcvt.u32.f32 q6, q4                                     \n\t"  // truncate to u32 fixed point
            "vcvt.u32.f32 q7, q5                                     \n\t"
            "vbic q8, q1, q6                                         \n\t"  // mask & ~x: bit16 set iff integer part even
            "vbic q9, q1, q7                                         \n\t"
            "vshr.u32 q10, q8, #16                                   \n\t"  // -> 0 or 1
            "vshr.u32 q11, q9, #16                                   \n\t"
            "vqsub.u32 q12, q6, q10                                  \n\t"  // subtract 1 when integer part is even
            "vqsub.u32 q13, q7, q11                                  \n\t"
            "vqrshrn.u32 d28, q12, #16                               \n\t"  // rounding narrow back to integer
            "vqrshrn.u32 d29, q13, #16                               \n\t"
            "vqmovn.u16 d30, q14                                     \n\t"  // saturate to u8
            "vst1.8 {d30}, [%[dst]]                                  \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vmult), "w" (vmask)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30"
        );
     }
})
1071 #else
// f32 -> u8 conversion with rounding, 8 lanes per iteration (intrinsics path).
// Scheme: scale by 2^16 and truncate to u32 fixed point (bit 16 = bit 0 of the
// integer part, bits 15..0 = fraction); subtract 1 when the integer part is
// even so the rounding narrow below (round half up) breaks ties to even,
// i.e. round-to-nearest-even; finally saturate to u8.
CVT_FUNC(f32, u8, 8,
    float32x4_t vmult = vdupq_n_f32((float)(1 << 16));
    uint32x4_t  vmask = vdupq_n_u32(1<<16);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t lo_f = vld1q_f32(_src + i);
        float32x4_t hi_f = vld1q_f32(_src + i + 4);

        // x * 2^16, truncated to unsigned fixed point.
        uint32x4_t lo_fx = vcvtq_u32_f32(vmulq_f32(lo_f, vmult));
        uint32x4_t hi_fx = vcvtq_u32_f32(vmulq_f32(hi_f, vmult));

        // bic(mask, x) keeps bit 16 iff the integer part is even; shift it
        // down to a 0/1 correction and subtract it (saturating).
        uint32x4_t lo_even = vshrq_n_u32(vbicq_u32(vmask, lo_fx), 16);
        uint32x4_t hi_even = vshrq_n_u32(vbicq_u32(vmask, hi_fx), 16);
        uint32x4_t lo_adj = vqsubq_u32(lo_fx, lo_even);
        uint32x4_t hi_adj = vqsubq_u32(hi_fx, hi_even);

        // Rounding narrow back to the integer range, then saturate to u8.
        uint16x8_t packed = vcombine_u16(vqrshrn_n_u32(lo_adj, 16),
                                         vqrshrn_n_u32(hi_adj, 16));
        vst1_u8(_dst + i, vqmovn_u16(packed));
     }
})
1102 #endif
1103 
1104 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 6 && !defined(__clang__)
// f32 -> s8 conversion, 8 lanes per iteration.
// Inline-asm variant, used only for 32-bit ARM with GCC 4.x older than 4.6
// (see the surrounding #if). Adds 0.5 then truncates toward zero (vcvt), so
// positive inputs round half-up; NOTE(review): negative inputs are biased
// toward zero by this scheme -- intentional upstream behavior, kept as-is.
// Saturation is two-stage: s32 -> s16 -> s8 via vqmovn.
CVT_FUNC(f32, s8, 8,
     register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);,  // rounding bias, pinned to q0 for the asm body
{
     for (size_t i = 0; i < w; i += 8)
     {
         internal::prefetch(_src + i);
         __asm__ (
             "vld1.32 {d2-d3}, [%[src1]]                              \n\t"
             "vld1.32 {d4-d5}, [%[src2]]                              \n\t"
             "vadd.f32 q3, q1, q0                                     \n\t"  // x + 0.5
             "vadd.f32 q4, q2, q0                                     \n\t"
             "vcvt.s32.f32 q5, q3                                     \n\t"  // truncate toward zero
             "vcvt.s32.f32 q6, q4                                     \n\t"
             "vqmovn.s32 d14, q5                                      \n\t"  // saturate to s16
             "vqmovn.s32 d15, q6                                      \n\t"
             "vqmovn.s16 d16, q7                                      \n\t"  // saturate to s8
             "vst1.8 {d16}, [%[dst]]                                  \n\t"
             : /*no output*/
             : [src1] "r" (_src + i + 0),
               [src2] "r" (_src + i + 4),
               [dst] "r" (_dst + i),
               "w" (vhalf)
             : "d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17"
         );
     }
})
1131 #else
// f32 -> s8 conversion, 8 lanes per iteration (intrinsics path).
// Adds 0.5 then truncates toward zero (vcvt); NOTE(review): this rounds
// half-up for positive inputs but is biased toward zero for negatives --
// matches the inline-asm variant, kept as-is.
CVT_FUNC(f32, s8, 8,
     float32x4_t vhalf = vdupq_n_f32(0.5f);,
{
     for (size_t i = 0; i < w; i += 8)
     {
         internal::prefetch(_src + i);
         float32x4_t lo = vaddq_f32(vld1q_f32(_src + i), vhalf);
         float32x4_t hi = vaddq_f32(vld1q_f32(_src + i + 4), vhalf);

         // Truncate, then saturate in two stages: s32 -> s16 -> s8.
         int16x4_t lo_s16 = vqmovn_s32(vcvtq_s32_f32(lo));
         int16x4_t hi_s16 = vqmovn_s32(vcvtq_s32_f32(hi));

         vst1_s8(_dst + i, vqmovn_s16(vcombine_s16(lo_s16, hi_s16)));
     }
})
1154 #endif
1155 
1156 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 6 && !defined(__clang__)
// f32 -> u16 conversion, 8 lanes per iteration.
// Inline-asm variant, used only for 32-bit ARM with GCC 4.x older than 4.6
// (see the surrounding #if). Adds 0.5 then truncates (vcvt.u32.f32), then
// saturates u32 -> u16 with vqmovn. The 8 lanes are handled as two
// independent 4-lane asm statements.
CVT_FUNC(f32, u16, 8,
     register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);,  // rounding bias, pinned to q0 for the asm bodies
{
     for (size_t i = 0; i < w; i += 8)
     {
         internal::prefetch(_src + i);
         // Lanes 0..3
         __asm__ (
             "vld1.32 {d2-d3}, [%[src]]                               \n\t"
             "vadd.f32 q2, q1, q0                                     \n\t"
             "vcvt.u32.f32 q3, q2                                     \n\t"
             "vqmovn.u32 d8, q3                                       \n\t"
             "vst1.16 {d8}, [%[dst]]                                  \n\t"
             : /*no output*/
             : [src] "r" (_src + i),
               [dst] "r" (_dst + i),
               "w" (vhalf)
             : "d2","d3","d4","d5","d6","d7","d8"
         );
         // Lanes 4..7
         __asm__ (
             "vld1.32 {d2-d3}, [%[src]]                               \n\t"
             "vadd.f32 q2, q1, q0                                     \n\t"
             "vcvt.u32.f32 q3, q2                                     \n\t"
             "vqmovn.u32 d8, q3                                       \n\t"
             "vst1.16 {d8}, [%[dst]]                                  \n\t"
             : /*no output*/
             : [src] "r" (_src + i + 4),
               [dst] "r" (_dst + i + 4),
               "w" (vhalf)
             : "d2","d3","d4","d5","d6","d7","d8"
         );
     }
})
1189 #else
// f32 -> u16 conversion, 8 lanes per iteration (intrinsics path).
// Adds 0.5, truncates to u32, then saturates to u16; processed as two
// independent 4-lane halves.
CVT_FUNC(f32, u16, 8,
     float32x4_t vhalf = vdupq_n_f32(0.5f);,
{
     for (size_t i = 0; i < w; i += 8)
     {
         internal::prefetch(_src + i);

         // Lanes 0..3
         float32x4_t lo = vaddq_f32(vld1q_f32(_src + i), vhalf);
         vst1_u16(_dst + i, vqmovn_u32(vcvtq_u32_f32(lo)));

         // Lanes 4..7
         float32x4_t hi = vaddq_f32(vld1q_f32(_src + i + 4), vhalf);
         vst1_u16(_dst + i + 4, vqmovn_u32(vcvtq_u32_f32(hi)));
     }
})
1213 #endif
1214 
1215 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 6 && !defined(__clang__)
// f32 -> s16 conversion, 8 lanes per iteration.
// Inline-asm variant, used only for 32-bit ARM with GCC 4.x older than 4.6
// (see the surrounding #if). Adds 0.5 then truncates toward zero
// (vcvt.s32.f32) and saturates s32 -> s16 with vqmovn; NOTE(review): negative
// inputs are biased toward zero by this scheme -- intentional upstream
// behavior. The 8 lanes are handled as two independent 4-lane asm statements.
CVT_FUNC(f32, s16, 8,
     register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);,  // rounding bias, pinned to q0 for the asm bodies
{
     for (size_t i = 0; i < w; i += 8)
     {
         internal::prefetch(_src + i);
         // Lanes 0..3
         __asm__ (
             "vld1.32 {d2-d3}, [%[src]]                               \n\t"
             "vadd.f32 q2, q1, q0                                     \n\t"
             "vcvt.s32.f32 q3, q2                                     \n\t"
             "vqmovn.s32 d8, q3                                       \n\t"
             "vst1.16 {d8}, [%[dst]]                                  \n\t"
             : /*no output*/
             : [src] "r" (_src + i),
               [dst] "r" (_dst + i),
               "w" (vhalf)
             : "d2","d3","d4","d5","d6","d7","d8"
         );
         // Lanes 4..7
         __asm__ (
             "vld1.32 {d2-d3}, [%[src]]                               \n\t"
             "vadd.f32 q2, q1, q0                                     \n\t"
             "vcvt.s32.f32 q3, q2                                     \n\t"
             "vqmovn.s32 d8, q3                                       \n\t"
             "vst1.16 {d8}, [%[dst]]                                  \n\t"
             : /*no output*/
             : [src] "r" (_src + i + 4),
               [dst] "r" (_dst + i + 4),
               "w" (vhalf)
             : "d2","d3","d4","d5","d6","d7","d8"
         );
     }
})
1248 #else
// f32 -> s16 conversion, 8 lanes per iteration (intrinsics path).
// Adds 0.5, truncates toward zero, then saturates s32 -> s16; NOTE(review):
// negative inputs are biased toward zero -- matches the inline-asm variant.
CVT_FUNC(f32, s16, 8,
     float32x4_t vhalf = vdupq_n_f32(0.5f);,
{
     for (size_t i = 0; i < w; i += 8)
     {
         internal::prefetch(_src + i);

         // Lanes 0..3
         float32x4_t lo = vaddq_f32(vld1q_f32(_src + i), vhalf);
         vst1_s16(_dst + i, vqmovn_s32(vcvtq_s32_f32(lo)));

         // Lanes 4..7
         float32x4_t hi = vaddq_f32(vld1q_f32(_src + i + 4), vhalf);
         vst1_s16(_dst + i + 4, vqmovn_s32(vcvtq_s32_f32(hi)));
     }
})
1272 #endif
1273 
1274 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 6 && !defined(__clang__)
// f32 -> s32 conversion, 8 lanes per iteration.
// Inline-asm variant, used only for 32-bit ARM with GCC 4.x older than 4.6
// (see the surrounding #if). Adds 0.5 then truncates toward zero
// (vcvt.s32.f32); NOTE(review): negative inputs are biased toward zero by
// this scheme -- intentional upstream behavior, kept consistent with the
// other f32 -> integer kernels.
CVT_FUNC(f32, s32, 8,
     register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);,  // rounding bias, pinned to q0 for the asm body
{
     for (size_t i = 0; i < w; i += 8)
     {
         internal::prefetch(_src + i);
         __asm__ (
             "vld1.32 {d2-d3}, [%[src1]]                              \n\t"
             "vld1.32 {d4-d5}, [%[src2]]                              \n\t"
             "vadd.f32 q3, q1, q0                                     \n\t"  // x + 0.5
             "vadd.f32 q4, q2, q0                                     \n\t"
             "vcvt.s32.f32 q5, q3                                     \n\t"  // truncate toward zero
             "vcvt.s32.f32 q6, q4                                     \n\t"
             "vst1.32 {q5}, [%[dst1]]                                 \n\t"
             "vst1.32 {q6}, [%[dst2]]                                 \n\t"
             : /*no output*/
             : [src1] "r" (_src + i),
               [src2] "r" (_src + i + 4),
               [dst1] "r" (_dst + i),
               [dst2] "r" (_dst + i + 4),
               "w" (vhalf)
             : "d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
         );
     }
})
1300 #else
// f32 -> s32 conversion, 8 lanes per iteration (intrinsics path).
// Adds 0.5 then truncates toward zero; NOTE(review): negative inputs are
// biased toward zero -- matches the inline-asm variant, kept as-is.
CVT_FUNC(f32, s32, 8,
     float32x4_t vhalf = vdupq_n_f32(0.5f);,
{
     for (size_t i = 0; i < w; i += 8)
     {
         internal::prefetch(_src + i);

         // Lanes 0..3
         float32x4_t lo = vaddq_f32(vld1q_f32(_src + i), vhalf);
         vst1q_s32(_dst + i, vcvtq_s32_f32(lo));

         // Lanes 4..7
         float32x4_t hi = vaddq_f32(vld1q_f32(_src + i + 4), vhalf);
         vst1q_s32(_dst + i + 4, vcvtq_s32_f32(hi));
     }
})
1322 #endif
1323 
// u8 -> s16 widening conversion entry point.
// All u8 values are non-negative and fit in 15 bits, so widening them to s16
// produces the same bit pattern as widening to u16; this overload therefore
// simply forwards to the u8 -> u16 implementation with the destination
// pointer reinterpreted as u16.
void convert(const Size2D &_size,
             const u8 * srcBase, ptrdiff_t srcStride,
             s16 * dstBase, ptrdiff_t dstStride)
{
    convert(_size, srcBase, srcStride, (u16*)dstBase, dstStride);
}
1330 
1331 } // namespace CAROTENE_NS
1332