1 /* 2 3 Copyright (c) 2013 STMicroelectronics 4 Written by Christophe Lyon 5 6 Permission is hereby granted, free of charge, to any person obtaining a copy 7 of this software and associated documentation files (the "Software"), to deal 8 in the Software without restriction, including without limitation the rights 9 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 copies of the Software, and to permit persons to whom the Software is 11 furnished to do so, subject to the following conditions: 12 13 The above copyright notice and this permission notice shall be included in 14 all copies or substantial portions of the Software. 15 16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 THE SOFTWARE. 23 24 */ 25 26 #if defined(__arm__) || defined(__aarch64__) 27 #include <arm_neon.h> 28 #else 29 #include "stm-arm-neon.h" 30 #endif 31 #include "stm-arm-neon-ref.h" 32 33 /* Initialization helpers; 4 slices are needed for vld2, vld3 and 34 vld4. */ 35 #define MY_INIT_TAB(T,W,N) xNAME(INIT_TAB,N)(T##W##_t) 36 #define MY_INIT_TAB2(T,W,N) xNAME(INIT_TAB2,N)(T##W##_t) 37 #define MY_INIT_TAB3(T,W,N) xNAME(INIT_TAB3,N)(T##W##_t) 38 #define MY_INIT_TAB4(T,W,N) xNAME(INIT_TAB4,N)(T##W##_t) 39 40 /* Initialized input buffers. */ 41 #define VECT_VAR_DECL_INIT(V, T, W, N) \ 42 VECT_VAR_DECL(V,T,W,N) [] = { MY_INIT_TAB(T,W,N) }; 43 44 /* Specialized initializer with 4 entries, as used by vldX_dup and 45 vdup tests, which iterate 4 times on input buffers. */ 46 #define VECT_VAR_DECL_INIT4(V, T, W, N) \ 47 VECT_VAR_DECL(V,T,W,N) [] = { MY_INIT_TAB(T,W,4) }; 48 49 /* Initializers for arrays of vectors. */ 50 #define VECT_ARRAY_INIT2(V, T, W, N) \ 51 T##W##_t VECT_ARRAY_VAR(V,T,W,N,2)[] = \ 52 { MY_INIT_TAB(T,W,N) \ 53 MY_INIT_TAB2(T,W,N) }; 54 55 #define VECT_ARRAY_INIT3(V, T, W, N) \ 56 T##W##_t VECT_ARRAY_VAR(V,T,W,N,3)[] = \ 57 { MY_INIT_TAB(T,W,N) \ 58 MY_INIT_TAB2(T,W,N) \ 59 MY_INIT_TAB3(T,W,N) }; 60 61 #define VECT_ARRAY_INIT4(V, T, W, N) \ 62 T##W##_t VECT_ARRAY_VAR(V,T,W,N,4)[] = \ 63 { MY_INIT_TAB(T,W,N) \ 64 MY_INIT_TAB2(T,W,N) \ 65 MY_INIT_TAB3(T,W,N) \ 66 MY_INIT_TAB4(T,W,N) }; 67 68 /* Sample initialization vectors. */ 69 #define INIT_TAB_1(T) \ 70 (T)-16, 71 #define INIT_TAB2_1(T) \ 72 (T)-15, 73 #define INIT_TAB3_1(T) \ 74 (T)-14, 75 #define INIT_TAB4_1(T) \ 76 (T)-13, 77 78 #define INIT_TAB_2(T) \ 79 (T)-16, (T)-15, 80 #define INIT_TAB2_2(T) \ 81 (T)-14, (T)-13, 82 #define INIT_TAB3_2(T) \ 83 (T)-12, (T)-11, 84 #define INIT_TAB4_2(T) \ 85 (T)-10, (T)-9, 86 87 /* Initializer for vld3_lane tests. */ 88 #define INIT_TAB_3(T) \ 89 (T)-16, (T)-15, (T)-14, 90 91 #define INIT_TAB_4(T) \ 92 (T)-16, (T)-15, (T)-14, (T)-13, 93 #define INIT_TAB2_4(T) \ 94 (T)-12, (T)-11, (T)-10, (T)-9, 95 #define INIT_TAB3_4(T) \ 96 (T)-8, (T)-7, (T)-6, (T)-5, 97 #define INIT_TAB4_4(T) \ 98 (T)-4, (T)-3, (T)-2, (T)-1, 99 100 #define INIT_TAB_8(T) \ 101 (T)-16, (T)-15, (T)-14, (T)-13, (T)-12, (T)-11, (T)-10, (T)-9, 102 #define INIT_TAB2_8(T) \ 103 (T)-8, (T)-7, (T)-6, (T)-5, (T)-4, (T)-3, (T)-2, (T)-1, 104 #define INIT_TAB3_8(T) \ 105 (T)0, (T)1, (T)2, (T)3, (T)4, (T)5, (T)6, (T)7, 106 #define INIT_TAB4_8(T) \ 107 (T)8, (T)9, (T)10, (T)11, (T)12, (T)13, (T)14, (T)15, 108 109 #define INIT_TAB_16(T) \ 110 (T)-16, (T)-15, (T)-14, (T)-13, (T)-12, (T)-11, (T)-10, (T)-9, \ 111 (T)-8, (T)-7, (T)-6, (T)-5, (T)-4, (T)-3, (T)-2, (T)-1, 112 #define INIT_TAB2_16(T) \ 113 (T)0, (T)1, (T)2, (T)3, (T)4, (T)5, (T)6, (T)7, \ 114 (T)8, (T)9, (T)10, (T)11, (T)12, (T)13, (T)14, (T)15, 115 #define INIT_TAB3_16(T) \ 116 (T)16, (T)17, (T)18, (T)19, (T)20, (T)21, (T)22, (T)23, \ 117 (T)24, (T)25, (T)26, (T)27, (T)28, (T)29, (T)30, (T)31, 118 #define INIT_TAB4_16(T) \ 119 (T)32, (T)33, (T)34, (T)35, (T)36, (T)37, (T)38, (T)39, \ 120 (T)40, (T)41, (T)42, (T)43, (T)44, (T)45, (T)46, (T)47, 121 122 /* Input buffers, one of each size. */ 123 /* Insert some padding to try to exhibit out of bounds accesses. */ 124 VECT_VAR_DECL_INIT(buffer, int, 8, 8); 125 PAD(buffer_pad, int, 8, 8); 126 VECT_VAR_DECL_INIT(buffer, int, 16, 4); 127 PAD(buffer_pad, int, 16, 4); 128 VECT_VAR_DECL_INIT(buffer, int, 32, 2); 129 PAD(buffer_pad, int, 32, 2); 130 VECT_VAR_DECL_INIT(buffer, int, 64, 1); 131 PAD(buffer_pad, int, 64, 1); 132 VECT_VAR_DECL_INIT(buffer, uint, 8, 8); 133 PAD(buffer_pad, uint, 8, 8); 134 VECT_VAR_DECL_INIT(buffer, poly, 8, 8); 135 PAD(buffer_pad, poly, 8, 8); 136 VECT_VAR_DECL_INIT(buffer, poly, 16, 4); 137 PAD(buffer_pad, poly, 16, 4); 138 VECT_VAR_DECL_INIT(buffer, uint, 16, 4); 139 PAD(buffer_pad, uint, 16, 4); 140 VECT_VAR_DECL_INIT(buffer, uint, 32, 2); 141 PAD(buffer_pad, uint, 32, 2); 142 VECT_VAR_DECL_INIT(buffer, uint, 64, 1); 143 PAD(buffer_pad, uint, 64, 1); 144 VECT_VAR_DECL_INIT(buffer, float, 32, 2); 145 PAD(buffer_pad, float, 32, 2); 146 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) ) 147 /* We need a different initialization for ARMCC, because the compiler 148 performs the conversion to half-precision internal 149 representation. */ 150 #ifdef __ARMCC_VERSION 151 __fp16 buffer_float16x4[4] = {-16, -15, -14, -13}; 152 #else 153 VECT_VAR_DECL(buffer, float, 16, 4) [] = {0xcc00 /* -16 */, 0xcb80 /* -15 */, 154 0xcb00 /* -14 */, 0xca80 /* -13 */}; 155 #endif 156 PAD(buffer_pad, float, 16, 4); 157 #endif 158 VECT_VAR_DECL_INIT(buffer, int, 8, 16); 159 PAD(buffer_pad, int, 8, 16); 160 VECT_VAR_DECL_INIT(buffer, int, 16, 8); 161 PAD(buffer_pad, int, 16, 8); 162 VECT_VAR_DECL_INIT(buffer, int, 32, 4); 163 PAD(buffer_pad, int, 32, 4); 164 VECT_VAR_DECL_INIT(buffer, int, 64, 2); 165 PAD(buffer_pad, int, 64, 2); 166 VECT_VAR_DECL_INIT(buffer, uint, 8, 16); 167 PAD(buffer_pad, uint, 8, 16); 168 VECT_VAR_DECL_INIT(buffer, uint, 16, 8); 169 PAD(buffer_pad, uint, 16, 8); 170 VECT_VAR_DECL_INIT(buffer, uint, 32, 4); 171 PAD(buffer_pad, uint, 32, 4); 172 VECT_VAR_DECL_INIT(buffer, uint, 64, 2); 173 PAD(buffer_pad, uint, 64, 2); 174 VECT_VAR_DECL_INIT(buffer, poly, 8, 16); 175 PAD(buffer_pad, poly, 8, 16); 176 VECT_VAR_DECL_INIT(buffer, poly, 16, 8); 177 PAD(buffer_pad, poly, 16, 8); 178 VECT_VAR_DECL_INIT(buffer, float, 32, 4); 179 PAD(buffer_pad, float, 32, 4); 180 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) ) 181 #ifdef __ARMCC_VERSION 182 __fp16 buffer_float16x8[8] = {-16, -15, -14, -13, -12, -11, -10, -9}; 183 #else 184 VECT_VAR_DECL(buffer, float, 16, 8) [] = {0xcc00 /* -16 */, 0xcb80 /* -15 */, 185 0xcb00 /* -14 */, 0xca80 /* -13 */, 186 0xca00 /* -12 */, 0xc980 /* -11 */, 187 0xc900 /* -10 */, 0xc880 /* -9 */}; 188 #endif 189 PAD(buffer_pad, float, 16, 8); 190 #endif 191 192 /* The tests for vld1_dup and vdup expect at least 4 entries in the 193 input buffer, so force 1- and 2-elements initializers to have 4 194 entries. */ 195 VECT_VAR_DECL_INIT(buffer_dup, int, 8, 8); 196 VECT_VAR_DECL(buffer_dup_pad, int, 8, 8); 197 VECT_VAR_DECL_INIT(buffer_dup, int, 16, 4); 198 VECT_VAR_DECL(buffer_dup_pad, int, 16, 4); 199 VECT_VAR_DECL_INIT4(buffer_dup, int, 32, 2); 200 VECT_VAR_DECL(buffer_dup_pad, int, 32, 2); 201 VECT_VAR_DECL_INIT4(buffer_dup, int, 64, 1); 202 VECT_VAR_DECL(buffer_dup_pad, int, 64, 1); 203 VECT_VAR_DECL_INIT(buffer_dup, uint, 8, 8); 204 VECT_VAR_DECL(buffer_dup_pad, uint, 8, 8); 205 VECT_VAR_DECL_INIT(buffer_dup, uint, 16, 4); 206 VECT_VAR_DECL(buffer_dup_pad, uint, 16, 4); 207 VECT_VAR_DECL_INIT4(buffer_dup, uint, 32, 2); 208 VECT_VAR_DECL(buffer_dup_pad, uint, 32, 2); 209 VECT_VAR_DECL_INIT4(buffer_dup, uint, 64, 1); 210 VECT_VAR_DECL(buffer_dup_pad, uint, 64, 1); 211 VECT_VAR_DECL_INIT(buffer_dup, poly, 8, 8); 212 VECT_VAR_DECL(buffer_dup_pad, poly, 8, 8); 213 VECT_VAR_DECL_INIT(buffer_dup, poly, 16, 4); 214 VECT_VAR_DECL(buffer_dup_pad, poly, 16, 4); 215 VECT_VAR_DECL_INIT4(buffer_dup, float, 32, 2); 216 VECT_VAR_DECL(buffer_dup_pad, float, 32, 2); 217 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) ) 218 #ifdef __ARMCC_VERSION 219 __fp16 buffer_dup_float16x4[4] = {-16, -15, -14, -13}; 220 #else 221 VECT_VAR_DECL(buffer_dup, float, 16, 4)[] = {0xcc00 /* -16 */, 0xcb80 /* -15 */, 222 0xcb00 /* -14 */, 0xca80 /* -13 */}; 223 #endif 224 PAD(buffer_dup_pad, float, 16, 4); 225 #endif 226 VECT_VAR_DECL_INIT(buffer_dup, int, 8, 16); 227 VECT_VAR_DECL(buffer_dup_pad, int, 8, 16); 228 VECT_VAR_DECL_INIT(buffer_dup, int, 16, 8); 229 VECT_VAR_DECL(buffer_dup_pad, int, 16, 8); 230 VECT_VAR_DECL_INIT(buffer_dup, int, 32, 4); 231 VECT_VAR_DECL(buffer_dup_pad, int, 32, 4); 232 VECT_VAR_DECL_INIT4(buffer_dup, int, 64, 2); 233 VECT_VAR_DECL(buffer_dup_pad, int, 64, 2); 234 VECT_VAR_DECL_INIT(buffer_dup, uint, 8, 16); 235 VECT_VAR_DECL(buffer_dup_pad, uint, 8, 16); 236 VECT_VAR_DECL_INIT(buffer_dup, uint, 16, 8); 237 VECT_VAR_DECL(buffer_dup_pad, uint, 16, 8); 238 VECT_VAR_DECL_INIT(buffer_dup, uint, 32, 4); 239 VECT_VAR_DECL(buffer_dup_pad, uint, 32, 4); 240 VECT_VAR_DECL_INIT4(buffer_dup, uint, 64, 2); 241 VECT_VAR_DECL(buffer_dup_pad, uint, 64, 2); 242 VECT_VAR_DECL_INIT(buffer_dup, poly, 8, 16); 243 VECT_VAR_DECL(buffer_dup_pad, poly, 8, 16); 244 VECT_VAR_DECL_INIT(buffer_dup, poly, 16, 8); 245 VECT_VAR_DECL(buffer_dup_pad, poly, 16, 8); 246 VECT_VAR_DECL_INIT(buffer_dup, float, 32, 4); 247 VECT_VAR_DECL(buffer_dup_pad, float, 32, 4); 248 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) ) 249 #ifdef __ARMCC_VERSION 250 __fp16 buffer_dup_float16x8[8] = {-16, -15, -14, -13, -12, -11, -10, -9}; 251 #else 252 VECT_VAR_DECL(buffer_dup, float, 16, 8)[] = {0xcc00 /* -16 */, 0xcb80 /* -15 */, 253 0xcb00 /* -14 */, 0xca80 /* -13 */, 254 0xca00 /* -12 */, 0xc980 /* -11 */, 255 0xc900 /* -10 */, 0xc880 /* -9 */}; 256 #endif 257 PAD(buffer_dup_pad, float, 16, 8); 258 #endif 259 260 /* Input buffers for vld2, 1 of each size */ 261 VECT_ARRAY_INIT2(buffer_vld2, int, 8, 8); 262 PAD(buffer_vld2_pad, int, 8, 8); 263 VECT_ARRAY_INIT2(buffer_vld2, int, 16, 4); 264 PAD(buffer_vld2_pad, int, 16, 4); 265 VECT_ARRAY_INIT2(buffer_vld2, int, 32, 2); 266 PAD(buffer_vld2_pad, int, 32, 2); 267 VECT_ARRAY_INIT2(buffer_vld2, int, 64, 1); 268 PAD(buffer_vld2_pad, int, 64, 1); 269 VECT_ARRAY_INIT2(buffer_vld2, uint, 8, 8); 270 PAD(buffer_vld2_pad, uint, 8, 8); 271 VECT_ARRAY_INIT2(buffer_vld2, uint, 16, 4); 272 PAD(buffer_vld2_pad, uint, 16, 4); 273 VECT_ARRAY_INIT2(buffer_vld2, uint, 32, 2); 274 PAD(buffer_vld2_pad, uint, 32, 2); 275 VECT_ARRAY_INIT2(buffer_vld2, uint, 64, 1); 276 PAD(buffer_vld2_pad, uint, 64, 1); 277 VECT_ARRAY_INIT2(buffer_vld2, poly, 8, 8); 278 PAD(buffer_vld2_pad, poly, 8, 8); 279 VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 4); 280 PAD(buffer_vld2_pad, poly, 16, 4); 281 VECT_ARRAY_INIT2(buffer_vld2, float, 32, 2); 282 PAD(buffer_vld2_pad, float, 32, 2); 283 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) ) 284 #ifdef __ARMCC_VERSION 285 __fp16 buffer_vld2_float16x4x2[4*2] = {-16, -15, -14, -13, -12, -11, -10, -9}; 286 #else 287 float16_t buffer_vld2_float16x4x2[4*2] = {0xcc00 /* -16 */, 0xcb80 /* -15 */, 288 0xcb00 /* -14 */, 0xca80 /* -13 */, 289 0xca00 /* -12 */, 0xc980 /* -11 */, 290 0xc900 /* -10 */, 0xc880 /* -9 */}; 291 #endif 292 PAD(buffer_vld2_pad, float, 16, 4); 293 #endif 294 VECT_ARRAY_INIT2(buffer_vld2, int, 8, 16); 295 PAD(buffer_vld2_pad, int, 8, 16); 296 VECT_ARRAY_INIT2(buffer_vld2, int, 16, 8); 297 PAD(buffer_vld2_pad, int, 16, 8); 298 VECT_ARRAY_INIT2(buffer_vld2, int, 32, 4); 299 PAD(buffer_vld2_pad, int, 32, 4); 300 VECT_ARRAY_INIT2(buffer_vld2, int, 64, 2); 301 PAD(buffer_vld2_pad, int, 64, 2); 302 VECT_ARRAY_INIT2(buffer_vld2, uint, 8, 16); 303 PAD(buffer_vld2_pad, uint, 8, 16); 304 VECT_ARRAY_INIT2(buffer_vld2, uint, 16, 8); 305 PAD(buffer_vld2_pad, uint, 16, 8); 306 VECT_ARRAY_INIT2(buffer_vld2, uint, 32, 4); 307 PAD(buffer_vld2_pad, uint, 32, 4); 308 VECT_ARRAY_INIT2(buffer_vld2, uint, 64, 2); 309 PAD(buffer_vld2_pad, uint, 64, 2); 310 VECT_ARRAY_INIT2(buffer_vld2, poly, 8, 16); 311 PAD(buffer_vld2_pad, poly, 8, 16); 312 VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 8); 313 PAD(buffer_vld2_pad, poly, 16, 8); 314 VECT_ARRAY_INIT2(buffer_vld2, float, 32, 4); 315 PAD(buffer_vld2_pad, float, 32, 4); 316 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) ) 317 #ifdef __ARMCC_VERSION 318 __fp16 buffer_vld2_float16x8x2[8*2] = {-16, -15, -14, -13, -12, -11, -10, -9, 319 -8, -7, -6, -5, -4, -3, -2, -1}; 320 #else 321 float16_t buffer_vld2_float16x8x2[8*2] = {0xcc00 /* -16 */, 0xcb80 /* -15 */, 322 0xcb00 /* -14 */, 0xca80 /* -13 */, 323 0xca00 /* -12 */, 0xc980 /* -11 */, 324 0xc900 /* -10 */, 0xc880 /* -9 */, 325 0xc800 /* -8 */, 0xc700 /* -7 */, 326 0xc600 /* -6 */, 0xc500 /* -5 */, 327 0xc400 /* -4 */, 0xc200 /* -3 */, 328 0xc000 /* -2 */, 0xbc00 /* -1 */}; 329 #endif 330 PAD(buffer_vld2_pad, float, 16, 8); 331 #endif 332 333 /* Input buffers for vld3, 1 of each size */ 334 VECT_ARRAY_INIT3(buffer_vld3, int, 8, 8); 335 PAD(buffer_vld3_pad, int, 8, 8); 336 VECT_ARRAY_INIT3(buffer_vld3, int, 16, 4); 337 PAD(buffer_vld3_pad, int, 16, 4); 338 VECT_ARRAY_INIT3(buffer_vld3, int, 32, 2); 339 PAD(buffer_vld3_pad, int, 32, 2); 340 VECT_ARRAY_INIT3(buffer_vld3, int, 64, 1); 341 PAD(buffer_vld3_pad, int, 64, 1); 342 VECT_ARRAY_INIT3(buffer_vld3, uint, 8, 8); 343 PAD(buffer_vld3_pad, uint, 8, 8); 344 VECT_ARRAY_INIT3(buffer_vld3, uint, 16, 4); 345 PAD(buffer_vld3_pad, uint, 16, 4); 346 VECT_ARRAY_INIT3(buffer_vld3, uint, 32, 2); 347 PAD(buffer_vld3_pad, uint, 32, 2); 348 VECT_ARRAY_INIT3(buffer_vld3, uint, 64, 1); 349 PAD(buffer_vld3_pad, uint, 64, 1); 350 VECT_ARRAY_INIT3(buffer_vld3, poly, 8, 8); 351 PAD(buffer_vld3_pad, poly, 8, 8); 352 VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 4); 353 PAD(buffer_vld3_pad, poly, 16, 4); 354 VECT_ARRAY_INIT3(buffer_vld3, float, 32, 2); 355 PAD(buffer_vld3_pad, float, 32, 2); 356 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) ) 357 #ifdef __ARMCC_VERSION 358 __fp16 buffer_vld3_float16x4x3[4*3] = {-16, -15, -14, -13, -12, -11, -10, -9, 359 -8, -7, -6, -5}; 360 #else 361 float16_t buffer_vld3_float16x4x3[4*3] = {0xcc00 /* -16 */, 0xcb80 /* -15 */, 362 0xcb00 /* -14 */, 0xca80 /* -13 */, 363 0xca00 /* -12 */, 0xc980 /* -11 */, 364 0xc900 /* -10 */, 0xc880 /* -9 */, 365 0xc800 /* -8 */, 0xc700 /* -7 */, 366 0xc600 /* -6 */, 0xc500 /* -5 */}; 367 #endif 368 PAD(buffer_vld3_pad, float, 16, 4); 369 #endif 370 VECT_ARRAY_INIT3(buffer_vld3, int, 8, 16); 371 PAD(buffer_vld3_pad, int, 8, 16); 372 VECT_ARRAY_INIT3(buffer_vld3, int, 16, 8); 373 PAD(buffer_vld3_pad, int, 16, 8); 374 VECT_ARRAY_INIT3(buffer_vld3, int, 32, 4); 375 PAD(buffer_vld3_pad, int, 32, 4); 376 VECT_ARRAY_INIT3(buffer_vld3, int, 64, 2); 377 PAD(buffer_vld3_pad, int, 64, 2); 378 VECT_ARRAY_INIT3(buffer_vld3, uint, 8, 16); 379 PAD(buffer_vld3_pad, uint, 8, 16); 380 VECT_ARRAY_INIT3(buffer_vld3, uint, 16, 8); 381 PAD(buffer_vld3_pad, uint, 16, 8); 382 VECT_ARRAY_INIT3(buffer_vld3, uint, 32, 4); 383 PAD(buffer_vld3_pad, uint, 32, 4); 384 VECT_ARRAY_INIT3(buffer_vld3, uint, 64, 2); 385 PAD(buffer_vld3_pad, uint, 64, 2); 386 VECT_ARRAY_INIT3(buffer_vld3, poly, 8, 16); 387 PAD(buffer_vld3_pad, poly, 8, 16); 388 VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 8); 389 PAD(buffer_vld3_pad, poly, 16, 8); 390 VECT_ARRAY_INIT3(buffer_vld3, float, 32, 4); 391 PAD(buffer_vld3_pad, float, 32, 4); 392 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) ) 393 #ifdef __ARMCC_VERSION 394 __fp16 buffer_vld3_float16x8x3[8*3] = {-16, -15, -14, -13, -12, -11, -10, -9, 395 -8, -7, -6, -5, -4, -3, -2, -1, 396 0, 1, 2, 3, 4, 5, 6, 7}; 397 #else 398 float16_t buffer_vld3_float16x8x3[8*3] = {0xcc00 /* -16 */, 0xcb80 /* -15 */, 399 0xcb00 /* -14 */, 0xca80 /* -13 */, 400 0xca00 /* -12 */, 0xc980 /* -11 */, 401 0xc900 /* -10 */, 0xc880 /* -9 */, 402 0xc800 /* -8 */, 0xc700 /* -7 */, 403 0xc600 /* -6 */, 0xc500 /* -6 */, 404 0xc400 /* -4 */, 0xc200 /* -3 */, 405 0xc000 /* -2 */, 0xbc00 /* -1 */, 406 0, 0x3c00 /* 1 */, 407 0x4000 /* 2 */, 0x4200 /* 3 */, 408 0x4400 /* 4 */, 0x4500 /* 5 */, 409 0x4600 /* 6 */, 0x4700 /* 7 */}; 410 #endif 411 PAD(buffer_vld3_pad, float, 16, 8); 412 #endif 413 414 /* Input buffers for vld4, 1 of each size */ 415 VECT_ARRAY_INIT4(buffer_vld4, int, 8, 8); 416 PAD(buffer_vld4_pad, int, 8, 8); 417 VECT_ARRAY_INIT4(buffer_vld4, int, 16, 4); 418 PAD(buffer_vld4_pad, int, 16, 4); 419 VECT_ARRAY_INIT4(buffer_vld4, int, 32, 2); 420 PAD(buffer_vld4_pad, int, 32, 2); 421 VECT_ARRAY_INIT4(buffer_vld4, int, 64, 1); 422 PAD(buffer_vld4_pad, int, 64, 1); 423 VECT_ARRAY_INIT4(buffer_vld4, uint, 8, 8); 424 PAD(buffer_vld4_pad, uint, 8, 8); 425 VECT_ARRAY_INIT4(buffer_vld4, uint, 16, 4); 426 PAD(buffer_vld4_pad, uint, 16, 4); 427 VECT_ARRAY_INIT4(buffer_vld4, uint, 32, 2); 428 PAD(buffer_vld4_pad, uint, 32, 2); 429 VECT_ARRAY_INIT4(buffer_vld4, uint, 64, 1); 430 PAD(buffer_vld4_pad, uint, 64, 1); 431 VECT_ARRAY_INIT4(buffer_vld4, poly, 8, 8); 432 PAD(buffer_vld4_pad, poly, 8, 8); 433 VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 4); 434 PAD(buffer_vld4_pad, poly, 16, 4); 435 VECT_ARRAY_INIT4(buffer_vld4, float, 32, 2); 436 PAD(buffer_vld4_pad, float, 32, 2); 437 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) ) 438 #ifdef __ARMCC_VERSION 439 __fp16 buffer_vld4_float16x4x4[4*4] = {-16, -15, -14, -13, -12, -11, -10, -9, 440 -8, -7, -6, -5, -4, -3, -2, -1}; 441 #else 442 float16_t buffer_vld4_float16x4x4[4*4] = {0xcc00 /* -16 */, 0xcb80 /* -15 */, 443 0xcb00 /* -14 */, 0xca80 /* -13 */, 444 0xca00 /* -12 */, 0xc980 /* -11 */, 445 0xc900 /* -10 */, 0xc880 /* -9 */, 446 0xc800 /* -8 */, 0xc700 /* -7 */, 447 0xc600 /* -6 */, 0xc500 /* -5 */, 448 0xc400 /* -4 */, 0xc200 /* -3 */, 449 0xc000 /* -2 */, 0xbc00 /* -1 */}; 450 #endif 451 PAD(buffer_vld4_pad, float, 16, 4); 452 #endif 453 VECT_ARRAY_INIT4(buffer_vld4, int, 8, 16); 454 PAD(buffer_vld4_pad, int, 8, 16); 455 VECT_ARRAY_INIT4(buffer_vld4, int, 16, 8); 456 PAD(buffer_vld4_pad, int, 16, 8); 457 VECT_ARRAY_INIT4(buffer_vld4, int, 32, 4); 458 PAD(buffer_vld4_pad, int, 32, 4); 459 VECT_ARRAY_INIT4(buffer_vld4, int, 64, 2); 460 PAD(buffer_vld4_pad, int, 64, 2); 461 VECT_ARRAY_INIT4(buffer_vld4, uint, 8, 16); 462 PAD(buffer_vld4_pad, uint, 8, 16); 463 VECT_ARRAY_INIT4(buffer_vld4, uint, 16, 8); 464 PAD(buffer_vld4_pad, uint, 16, 8); 465 VECT_ARRAY_INIT4(buffer_vld4, uint, 32, 4); 466 PAD(buffer_vld4_pad, uint, 32, 4); 467 VECT_ARRAY_INIT4(buffer_vld4, uint, 64, 2); 468 PAD(buffer_vld4_pad, uint, 64, 2); 469 VECT_ARRAY_INIT4(buffer_vld4, poly, 8, 16); 470 PAD(buffer_vld4_pad, poly, 8, 16); 471 VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 8); 472 PAD(buffer_vld4_pad, poly, 16, 8); 473 VECT_ARRAY_INIT4(buffer_vld4, float, 32, 4); 474 PAD(buffer_vld4_pad, float, 32, 4); 475 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) ) 476 #ifdef __ARMCC_VERSION 477 __fp16 buffer_vld4_float16x8x4[8*4] = {-16, -15, -14, -13, -12, -11, -10, -9, 478 -8, -7, -6, -5, -4, -3, -2, -1, 479 0, 1, 2, 3, 4, 5, 6, 7, 480 8, 9, 10, 11, 12, 13, 14, 15}; 481 #else 482 float16_t buffer_vld4_float16x8x4[8*4] = {0xcc00 /* -16 */, 0xcb80 /* -15 */, 483 0xcb00 /* -14 */, 0xca80 /* -13 */, 484 0xca00 /* -12 */, 0xc980 /* -11 */, 485 0xc900 /* -10 */, 0xc880 /* -9 */, 486 0xc800 /* -8 */, 0xc700 /* -7 */, 487 0xc600 /* -6 */, 0xc500 /* -6 */, 488 0xc400 /* -4 */, 0xc200 /* -3 */, 489 0xc000 /* -2 */, 0xbc00 /* -1 */, 490 0, 0x3c00 /* 1 */, 491 0x4000 /* 2 */, 0x4200 /* 3 */, 492 0x4400 /* 4 */, 0x4500 /* 5 */, 493 0x4600 /* 6 */, 0x4700 /* 7 */, 494 0x4800 /* 8 */, 0x4880 /* 9 */, 495 0x4900 /* 10 */, 0x4980 /* 11 */, 496 0x4a00 /* 12 */, 0x4a80 /* 13 */, 497 0x4b00 /* 14 */, 0x04b80 /* 15 */}; 498 #endif 499 PAD(buffer_vld4_pad, float, 16, 8); 500 #endif 501 502 /* Input buffers for vld2_lane */ 503 VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 8, 2); 504 VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 16, 2); 505 VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 32, 2); 506 VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 64, 2); 507 VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 8, 2); 508 VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 16, 2); 509 VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 32, 2); 510 VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 64, 2); 511 VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 8, 2); 512 VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 16, 2); 513 VECT_VAR_DECL_INIT(buffer_vld2_lane, float, 32, 2); 514 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) ) 515 #ifdef __ARMCC_VERSION 516 __fp16 buffer_vld2_lane_float16x2[2] = {-16, -15}; 517 #else 518 VECT_VAR_DECL(buffer_vld2_lane, float, 16, 2) [] = {0xcc00 /* -16 */, 519 0xcb80 /* -15 */}; 520 #endif 521 #endif 522 523 /* Input buffers for vld3_lane */ 524 VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 8, 3); 525 VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 16, 3); 526 VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 32, 3); 527 VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 64, 3); 528 VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 8, 3); 529 VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 16, 3); 530 VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 32, 3); 531 VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 64, 3); 532 VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 8, 3); 533 VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 16, 3); 534 VECT_VAR_DECL_INIT(buffer_vld3_lane, float, 32, 3); 535 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) ) 536 #ifdef __ARMCC_VERSION 537 __fp16 buffer_vld3_lane_float16x3[3] = {-16, -15, -14}; 538 #else 539 VECT_VAR_DECL(buffer_vld3_lane, float, 16, 3) [] = {0xcc00 /* -16 */, 540 0xcb80 /* -15 */, 541 0xcb00 /* -14 */}; 542 #endif 543 #endif 544 545 /* Input buffers for vld4_lane */ 546 VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 8, 4); 547 VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 16, 4); 548 VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 32, 4); 549 VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 64, 4); 550 VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 8, 4); 551 VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 16, 4); 552 VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 32, 4); 553 VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 64, 4); 554 VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 8, 4); 555 VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 16, 4); 556 VECT_VAR_DECL_INIT(buffer_vld4_lane, float, 32, 4); 557 #if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) ) 558 #ifdef __ARMCC_VERSION 559 __fp16 buffer_vld4_lane_float16x4[4] = {-16, -15, -14, -13}; 560 #else 561 VECT_VAR_DECL(buffer_vld4_lane, float, 16, 4) [] = {0xcc00 /* -16 */, 562 0xcb80 /* -15 */, 563 0xcb00 /* -14 */, 564 0xca80 /* -13 */}; 565 #endif 566 #endif 567