1 // Copyright 2021 Google LLC 2 // 3 // This source code is licensed under the BSD-style license found in the 4 // LICENSE file in the root directory of this source tree. 5 6 #pragma once 7 8 #include <stddef.h> 9 #include <stdint.h> 10 11 #include <xnnpack/common.h> 12 13 #ifdef __cplusplus 14 extern "C" { 15 #endif 16 17 #define DECLARE_XX_TRANSPOSEV_UKERNEL_FUNCTION(fn_name) \ 18 XNN_INTERNAL void fn_name(const void* input, \ 19 void* output, \ 20 size_t input_row_stride, \ 21 size_t output_row_stride, \ 22 size_t input_element_stride, \ 23 size_t output_element_stride, \ 24 size_t element_size, \ 25 size_t block_width, \ 26 size_t block_height); 27 28 DECLARE_XX_TRANSPOSEV_UKERNEL_FUNCTION(xnn_xx_transposev_ukernel__1x1_memcpy) 29 30 #define DECLARE_X64_TRANSPOSEC_UKERNEL_FUNCTION(fn_name) \ 31 XNN_INTERNAL void fn_name(const uint64_t* input, \ 32 uint64_t* output, \ 33 size_t input_stride, \ 34 size_t output_stride, \ 35 size_t block_width, \ 36 size_t block_height); 37 38 DECLARE_X64_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x64_transposec_ukernel__1x2_scalar_float) 39 DECLARE_X64_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x64_transposec_ukernel__1x2_scalar_int) 40 DECLARE_X64_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x64_transposec_ukernel__2x1_scalar_float) 41 DECLARE_X64_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x64_transposec_ukernel__2x1_scalar_int) 42 DECLARE_X64_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x64_transposec_ukernel__2x2_multi_mov_sse2) 43 DECLARE_X64_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x64_transposec_ukernel__2x2_multi_multi_sse2) 44 DECLARE_X64_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x64_transposec_ukernel__2x2_multi_switch_sse2) 45 DECLARE_X64_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x64_transposec_ukernel__2x2_reuse_mov_sse2) 46 DECLARE_X64_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x64_transposec_ukernel__2x2_reuse_multi_sse2) 47 DECLARE_X64_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x64_transposec_ukernel__2x2_reuse_switch_sse2) 48 DECLARE_X64_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x64_transposec_ukernel__2x2_scalar_float) 49 DECLARE_X64_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x64_transposec_ukernel__2x2_scalar_int) 50 DECLARE_X64_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x64_transposec_ukernel__4x1_scalar_float) 51 DECLARE_X64_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x64_transposec_ukernel__4x1_scalar_int) 52 DECLARE_X64_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x64_transposec_ukernel__4x2_scalar_float) 53 DECLARE_X64_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x64_transposec_ukernel__4x2_scalar_int) 54 55 #define DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(fn_name) \ 56 XNN_INTERNAL void fn_name(const uint32_t* input, \ 57 uint32_t* output, \ 58 size_t input_stride, \ 59 size_t output_stride, \ 60 size_t block_width, \ 61 size_t block_height); 62 63 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__1x2_scalar_float) 64 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__1x2_scalar_int) 65 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__1x4_scalar_float) 66 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__1x4_scalar_int) 67 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__2x1_scalar_float) 68 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__2x1_scalar_int) 69 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__2x2_multi_dec_zip_neon) 70 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__2x2_multi_mov_zip_neon) 71 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__2x2_multi_multi_zip_neon) 72 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__2x2_multi_switch_zip_neon) 73 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__2x2_reuse_dec_zip_neon) 74 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__2x2_reuse_mov_zip_neon) 75 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__2x2_reuse_multi_zip_neon) 76 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__2x2_reuse_switch_zip_neon) 77 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__2x2_scalar_float) 78 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__2x2_scalar_int) 79 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__2x4_scalar_float) 80 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__2x4_scalar_int) 81 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x1_scalar_float) 82 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x1_scalar_int) 83 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x2_scalar_float) 84 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x2_scalar_int) 85 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_aarch64_neon_tbl) 86 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_multi_dec_zip_neon) 87 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_multi_mov_sse2) 88 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_multi_mov_wasmsimd) 89 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_multi_mov_zip_neon) 90 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_multi_multi_sse2) 91 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_multi_multi_wasmsimd) 92 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_multi_multi_zip_neon) 93 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_multi_switch_sse2) 94 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_multi_switch_wasmsimd) 95 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_multi_switch_zip_neon) 96 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_reuse_dec_zip_neon) 97 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_reuse_mov_sse2) 98 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_reuse_mov_wasmsimd) 99 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_reuse_mov_zip_neon) 100 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_reuse_multi_sse2) 101 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_reuse_multi_wasmsimd) 102 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_reuse_multi_zip_neon) 103 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_reuse_switch_sse2) 104 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_reuse_switch_wasmsimd) 105 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_reuse_switch_zip_neon) 106 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_scalar_float) 107 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_scalar_int) 108 DECLARE_X32_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x32_transposec_ukernel__4x4_sse) 109 110 #define DECLARE_X24_TRANSPOSEC_UKERNEL_FUNCTION(fn_name) \ 111 XNN_INTERNAL void fn_name(const void* input, \ 112 void* output, \ 113 size_t input_stride, \ 114 size_t output_stride, \ 115 size_t block_width, \ 116 size_t block_height); 117 118 DECLARE_X24_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x24_transposec_ukernel__1x2_scalar) 119 DECLARE_X24_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x24_transposec_ukernel__1x4_scalar) 120 DECLARE_X24_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x24_transposec_ukernel__2x1_scalar) 121 DECLARE_X24_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x24_transposec_ukernel__2x2_neon_tbl) 122 DECLARE_X24_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x24_transposec_ukernel__2x2_scalar) 123 DECLARE_X24_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x24_transposec_ukernel__2x4_scalar) 124 DECLARE_X24_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x24_transposec_ukernel__4x1_scalar) 125 DECLARE_X24_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x24_transposec_ukernel__4x2_scalar) 126 DECLARE_X24_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x24_transposec_ukernel__4x4_aarch64_neon_tbl) 127 DECLARE_X24_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x24_transposec_ukernel__4x4_scalar) 128 DECLARE_X24_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x24_transposec_ukernel__4x4_ssse3) 129 130 #define DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(fn_name) \ 131 XNN_INTERNAL void fn_name(const uint16_t* input, \ 132 uint16_t* output, \ 133 size_t input_stride, \ 134 size_t output_stride, \ 135 size_t block_width, \ 136 size_t block_height); 137 138 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__1x2_scalar_int) 139 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__1x4_scalar_int) 140 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__2x1_scalar_int) 141 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__2x2_scalar_int) 142 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__2x4_scalar_int) 143 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__4x1_scalar_int) 144 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__4x2_scalar_int) 145 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__4x4_multi_dec_zip_neon) 146 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__4x4_multi_mov_zip_neon) 147 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__4x4_multi_multi_zip_neon) 148 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__4x4_multi_switch_zip_neon) 149 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__4x4_reuse_dec_zip_neon) 150 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__4x4_reuse_mov_zip_neon) 151 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__4x4_reuse_multi_zip_neon) 152 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__4x4_reuse_switch_zip_neon) 153 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__4x4_scalar_int) 154 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__4x8_sse2) 155 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__8x8_multi_dec_zip_neon) 156 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__8x8_multi_mov_sse2) 157 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__8x8_multi_mov_wasmsimd) 158 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__8x8_multi_mov_zip_neon) 159 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__8x8_multi_switch_sse2) 160 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__8x8_multi_switch_wasmsimd) 161 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__8x8_multi_switch_zip_neon) 162 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__8x8_reuse_dec_zip_neon) 163 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__8x8_reuse_mov_sse2) 164 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__8x8_reuse_mov_wasmsimd) 165 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__8x8_reuse_mov_zip_neon) 166 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__8x8_reuse_multi_sse2) 167 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__8x8_reuse_multi_wasmsimd) 168 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__8x8_reuse_multi_zip_neon) 169 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__8x8_reuse_switch_sse2) 170 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__8x8_reuse_switch_wasmsimd) 171 DECLARE_X16_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x16_transposec_ukernel__8x8_reuse_switch_zip_neon) 172 173 #define DECLARE_X8_TRANSPOSEC_UKERNEL_FUNCTION(fn_name) \ 174 XNN_INTERNAL void fn_name(const uint8_t* input, \ 175 uint8_t* output, \ 176 size_t input_stride, \ 177 size_t output_stride, \ 178 size_t block_width, \ 179 size_t block_height); 180 181 DECLARE_X8_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x8_transposec_ukernel__1x2_scalar_int) 182 DECLARE_X8_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x8_transposec_ukernel__1x4_scalar_int) 183 DECLARE_X8_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x8_transposec_ukernel__2x1_scalar_int) 184 DECLARE_X8_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x8_transposec_ukernel__2x2_scalar_int) 185 DECLARE_X8_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x8_transposec_ukernel__2x4_scalar_int) 186 DECLARE_X8_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x8_transposec_ukernel__4x1_scalar_int) 187 DECLARE_X8_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x8_transposec_ukernel__4x2_scalar_int) 188 DECLARE_X8_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x8_transposec_ukernel__4x4_scalar_int) 189 DECLARE_X8_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x8_transposec_ukernel__8x8_multi_dec_zip_neon) 190 DECLARE_X8_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x8_transposec_ukernel__8x8_multi_mov_zip_neon) 191 DECLARE_X8_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x8_transposec_ukernel__8x8_multi_switch_zip_neon) 192 DECLARE_X8_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x8_transposec_ukernel__8x8_reuse_dec_zip_neon) 193 DECLARE_X8_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x8_transposec_ukernel__8x8_reuse_mov_zip_neon) 194 DECLARE_X8_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x8_transposec_ukernel__8x8_reuse_multi_zip_neon) 195 DECLARE_X8_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x8_transposec_ukernel__8x8_reuse_switch_zip_neon) 196 DECLARE_X8_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x8_transposec_ukernel__16x16_reuse_dec_zip_neon) 197 DECLARE_X8_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x8_transposec_ukernel__16x16_reuse_mov_sse2) 198 DECLARE_X8_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x8_transposec_ukernel__16x16_reuse_mov_wasmsimd) 199 DECLARE_X8_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x8_transposec_ukernel__16x16_reuse_mov_zip_neon) 200 DECLARE_X8_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x8_transposec_ukernel__16x16_reuse_switch_sse2) 201 DECLARE_X8_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x8_transposec_ukernel__16x16_reuse_switch_wasmsimd) 202 DECLARE_X8_TRANSPOSEC_UKERNEL_FUNCTION(xnn_x8_transposec_ukernel__16x16_reuse_switch_zip_neon) 203 204 #ifdef __cplusplus 205 } // extern "C" 206 #endif 207