/*
 *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef INCLUDE_LIBYUV_MACROS_MSA_H_
#define INCLUDE_LIBYUV_MACROS_MSA_H_

#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#include <msa.h>
#include <stdint.h>

#if (__mips_isa_rev >= 6)
#define LW(psrc)                                       \
  ({                                                   \
    const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \
    uint32_t val_m;                                    \
    asm volatile("lw %[val_m], %[psrc_lw_m] \n"        \
                 : [val_m] "=r"(val_m)                 \
                 : [psrc_lw_m] "m"(*psrc_lw_m));       \
    val_m;                                             \
  })

#if (__mips == 64)
#define LD(psrc)                                       \
  ({                                                   \
    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
    uint64_t val_m = 0;                                \
    asm volatile("ld %[val_m], %[psrc_ld_m] \n"        \
                 : [val_m] "=r"(val_m)                 \
                 : [psrc_ld_m] "m"(*psrc_ld_m));       \
    val_m;                                             \
  })
#else  // !(__mips == 64)
#define LD(psrc)                                                         \
  ({                                                                     \
    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc);                   \
    uint32_t val0_m, val1_m;                                             \
    uint64_t val_m = 0;                                                  \
    val0_m = LW(psrc_ld_m);                                              \
    val1_m = LW(psrc_ld_m + 4);                                          \
    val_m = (uint64_t)(val1_m);                             /* NOLINT */ \
    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
    val_m = (uint64_t)(val_m | (uint64_t)val0_m);           /* NOLINT */ \
    val_m;                                                               \
  })
#endif  // (__mips == 64)

#define SW(val, pdst)                                   \
  ({                                                    \
    uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
    uint32_t val_m = (val);                             \
    asm volatile("sw %[val_m], %[pdst_sw_m] \n"         \
                 : [pdst_sw_m] "=m"(*pdst_sw_m)         \
                 : [val_m] "r"(val_m));                 \
  })

#if (__mips == 64)
#define SD(val, pdst)                                   \
  ({                                                    \
    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
    uint64_t val_m = (val);                             \
    asm volatile("sd %[val_m], %[pdst_sd_m] \n"         \
                 : [pdst_sd_m] "=m"(*pdst_sd_m)         \
                 : [val_m] "r"(val_m));                 \
  })
#else  // !(__mips == 64)
#define SD(val, pdst)                                        \
  ({                                                         \
    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */      \
    uint32_t val0_m, val1_m;                                 \
    val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF);         \
    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
    SW(val0_m, pdst_sd_m);                                   \
    SW(val1_m, pdst_sd_m + 4);                               \
  })
#endif  // !(__mips == 64)
#else  // !(__mips_isa_rev >= 6)
#define LW(psrc)                                 \
  ({                                             \
    uint8_t* psrc_lw_m = (uint8_t*)(psrc);       \
    uint32_t val_lw_m;                           \
                                                 \
    __asm__ volatile(                            \
        "lwr %[val_lw_m], 0(%[psrc_lw_m])  \n\t" \
        "lwl %[val_lw_m], 3(%[psrc_lw_m])  \n\t" \
                                                 \
        : [val_lw_m] "=&r"(val_lw_m)             \
        : [psrc_lw_m] "r"(psrc_lw_m));           \
                                                 \
    val_lw_m;                                    \
  })

#if (__mips == 64)
#define LD(psrc)                                 \
  ({                                             \
    uint8_t* psrc_ld_m = (uint8_t*)(psrc);       \
    uint64_t val_ld_m = 0;                       \
                                                 \
    __asm__ volatile(                            \
        "ldr %[val_ld_m], 0(%[psrc_ld_m])  \n\t" \
        "ldl %[val_ld_m], 7(%[psrc_ld_m])  \n\t" \
                                                 \
        : [val_ld_m] "=&r"(val_ld_m)             \
        : [psrc_ld_m] "r"(psrc_ld_m));           \
                                                 \
    val_ld_m;                                    \
  })
#else  // !(__mips == 64)
#define LD(psrc)                                                         \
  ({                                                                     \
    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc);                   \
    uint32_t val0_m, val1_m;                                             \
    uint64_t val_m = 0;                                                  \
    val0_m = LW(psrc_ld_m);                                              \
    val1_m = LW(psrc_ld_m + 4);                                          \
    val_m = (uint64_t)(val1_m);                             /* NOLINT */ \
    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
    val_m = (uint64_t)(val_m | (uint64_t)val0_m);           /* NOLINT */ \
    val_m;                                                               \
  })
#endif  // (__mips == 64)

#define SW(val, pdst)                                   \
  ({                                                    \
    uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
    uint32_t val_m = (val);                             \
    asm volatile("usw %[val_m], %[pdst_sw_m] \n"        \
                 : [pdst_sw_m] "=m"(*pdst_sw_m)         \
                 : [val_m] "r"(val_m));                 \
  })

#define SD(val, pdst)                                        \
  ({                                                         \
    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */      \
    uint32_t val0_m, val1_m;                                 \
    val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF);         \
    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
    SW(val0_m, pdst_sd_m);                                   \
    SW(val1_m, pdst_sd_m + 4);                               \
  })
#endif  // (__mips_isa_rev >= 6)
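
/* Usage sketch (illustrative only; copy8 is a hypothetical helper, not part
   of this header): LW/LD/SW/SD hide the ISA differences selected above, so a
   possibly unaligned 8-byte copy reads the same on R6 and pre-R6 cores:

     static void copy8(uint8_t* dst, const uint8_t* src) {
       uint64_t v = LD(src);  // 8-byte load, unaligned-safe
       SD(v, dst);            // 8-byte store, unaligned-safe
     }
*/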

// TODO(fbarchard): Consider removing __VAR_ARGS versions.
#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
#define LD_UB(...) LD_B(const v16u8, __VA_ARGS__)

#define LD_H(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
#define LD_UH(...) LD_H(const v8u16, __VA_ARGS__)

#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)

#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)

/* Description : Load two vectors with 16 'byte' sized elements
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Load 16 byte elements in 'out0' from (psrc)
                 Load 16 byte elements in 'out1' from (psrc + stride)
*/
#define LD_B2(RTYPE, psrc, stride, out0, out1) \
  {                                            \
    out0 = LD_B(RTYPE, (psrc));                \
    out1 = LD_B(RTYPE, (psrc) + stride);       \
  }
#define LD_UB2(...) LD_B2(const v16u8, __VA_ARGS__)

#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
  {                                                        \
    LD_B2(RTYPE, (psrc), stride, out0, out1);              \
    LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
  }
#define LD_UB4(...) LD_B4(const v16u8, __VA_ARGS__)

/* Description : Store two vectors with stride each having 16 'byte' sized
                 elements
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 16 byte elements from 'in0' to (pdst)
                 Store 16 byte elements from 'in1' to (pdst + stride)
*/
#define ST_B2(RTYPE, in0, in1, pdst, stride) \
  {                                          \
    ST_B(RTYPE, in0, (pdst));                \
    ST_B(RTYPE, in1, (pdst) + stride);       \
  }
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)

#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)   \
  {                                                      \
    ST_B2(RTYPE, in0, in1, (pdst), stride);              \
    ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
  }
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)

/* Description : Store vectors of 8 halfword elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 8 halfword elements from 'in0' to (pdst)
                 Store 8 halfword elements from 'in1' to (pdst + stride)
*/
#define ST_H2(RTYPE, in0, in1, pdst, stride) \
  {                                          \
    ST_H(RTYPE, in0, (pdst));                \
    ST_H(RTYPE, in1, (pdst) + stride);       \
  }
#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)

// TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly.
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
                 'out0' as per control vector 'mask0'
                 Byte elements from 'in2' & 'in3' are copied selectively to
                 'out1' as per control vector 'mask1'
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)  \
  {                                                                   \
    out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
    out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
  }
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)

/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'
                 Left half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out1'
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)           \
  {                                                     \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
  }
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
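
/* Usage sketch (illustrative only; interleave_uv16 is a hypothetical helper,
   not part of this header): load 16 'U' bytes and 16 'V' bytes, interleave
   them byte-wise into U,V pairs with ILVRL_B2_UB, and store the two result
   vectors with ST_UB2:

     static void interleave_uv16(const uint8_t* src_u, const uint8_t* src_v,
                                 uint8_t* dst_uv) {
       v16u8 u, v, uv0, uv1;
       u = LD_UB(src_u);              // 16 'U' bytes
       v = LD_UB(src_v);              // 16 'V' bytes
       ILVRL_B2_UB(v, u, uv0, uv1);   // uv0 = low U,V pairs; uv1 = high pairs
       ST_UB2(uv0, uv1, dst_uv, 16);  // 32 bytes of interleaved UV
     }
*/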

#endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */

#endif  // INCLUDE_LIBYUV_MACROS_MSA_H_