; Cost-model tests for x86 vector integer conversions (zext/sext/trunc)
; between i8/i16/i32 element types, comparing SSE2 against SSE4.1.
; Each function loads a vector, converts it, and stores the result; the
; CHECK lines pin the cost the target transform info reports for the cast.
; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s
; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse4.1 -cost-model -analyze < %s | FileCheck --check-prefix=SSE41 %s

define void @zext_v16i16_to_v16i32(<16 x i16>* %a) {
; SSE2: zext_v16i16_to_v16i32
; SSE2: cost of 6 {{.*}} zext
;
; SSE41: zext_v16i16_to_v16i32
; SSE41: cost of 4 {{.*}} zext
;
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = zext <16 x i16> %1 to <16 x i32>
  store <16 x i32> %2, <16 x i32>* undef, align 4
  ret void
}

define void @sext_v16i16_to_v16i32(<16 x i16>* %a) {
; SSE2: sext_v16i16_to_v16i32
; SSE2: cost of 8 {{.*}} sext
;
; SSE41: sext_v16i16_to_v16i32
; SSE41: cost of 4 {{.*}} sext
;
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = sext <16 x i16> %1 to <16 x i32>
  store <16 x i32> %2, <16 x i32>* undef, align 4
  ret void
}

define void @zext_v8i16_to_v8i32(<8 x i16>* %a) {
; SSE2: zext_v8i16_to_v8i32
; SSE2: cost of 3 {{.*}} zext
;
; SSE41: zext_v8i16_to_v8i32
; SSE41: cost of 2 {{.*}} zext
;
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = zext <8 x i16> %1 to <8 x i32>
  store <8 x i32> %2, <8 x i32>* undef, align 4
  ret void
}

define void @sext_v8i16_to_v8i32(<8 x i16>* %a) {
; SSE2: sext_v8i16_to_v8i32
; SSE2: cost of 4 {{.*}} sext
;
; SSE41: sext_v8i16_to_v8i32
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = sext <8 x i16> %1 to <8 x i32>
  store <8 x i32> %2, <8 x i32>* undef, align 4
  ret void
}

define void @zext_v4i16_to_v4i32(<4 x i16>* %a) {
; SSE2: zext_v4i16_to_v4i32
; SSE2: cost of 1 {{.*}} zext
;
; SSE41: zext_v4i16_to_v4i32
; SSE41: cost of 1 {{.*}} zext
;
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = zext <4 x i16> %1 to <4 x i32>
  store <4 x i32> %2, <4 x i32>* undef, align 4
  ret void
}

define void @sext_v4i16_to_v4i32(<4 x i16>* %a) {
; SSE2: sext_v4i16_to_v4i32
; SSE2: cost of 2 {{.*}} sext
;
; SSE41: sext_v4i16_to_v4i32
; SSE41: cost of 1 {{.*}} sext
;
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = sext <4 x i16> %1 to <4 x i32>
  store <4 x i32> %2, <4 x i32>* undef, align 4
  ret void
}

define void @zext_v16i8_to_v16i32(<16 x i8>* %a) {
; SSE2: zext_v16i8_to_v16i32
; SSE2: cost of 9 {{.*}} zext
;
; SSE41: zext_v16i8_to_v16i32
; SSE41: cost of 4 {{.*}} zext
;
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = zext <16 x i8> %1 to <16 x i32>
  store <16 x i32> %2, <16 x i32>* undef, align 4
  ret void
}

define void @sext_v16i8_to_v16i32(<16 x i8>* %a) {
; SSE2: sext_v16i8_to_v16i32
; SSE2: cost of 12 {{.*}} sext
;
; SSE41: sext_v16i8_to_v16i32
; SSE41: cost of 4 {{.*}} sext
;
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = sext <16 x i8> %1 to <16 x i32>
  store <16 x i32> %2, <16 x i32>* undef, align 4
  ret void
}

define void @zext_v8i8_to_v8i32(<8 x i8>* %a) {
; SSE2: zext_v8i8_to_v8i32
; SSE2: cost of 6 {{.*}} zext
;
; SSE41: zext_v8i8_to_v8i32
; SSE41: cost of 2 {{.*}} zext
;
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = zext <8 x i8> %1 to <8 x i32>
  store <8 x i32> %2, <8 x i32>* undef, align 4
  ret void
}

define void @sext_v8i8_to_v8i32(<8 x i8>* %a) {
; SSE2: sext_v8i8_to_v8i32
; SSE2: cost of 6 {{.*}} sext
;
; SSE41: sext_v8i8_to_v8i32
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = sext <8 x i8> %1 to <8 x i32>
  store <8 x i32> %2, <8 x i32>* undef, align 4
  ret void
}

define void @zext_v4i8_to_v4i32(<4 x i8>* %a) {
; SSE2: zext_v4i8_to_v4i32
; SSE2: cost of 2 {{.*}} zext
;
; SSE41: zext_v4i8_to_v4i32
; SSE41: cost of 1 {{.*}} zext
;
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = zext <4 x i8> %1 to <4 x i32>
  store <4 x i32> %2, <4 x i32>* undef, align 4
  ret void
}

define void @sext_v4i8_to_v4i32(<4 x i8>* %a) {
; SSE2: sext_v4i8_to_v4i32
; SSE2: cost of 3 {{.*}} sext
;
; SSE41: sext_v4i8_to_v4i32
; SSE41: cost of 1 {{.*}} sext
;
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = sext <4 x i8> %1 to <4 x i32>
  store <4 x i32> %2, <4 x i32>* undef, align 4
  ret void
}

define void @zext_v16i8_to_v16i16(<16 x i8>* %a) {
; SSE2: zext_v16i8_to_v16i16
; SSE2: cost of 3 {{.*}} zext
;
; SSE41: zext_v16i8_to_v16i16
; SSE41: cost of 2 {{.*}} zext
;
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = zext <16 x i8> %1 to <16 x i16>
  store <16 x i16> %2, <16 x i16>* undef, align 4
  ret void
}

define void @sext_v16i8_to_v16i16(<16 x i8>* %a) {
; SSE2: sext_v16i8_to_v16i16
; SSE2: cost of 4 {{.*}} sext
;
; SSE41: sext_v16i8_to_v16i16
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = sext <16 x i8> %1 to <16 x i16>
  store <16 x i16> %2, <16 x i16>* undef, align 4
  ret void
}

define void @zext_v8i8_to_v8i16(<8 x i8>* %a) {
; SSE2: zext_v8i8_to_v8i16
; SSE2: cost of 1 {{.*}} zext
;
; SSE41: zext_v8i8_to_v8i16
; SSE41: cost of 1 {{.*}} zext
;
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = zext <8 x i8> %1 to <8 x i16>
  store <8 x i16> %2, <8 x i16>* undef, align 4
  ret void
}

define void @sext_v8i8_to_v8i16(<8 x i8>* %a) {
; SSE2: sext_v8i8_to_v8i16
; SSE2: cost of 2 {{.*}} sext
;
; SSE41: sext_v8i8_to_v8i16
; SSE41: cost of 1 {{.*}} sext
;
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = sext <8 x i8> %1 to <8 x i16>
  store <8 x i16> %2, <8 x i16>* undef, align 4
  ret void
}

define void @zext_v4i8_to_v4i16(<4 x i8>* %a) {
; SSE2: zext_v4i8_to_v4i16
; SSE2: cost of 1 {{.*}} zext
;
; SSE41: zext_v4i8_to_v4i16
; SSE41: cost of 1 {{.*}} zext
;
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = zext <4 x i8> %1 to <4 x i16>
  store <4 x i16> %2, <4 x i16>* undef, align 4
  ret void
}

define void @sext_v4i8_to_v4i16(<4 x i8>* %a) {
; SSE2: sext_v4i8_to_v4i16
; SSE2: cost of 6 {{.*}} sext
;
; SSE41: sext_v4i8_to_v4i16
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = sext <4 x i8> %1 to <4 x i16>
  store <4 x i16> %2, <4 x i16>* undef, align 4
  ret void
}

define void @truncate_v16i32_to_v16i16(<16 x i32>* %a) {
; SSE2: truncate_v16i32_to_v16i16
; SSE2: cost of 10 {{.*}} trunc
;
; SSE41: truncate_v16i32_to_v16i16
; SSE41: cost of 6 {{.*}} trunc
;
  %1 = load <16 x i32>, <16 x i32>* %a
  %2 = trunc <16 x i32> %1 to <16 x i16>
  store <16 x i16> %2, <16 x i16>* undef, align 4
  ret void
}

define void @truncate_v8i32_to_v8i16(<8 x i32>* %a) {
; SSE2: truncate_v8i32_to_v8i16
; SSE2: cost of 5 {{.*}} trunc
;
; SSE41: truncate_v8i32_to_v8i16
; SSE41: cost of 3 {{.*}} trunc
;
  %1 = load <8 x i32>, <8 x i32>* %a
  %2 = trunc <8 x i32> %1 to <8 x i16>
  store <8 x i16> %2, <8 x i16>* undef, align 4
  ret void
}

define void @truncate_v4i32_to_v4i16(<4 x i32>* %a) {
; SSE2: truncate_v4i32_to_v4i16
; SSE2: cost of 3 {{.*}} trunc
;
; SSE41: truncate_v4i32_to_v4i16
; SSE41: cost of 1 {{.*}} trunc
;
  %1 = load <4 x i32>, <4 x i32>* %a
  %2 = trunc <4 x i32> %1 to <4 x i16>
  store <4 x i16> %2, <4 x i16>* undef, align 4
  ret void
}

define void @truncate_v16i32_to_v16i8(<16 x i32>* %a) {
; SSE2: truncate_v16i32_to_v16i8
; SSE2: cost of 7 {{.*}} trunc
;
; SSE41: truncate_v16i32_to_v16i8
; SSE41: cost of 30 {{.*}} trunc
;
  %1 = load <16 x i32>, <16 x i32>* %a
  %2 = trunc <16 x i32> %1 to <16 x i8>
  store <16 x i8> %2, <16 x i8>* undef, align 4
  ret void
}

define void @truncate_v8i32_to_v8i8(<8 x i32>* %a) {
; SSE2: truncate_v8i32_to_v8i8
; SSE2: cost of 4 {{.*}} trunc
;
; SSE41: truncate_v8i32_to_v8i8
; SSE41: cost of 3 {{.*}} trunc
;
  %1 = load <8 x i32>, <8 x i32>* %a
  %2 = trunc <8 x i32> %1 to <8 x i8>
  store <8 x i8> %2, <8 x i8>* undef, align 4
  ret void
}

define void @truncate_v4i32_to_v4i8(<4 x i32>* %a) {
; SSE2: truncate_v4i32_to_v4i8
; SSE2: cost of 3 {{.*}} trunc
;
; SSE41: truncate_v4i32_to_v4i8
; SSE41: cost of 1 {{.*}} trunc
;
  %1 = load <4 x i32>, <4 x i32>* %a
  %2 = trunc <4 x i32> %1 to <4 x i8>
  store <4 x i8> %2, <4 x i8>* undef, align 4
  ret void
}

define void @truncate_v16i16_to_v16i8(<16 x i16>* %a) {
; SSE2: truncate_v16i16_to_v16i8
; SSE2: cost of 3 {{.*}} trunc
;
; SSE41: truncate_v16i16_to_v16i8
; SSE41: cost of 3 {{.*}} trunc
;
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = trunc <16 x i16> %1 to <16 x i8>
  store <16 x i8> %2, <16 x i8>* undef, align 4
  ret void
}

define void @truncate_v8i16_to_v8i8(<8 x i16>* %a) {
; SSE2: truncate_v8i16_to_v8i8
; SSE2: cost of 2 {{.*}} trunc
;
; SSE41: truncate_v8i16_to_v8i8
; SSE41: cost of 1 {{.*}} trunc
;
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = trunc <8 x i16> %1 to <8 x i8>
  store <8 x i8> %2, <8 x i8>* undef, align 4
  ret void
}

define void @truncate_v4i16_to_v4i8(<4 x i16>* %a) {
; SSE2: truncate_v4i16_to_v4i8
; SSE2: cost of 4 {{.*}} trunc
;
; SSE41: truncate_v4i16_to_v4i8
; SSE41: cost of 2 {{.*}} trunc
;
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = trunc <4 x i16> %1 to <4 x i8>
  store <4 x i8> %2, <4 x i8>* undef, align 4
  ret void
}