; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s
; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse4.1 -cost-model -analyze < %s | FileCheck --check-prefix=SSE41 %s

define void @zext_v4i8_to_v4i64(<4 x i8>* %a) {
; SSE2: zext_v4i8_to_v4i64
; SSE2: cost of 4 {{.*}} zext
;
; SSE41: zext_v4i8_to_v4i64
; SSE41: cost of 2 {{.*}} zext
;
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = zext <4 x i8> %1 to <4 x i64>
  store <4 x i64> %2, <4 x i64>* undef, align 4
  ret void
}

define void @sext_v4i8_to_v4i64(<4 x i8>* %a) {
; SSE2: sext_v4i8_to_v4i64
; SSE2: cost of 8 {{.*}} sext
;
; SSE41: sext_v4i8_to_v4i64
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = sext <4 x i8> %1 to <4 x i64>
  store <4 x i64> %2, <4 x i64>* undef, align 4
  ret void
}

define void @zext_v4i16_to_v4i64(<4 x i16>* %a) {
; SSE2: zext_v4i16_to_v4i64
; SSE2: cost of 3 {{.*}} zext
;
; SSE41: zext_v4i16_to_v4i64
; SSE41: cost of 2 {{.*}} zext
;
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = zext <4 x i16> %1 to <4 x i64>
  store <4 x i64> %2, <4 x i64>* undef, align 4
  ret void
}

define void @sext_v4i16_to_v4i64(<4 x i16>* %a) {
; SSE2: sext_v4i16_to_v4i64
; SSE2: cost of 10 {{.*}} sext
;
; SSE41: sext_v4i16_to_v4i64
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = sext <4 x i16> %1 to <4 x i64>
  store <4 x i64> %2, <4 x i64>* undef, align 4
  ret void
}

define void @zext_v4i32_to_v4i64(<4 x i32>* %a) {
; SSE2: zext_v4i32_to_v4i64
; SSE2: cost of 3 {{.*}} zext
;
; SSE41: zext_v4i32_to_v4i64
; SSE41: cost of 2 {{.*}} zext
;
  %1 = load <4 x i32>, <4 x i32>* %a
  %2 = zext <4 x i32> %1 to <4 x i64>
  store <4 x i64> %2, <4 x i64>* undef, align 4
  ret void
}

define void @sext_v4i32_to_v4i64(<4 x i32>* %a) {
; SSE2: sext_v4i32_to_v4i64
; SSE2: cost of 5 {{.*}} sext
;
; SSE41: sext_v4i32_to_v4i64
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <4 x i32>, <4 x i32>* %a
  %2 = sext <4 x i32> %1 to <4 x i64>
  store <4 x i64> %2, <4 x i64>* undef, align 4
  ret void
}
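
; Note on the i64 extension costs above: SSE4.1 adds the pmovzx/pmovsx
; family, which widens a vector in one instruction, which is presumably why
; every SSE41 check expects a flat cost of 2 (two instructions for the two
; 128-bit halves of the <4 x i64> result). SSE2 has no such instruction and
; must synthesize the extension from punpck interleaves (plus sign-bit
; replication for sext), so its estimates grow with the widening distance.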

define void @zext_v16i16_to_v16i32(<16 x i16>* %a) {
; SSE2: zext_v16i16_to_v16i32
; SSE2: cost of 6 {{.*}} zext
;
; SSE41: zext_v16i16_to_v16i32
; SSE41: cost of 4 {{.*}} zext
;
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = zext <16 x i16> %1 to <16 x i32>
  store <16 x i32> %2, <16 x i32>* undef, align 4
  ret void
}

define void @sext_v16i16_to_v16i32(<16 x i16>* %a) {
; SSE2: sext_v16i16_to_v16i32
; SSE2: cost of 8 {{.*}} sext
;
; SSE41: sext_v16i16_to_v16i32
; SSE41: cost of 4 {{.*}} sext
;
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = sext <16 x i16> %1 to <16 x i32>
  store <16 x i32> %2, <16 x i32>* undef, align 4
  ret void
}

define void @zext_v8i16_to_v8i32(<8 x i16>* %a) {
; SSE2: zext_v8i16_to_v8i32
; SSE2: cost of 3 {{.*}} zext
;
; SSE41: zext_v8i16_to_v8i32
; SSE41: cost of 2 {{.*}} zext
;
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = zext <8 x i16> %1 to <8 x i32>
  store <8 x i32> %2, <8 x i32>* undef, align 4
  ret void
}

define void @sext_v8i16_to_v8i32(<8 x i16>* %a) {
; SSE2: sext_v8i16_to_v8i32
; SSE2: cost of 4 {{.*}} sext
;
; SSE41: sext_v8i16_to_v8i32
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = sext <8 x i16> %1 to <8 x i32>
  store <8 x i32> %2, <8 x i32>* undef, align 4
  ret void
}

define void @zext_v4i16_to_v4i32(<4 x i16>* %a) {
; SSE2: zext_v4i16_to_v4i32
; SSE2: cost of 1 {{.*}} zext
;
; SSE41: zext_v4i16_to_v4i32
; SSE41: cost of 1 {{.*}} zext
;
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = zext <4 x i16> %1 to <4 x i32>
  store <4 x i32> %2, <4 x i32>* undef, align 4
  ret void
}

define void @sext_v4i16_to_v4i32(<4 x i16>* %a) {
; SSE2: sext_v4i16_to_v4i32
; SSE2: cost of 2 {{.*}} sext
;
; SSE41: sext_v4i16_to_v4i32
; SSE41: cost of 1 {{.*}} sext
;
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = sext <4 x i16> %1 to <4 x i32>
  store <4 x i32> %2, <4 x i32>* undef, align 4
  ret void
}

define void @zext_v16i8_to_v16i32(<16 x i8>* %a) {
; SSE2: zext_v16i8_to_v16i32
; SSE2: cost of 9 {{.*}} zext
;
; SSE41: zext_v16i8_to_v16i32
; SSE41: cost of 4 {{.*}} zext
;
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = zext <16 x i8> %1 to <16 x i32>
  store <16 x i32> %2, <16 x i32>* undef, align 4
  ret void
}

define void @sext_v16i8_to_v16i32(<16 x i8>* %a) {
; SSE2: sext_v16i8_to_v16i32
; SSE2: cost of 12 {{.*}} sext
;
; SSE41: sext_v16i8_to_v16i32
; SSE41: cost of 4 {{.*}} sext
;
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = sext <16 x i8> %1 to <16 x i32>
  store <16 x i32> %2, <16 x i32>* undef, align 4
  ret void
}

define void @zext_v8i8_to_v8i32(<8 x i8>* %a) {
; SSE2: zext_v8i8_to_v8i32
; SSE2: cost of 6 {{.*}} zext
;
; SSE41: zext_v8i8_to_v8i32
; SSE41: cost of 2 {{.*}} zext
;
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = zext <8 x i8> %1 to <8 x i32>
  store <8 x i32> %2, <8 x i32>* undef, align 4
  ret void
}

define void @sext_v8i8_to_v8i32(<8 x i8>* %a) {
; SSE2: sext_v8i8_to_v8i32
; SSE2: cost of 6 {{.*}} sext
;
; SSE41: sext_v8i8_to_v8i32
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = sext <8 x i8> %1 to <8 x i32>
  store <8 x i32> %2, <8 x i32>* undef, align 4
  ret void
}

define void @zext_v4i8_to_v4i32(<4 x i8>* %a) {
; SSE2: zext_v4i8_to_v4i32
; SSE2: cost of 2 {{.*}} zext
;
; SSE41: zext_v4i8_to_v4i32
; SSE41: cost of 1 {{.*}} zext
;
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = zext <4 x i8> %1 to <4 x i32>
  store <4 x i32> %2, <4 x i32>* undef, align 4
  ret void
}

define void @sext_v4i8_to_v4i32(<4 x i8>* %a) {
; SSE2: sext_v4i8_to_v4i32
; SSE2: cost of 3 {{.*}} sext
;
; SSE41: sext_v4i8_to_v4i32
; SSE41: cost of 1 {{.*}} sext
;
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = sext <4 x i8> %1 to <4 x i32>
  store <4 x i32> %2, <4 x i32>* undef, align 4
  ret void
}

define void @zext_v16i8_to_v16i16(<16 x i8>* %a) {
; SSE2: zext_v16i8_to_v16i16
; SSE2: cost of 3 {{.*}} zext
;
; SSE41: zext_v16i8_to_v16i16
; SSE41: cost of 2 {{.*}} zext
;
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = zext <16 x i8> %1 to <16 x i16>
  store <16 x i16> %2, <16 x i16>* undef, align 4
  ret void
}

define void @sext_v16i8_to_v16i16(<16 x i8>* %a) {
; SSE2: sext_v16i8_to_v16i16
; SSE2: cost of 4 {{.*}} sext
;
; SSE41: sext_v16i8_to_v16i16
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = sext <16 x i8> %1 to <16 x i16>
  store <16 x i16> %2, <16 x i16>* undef, align 4
  ret void
}

define void @zext_v8i8_to_v8i16(<8 x i8>* %a) {
; SSE2: zext_v8i8_to_v8i16
; SSE2: cost of 1 {{.*}} zext
;
; SSE41: zext_v8i8_to_v8i16
; SSE41: cost of 1 {{.*}} zext
;
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = zext <8 x i8> %1 to <8 x i16>
  store <8 x i16> %2, <8 x i16>* undef, align 4
  ret void
}

define void @sext_v8i8_to_v8i16(<8 x i8>* %a) {
; SSE2: sext_v8i8_to_v8i16
; SSE2: cost of 2 {{.*}} sext
;
; SSE41: sext_v8i8_to_v8i16
; SSE41: cost of 1 {{.*}} sext
;
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = sext <8 x i8> %1 to <8 x i16>
  store <8 x i16> %2, <8 x i16>* undef, align 4
  ret void
}

define void @zext_v4i8_to_v4i16(<4 x i8>* %a) {
; SSE2: zext_v4i8_to_v4i16
; SSE2: cost of 1 {{.*}} zext
;
; SSE41: zext_v4i8_to_v4i16
; SSE41: cost of 1 {{.*}} zext
;
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = zext <4 x i8> %1 to <4 x i16>
  store <4 x i16> %2, <4 x i16>* undef, align 4
  ret void
}

define void @sext_v4i8_to_v4i16(<4 x i8>* %a) {
; SSE2: sext_v4i8_to_v4i16
; SSE2: cost of 6 {{.*}} sext
;
; SSE41: sext_v4i8_to_v4i16
; SSE41: cost of 2 {{.*}} sext
;
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = sext <4 x i8> %1 to <4 x i16>
  store <4 x i16> %2, <4 x i16>* undef, align 4
  ret void
}
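
; The remaining tests cover the narrowing direction (trunc). In the
; extension checks above, note that every SSE2 sext cost is at least the
; matching zext cost: zero extension only needs interleaves with a zeroed
; register, while sign extension must also replicate the sign bit (e.g. via
; arithmetic shifts), which takes extra instructions when pmovsx is
; unavailable.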

define void @truncate_v16i32_to_v16i16(<16 x i32>* %a) {
; SSE2: truncate_v16i32_to_v16i16
; SSE2: cost of 10 {{.*}} trunc
;
; SSE41: truncate_v16i32_to_v16i16
; SSE41: cost of 6 {{.*}} trunc
;
  %1 = load <16 x i32>, <16 x i32>* %a
  %2 = trunc <16 x i32> %1 to <16 x i16>
  store <16 x i16> %2, <16 x i16>* undef, align 4
  ret void
}

define void @truncate_v8i32_to_v8i16(<8 x i32>* %a) {
; SSE2: truncate_v8i32_to_v8i16
; SSE2: cost of 5 {{.*}} trunc
;
; SSE41: truncate_v8i32_to_v8i16
; SSE41: cost of 3 {{.*}} trunc
;
  %1 = load <8 x i32>, <8 x i32>* %a
  %2 = trunc <8 x i32> %1 to <8 x i16>
  store <8 x i16> %2, <8 x i16>* undef, align 4
  ret void
}

define void @truncate_v4i32_to_v4i16(<4 x i32>* %a) {
; SSE2: truncate_v4i32_to_v4i16
; SSE2: cost of 3 {{.*}} trunc
;
; SSE41: truncate_v4i32_to_v4i16
; SSE41: cost of 1 {{.*}} trunc
;
  %1 = load <4 x i32>, <4 x i32>* %a
  %2 = trunc <4 x i32> %1 to <4 x i16>
  store <4 x i16> %2, <4 x i16>* undef, align 4
  ret void
}

define void @truncate_v16i32_to_v16i8(<16 x i32>* %a) {
; SSE2: truncate_v16i32_to_v16i8
; SSE2: cost of 7 {{.*}} trunc
;
; SSE41: truncate_v16i32_to_v16i8
; SSE41: cost of 7 {{.*}} trunc
;
  %1 = load <16 x i32>, <16 x i32>* %a
  %2 = trunc <16 x i32> %1 to <16 x i8>
  store <16 x i8> %2, <16 x i8>* undef, align 4
  ret void
}

define void @truncate_v8i32_to_v8i8(<8 x i32>* %a) {
; SSE2: truncate_v8i32_to_v8i8
; SSE2: cost of 4 {{.*}} trunc
;
; SSE41: truncate_v8i32_to_v8i8
; SSE41: cost of 3 {{.*}} trunc
;
  %1 = load <8 x i32>, <8 x i32>* %a
  %2 = trunc <8 x i32> %1 to <8 x i8>
  store <8 x i8> %2, <8 x i8>* undef, align 4
  ret void
}

define void @truncate_v4i32_to_v4i8(<4 x i32>* %a) {
; SSE2: truncate_v4i32_to_v4i8
; SSE2: cost of 3 {{.*}} trunc
;
; SSE41: truncate_v4i32_to_v4i8
; SSE41: cost of 1 {{.*}} trunc
;
  %1 = load <4 x i32>, <4 x i32>* %a
  %2 = trunc <4 x i32> %1 to <4 x i8>
  store <4 x i8> %2, <4 x i8>* undef, align 4
  ret void
}
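
; The single-instruction SSE41 estimates for the v4i32 truncations above are
; plausibly a lone byte shuffle (pshufb, available from SSSE3 onward, which
; -mattr=+sse4.1 implies); SSE2 has no arbitrary byte shuffle and must
; combine pack and shift/mask sequences, hence its higher estimates.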

define void @truncate_v16i16_to_v16i8(<16 x i16>* %a) {
; SSE2: truncate_v16i16_to_v16i8
; SSE2: cost of 3 {{.*}} trunc
;
; SSE41: truncate_v16i16_to_v16i8
; SSE41: cost of 3 {{.*}} trunc
;
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = trunc <16 x i16> %1 to <16 x i8>
  store <16 x i8> %2, <16 x i8>* undef, align 4
  ret void
}

define void @truncate_v8i16_to_v8i8(<8 x i16>* %a) {
; SSE2: truncate_v8i16_to_v8i8
; SSE2: cost of 2 {{.*}} trunc
;
; SSE41: truncate_v8i16_to_v8i8
; SSE41: cost of 1 {{.*}} trunc
;
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = trunc <8 x i16> %1 to <8 x i8>
  store <8 x i8> %2, <8 x i8>* undef, align 4
  ret void
}

define void @truncate_v4i16_to_v4i8(<4 x i16>* %a) {
; SSE2: truncate_v4i16_to_v4i8
; SSE2: cost of 4 {{.*}} trunc
;
; SSE41: truncate_v4i16_to_v4i8
; SSE41: cost of 2 {{.*}} trunc
;
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = trunc <4 x i16> %1 to <4 x i8>
  store <4 x i8> %2, <4 x i8>* undef, align 4
  ret void
}
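
; For reference, a classic SSE2 lowering of the v8i16 -> v8i8 truncation
; tested above is a two-instruction mask-and-pack sequence (a sketch, not
; checked compiler output; the constant-pool label is hypothetical):
;   pand     .LCPI0_0(%rip), %xmm0   ; clear the high byte of each word
;   packuswb %xmm0, %xmm0            ; pack the eight words into eight bytes
; which lines up with the SSE2 "cost of 2" expected for that test.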