; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr8 \
; RUN:   -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck -allow-deprecated-dag-overlap %s

; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr8 -disable-ppc-vsx-swap-removal \
; RUN:   -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck -allow-deprecated-dag-overlap \
; RUN:   -check-prefix=NOOPTSWAP %s

; RUN: llc -O3 -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
; RUN:   -verify-machineinstrs -ppc-vsr-nums-as-vr < %s | FileCheck -allow-deprecated-dag-overlap \
; RUN:   -check-prefix=CHECK-P9 --implicit-check-not xxswapd %s

; RUN: llc -O3 -mcpu=pwr9 -disable-ppc-vsx-swap-removal -mattr=-power9-vector \
; RUN:   -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu < %s \
; RUN:   | FileCheck -allow-deprecated-dag-overlap -check-prefix=NOOPTSWAP %s

; LH: 2016-11-17
; Updated align attribute from 16 to 8 to keep swap instructions tests.
; Changes have been made on little-endian to use lvx and stvx
; instructions instead of lxvd2x/xxswapd and xxswapd/stxvd2x for
; aligned vectors with elements up to 4 bytes

; This test was generated from the following source:
;
; #define N 4096
; int ca[N] __attribute__((aligned(16)));
; int cb[N] __attribute__((aligned(16)));
; int cc[N] __attribute__((aligned(16)));
; int cd[N] __attribute__((aligned(16)));
;
; void foo ()
; {
;   int i;
;   for (i = 0; i < N; i++) {
;     ca[i] = (cb[i] + cc[i]) * cd[i];
;   }
; }

@cb = common global [4096 x i32] zeroinitializer, align 8
@cc = common global [4096 x i32] zeroinitializer, align 8
@cd = common global [4096 x i32] zeroinitializer, align 8
@ca = common global [4096 x i32] zeroinitializer, align 8

; Vectorized and 4x-unrolled form of the loop above: four identical
; load/load/add/load/mul/store groups per iteration, stepping %index by 4
; each group and exiting when %index.next.3 reaches 4096.
define void @foo() {
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next.3, %vector.body ]
  %0 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 8
  %2 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index
  %3 = bitcast i32* %2 to <4 x i32>*
  %wide.load13 = load <4 x i32>, <4 x i32>* %3, align 8
  %4 = add nsw <4 x i32> %wide.load13, %wide.load
  %5 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index
  %6 = bitcast i32* %5 to <4 x i32>*
  %wide.load14 = load <4 x i32>, <4 x i32>* %6, align 8
  %7 = mul nsw <4 x i32> %4, %wide.load14
  %8 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index
  %9 = bitcast i32* %8 to <4 x i32>*
  store <4 x i32> %7, <4 x i32>* %9, align 8
  %index.next = add nuw nsw i64 %index, 4
  %10 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index.next
  %11 = bitcast i32* %10 to <4 x i32>*
  %wide.load.1 = load <4 x i32>, <4 x i32>* %11, align 8
  %12 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index.next
  %13 = bitcast i32* %12 to <4 x i32>*
  %wide.load13.1 = load <4 x i32>, <4 x i32>* %13, align 8
  %14 = add nsw <4 x i32> %wide.load13.1, %wide.load.1
  %15 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index.next
  %16 = bitcast i32* %15 to <4 x i32>*
  %wide.load14.1 = load <4 x i32>, <4 x i32>* %16, align 8
  %17 = mul nsw <4 x i32> %14, %wide.load14.1
  %18 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index.next
  %19 = bitcast i32* %18 to <4 x i32>*
  store <4 x i32> %17, <4 x i32>* %19, align 8
  %index.next.1 = add nuw nsw i64 %index.next, 4
  %20 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index.next.1
  %21 = bitcast i32* %20 to <4 x i32>*
  %wide.load.2 = load <4 x i32>, <4 x i32>* %21, align 8
  %22 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index.next.1
  %23 = bitcast i32* %22 to <4 x i32>*
  %wide.load13.2 = load <4 x i32>, <4 x i32>* %23, align 8
  %24 = add nsw <4 x i32> %wide.load13.2, %wide.load.2
  %25 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index.next.1
  %26 = bitcast i32* %25 to <4 x i32>*
  %wide.load14.2 = load <4 x i32>, <4 x i32>* %26, align 8
  %27 = mul nsw <4 x i32> %24, %wide.load14.2
  %28 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index.next.1
  %29 = bitcast i32* %28 to <4 x i32>*
  store <4 x i32> %27, <4 x i32>* %29, align 8
  %index.next.2 = add nuw nsw i64 %index.next.1, 4
  %30 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index.next.2
  %31 = bitcast i32* %30 to <4 x i32>*
  %wide.load.3 = load <4 x i32>, <4 x i32>* %31, align 8
  %32 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index.next.2
  %33 = bitcast i32* %32 to <4 x i32>*
  %wide.load13.3 = load <4 x i32>, <4 x i32>* %33, align 8
  %34 = add nsw <4 x i32> %wide.load13.3, %wide.load.3
  %35 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index.next.2
  %36 = bitcast i32* %35 to <4 x i32>*
  %wide.load14.3 = load <4 x i32>, <4 x i32>* %36, align 8
  %37 = mul nsw <4 x i32> %34, %wide.load14.3
  %38 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index.next.2
  %39 = bitcast i32* %38 to <4 x i32>*
  store <4 x i32> %37, <4 x i32>* %39, align 8
  %index.next.3 = add nuw nsw i64 %index.next.2, 4
  %40 = icmp eq i64 %index.next.3, 4096
  br i1 %40, label %for.end, label %vector.body

for.end:
  ret void
}

; With swap removal (default), no xxswapd/xxpermdi should remain; each of the
; four unrolled groups is three loads, an add, a mul, and a store.
; CHECK-LABEL: @foo
; CHECK-NOT: xxpermdi
; CHECK-NOT: xxswapd
; CHECK-P9-NOT: xxpermdi

; CHECK: lxvd2x
; CHECK: lxvd2x
; CHECK-DAG: lxvd2x
; CHECK-DAG: vadduwm
; CHECK: vmuluwm
; CHECK: stxvd2x

; CHECK: lxvd2x
; CHECK: lxvd2x
; CHECK-DAG: lxvd2x
; CHECK-DAG: vadduwm
; CHECK: vmuluwm
; CHECK: stxvd2x

; CHECK: lxvd2x
; CHECK: lxvd2x
; CHECK-DAG: lxvd2x
; CHECK-DAG: vadduwm
; CHECK: vmuluwm
; CHECK: stxvd2x

; CHECK: lxvd2x
; CHECK: lxvd2x
; CHECK-DAG: lxvd2x
; CHECK-DAG: vadduwm
; CHECK: vmuluwm
; CHECK: stxvd2x

; With swap removal disabled, the xxswapd instructions paired with each
; lxvd2x/stxvd2x must still be present.
; NOOPTSWAP-LABEL: @foo

; NOOPTSWAP: lxvd2x
; NOOPTSWAP-DAG: lxvd2x
; NOOPTSWAP-DAG: lxvd2x
; NOOPTSWAP-DAG: xxswapd
; NOOPTSWAP-DAG: xxswapd
; NOOPTSWAP-DAG: xxswapd
; NOOPTSWAP-DAG: vadduwm
; NOOPTSWAP: vmuluwm
; NOOPTSWAP: xxswapd
; NOOPTSWAP-DAG: xxswapd
; NOOPTSWAP-DAG: xxswapd
; NOOPTSWAP-DAG: stxvd2x
; NOOPTSWAP-DAG: stxvd2x
; NOOPTSWAP: stxvd2x

; On POWER9, indexed vector loads/stores (lxvx/stxvx) need no swaps at all
; (xxswapd additionally excluded via --implicit-check-not on the RUN line).
; CHECK-P9-LABEL: @foo
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: vadduwm
; CHECK-P9-DAG: vadduwm
; CHECK-P9-DAG: vadduwm
; CHECK-P9-DAG: vadduwm
; CHECK-P9-DAG: vmuluwm
; CHECK-P9-DAG: vmuluwm
; CHECK-P9-DAG: vmuluwm
; CHECK-P9-DAG: vmuluwm
; CHECK-P9-DAG: stxvx
; CHECK-P9-DAG: stxvx
; CHECK-P9-DAG: stxvx
; CHECK-P9-DAG: stxvx