; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck -check-prefix=SM20 %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck -check-prefix=SM35 %s

; Every kernel below loads through a `noalias readonly` pointer parameter and
; stores the value through a second pointer. The sm_20 checks expect a plain
; ld.global; the sm_35 checks expect the ld.global.nc form of the same load.
; The two functions at the end are deliberately not annotated as kernels and
; are checked NOT to get the .nc form on either target.

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-unknown"

;-----------------------------------------------------------------------
; Scalar element types.
;-----------------------------------------------------------------------

; float
; SM20-LABEL: .visible .entry foo1(
; SM20: ld.global.f32
; SM35-LABEL: .visible .entry foo1(
; SM35: ld.global.nc.f32
define void @foo1(float* noalias readonly %from, float* %to) {
  %val = load float, float* %from
  store float %val, float* %to
  ret void
}

; double
; SM20-LABEL: .visible .entry foo2(
; SM20: ld.global.f64
; SM35-LABEL: .visible .entry foo2(
; SM35: ld.global.nc.f64
define void @foo2(double* noalias readonly %from, double* %to) {
  %val = load double, double* %from
  store double %val, double* %to
  ret void
}

; i16
; SM20-LABEL: .visible .entry foo3(
; SM20: ld.global.u16
; SM35-LABEL: .visible .entry foo3(
; SM35: ld.global.nc.u16
define void @foo3(i16* noalias readonly %from, i16* %to) {
  %val = load i16, i16* %from
  store i16 %val, i16* %to
  ret void
}

; i32
; SM20-LABEL: .visible .entry foo4(
; SM20: ld.global.u32
; SM35-LABEL: .visible .entry foo4(
; SM35: ld.global.nc.u32
define void @foo4(i32* noalias readonly %from, i32* %to) {
  %val = load i32, i32* %from
  store i32 %val, i32* %to
  ret void
}

; i64
; SM20-LABEL: .visible .entry foo5(
; SM20: ld.global.u64
; SM35-LABEL: .visible .entry foo5(
; SM35: ld.global.nc.u64
define void @foo5(i64* noalias readonly %from, i64* %to) {
  %val = load i64, i64* %from
  store i64 %val, i64* %to
  ret void
}

; i128 is not a standard integer width on nvptx64; the checks expect the
; load to show up as two 64-bit loads.
; SM20-LABEL: .visible .entry foo6(
; SM20: ld.global.u64
; SM20: ld.global.u64
; SM35-LABEL: .visible .entry foo6(
; SM35: ld.global.nc.u64
; SM35: ld.global.nc.u64
define void @foo6(i128* noalias readonly %from, i128* %to) {
  %val = load i128, i128* %from
  store i128 %val, i128* %to
  ret void
}

;-----------------------------------------------------------------------
; Two-element vectors.
;-----------------------------------------------------------------------

; <2 x i8>
; SM20-LABEL: .visible .entry foo7(
; SM20: ld.global.v2.u8
; SM35-LABEL: .visible .entry foo7(
; SM35: ld.global.nc.v2.u8
define void @foo7(<2 x i8>* noalias readonly %from, <2 x i8>* %to) {
  %vec = load <2 x i8>, <2 x i8>* %from
  store <2 x i8> %vec, <2 x i8>* %to
  ret void
}

; <2 x i16>
; SM20-LABEL: .visible .entry foo8(
; SM20: ld.global.v2.u16
; SM35-LABEL: .visible .entry foo8(
; SM35: ld.global.nc.v2.u16
define void @foo8(<2 x i16>* noalias readonly %from, <2 x i16>* %to) {
  %vec = load <2 x i16>, <2 x i16>* %from
  store <2 x i16> %vec, <2 x i16>* %to
  ret void
}

; <2 x i32>
; SM20-LABEL: .visible .entry foo9(
; SM20: ld.global.v2.u32
; SM35-LABEL: .visible .entry foo9(
; SM35: ld.global.nc.v2.u32
define void @foo9(<2 x i32>* noalias readonly %from, <2 x i32>* %to) {
  %vec = load <2 x i32>, <2 x i32>* %from
  store <2 x i32> %vec, <2 x i32>* %to
  ret void
}

; <2 x i64>
; SM20-LABEL: .visible .entry foo10(
; SM20: ld.global.v2.u64
; SM35-LABEL: .visible .entry foo10(
; SM35: ld.global.nc.v2.u64
define void @foo10(<2 x i64>* noalias readonly %from, <2 x i64>* %to) {
  %vec = load <2 x i64>, <2 x i64>* %from
  store <2 x i64> %vec, <2 x i64>* %to
  ret void
}

; <2 x float>
; SM20-LABEL: .visible .entry foo11(
; SM20: ld.global.v2.f32
; SM35-LABEL: .visible .entry foo11(
; SM35: ld.global.nc.v2.f32
define void @foo11(<2 x float>* noalias readonly %from, <2 x float>* %to) {
  %vec = load <2 x float>, <2 x float>* %from
  store <2 x float> %vec, <2 x float>* %to
  ret void
}

; <2 x double>
; SM20-LABEL: .visible .entry foo12(
; SM20: ld.global.v2.f64
; SM35-LABEL: .visible .entry foo12(
; SM35: ld.global.nc.v2.f64
define void @foo12(<2 x double>* noalias readonly %from, <2 x double>* %to) {
  %vec = load <2 x double>, <2 x double>* %from
  store <2 x double> %vec, <2 x double>* %to
  ret void
}

;-----------------------------------------------------------------------
; Four-element vectors.
;-----------------------------------------------------------------------

; <4 x i8>
; SM20-LABEL: .visible .entry foo13(
; SM20: ld.global.v4.u8
; SM35-LABEL: .visible .entry foo13(
; SM35: ld.global.nc.v4.u8
define void @foo13(<4 x i8>* noalias readonly %from, <4 x i8>* %to) {
  %vec = load <4 x i8>, <4 x i8>* %from
  store <4 x i8> %vec, <4 x i8>* %to
  ret void
}

; <4 x i16>
; SM20-LABEL: .visible .entry foo14(
; SM20: ld.global.v4.u16
; SM35-LABEL: .visible .entry foo14(
; SM35: ld.global.nc.v4.u16
define void @foo14(<4 x i16>* noalias readonly %from, <4 x i16>* %to) {
  %vec = load <4 x i16>, <4 x i16>* %from
  store <4 x i16> %vec, <4 x i16>* %to
  ret void
}

; <4 x i32>
; SM20-LABEL: .visible .entry foo15(
; SM20: ld.global.v4.u32
; SM35-LABEL: .visible .entry foo15(
; SM35: ld.global.nc.v4.u32
define void @foo15(<4 x i32>* noalias readonly %from, <4 x i32>* %to) {
  %vec = load <4 x i32>, <4 x i32>* %from
  store <4 x i32> %vec, <4 x i32>* %to
  ret void
}

; <4 x float>
; SM20-LABEL: .visible .entry foo16(
; SM20: ld.global.v4.f32
; SM35-LABEL: .visible .entry foo16(
; SM35: ld.global.nc.v4.f32
define void @foo16(<4 x float>* noalias readonly %from, <4 x float>* %to) {
  %vec = load <4 x float>, <4 x float>* %from
  store <4 x float> %vec, <4 x float>* %to
  ret void
}

; <4 x double>: the checks expect two v2.f64 loads rather than one v4 load.
; SM20-LABEL: .visible .entry foo17(
; SM20: ld.global.v2.f64
; SM20: ld.global.v2.f64
; SM35-LABEL: .visible .entry foo17(
; SM35: ld.global.nc.v2.f64
; SM35: ld.global.nc.v2.f64
define void @foo17(<4 x double>* noalias readonly %from, <4 x double>* %to) {
  %vec = load <4 x double>, <4 x double>* %from
  store <4 x double> %vec, <4 x double>* %to
  ret void
}

;-----------------------------------------------------------------------
; Pointer-typed element and a load inside a loop.
;-----------------------------------------------------------------------

; Loading a pointer value: checked as a 64-bit integer load.
; SM20-LABEL: .visible .entry foo18(
; SM20: ld.global.u64
; SM35-LABEL: .visible .entry foo18(
; SM35: ld.global.nc.u64
define void @foo18(float** noalias readonly %from, float** %to) {
  %ptrval = load float*, float** %from
  store float* %ptrval, float** %to
  ret void
}

; Test that we can infer a cached load for a pointer induction variable.
; The loop sums %n floats read through a GEP off %from and stores the total
; to %to once the loop exits.
; SM20-LABEL: .visible .entry foo19(
; SM20: ld.global.f32
; SM35-LABEL: .visible .entry foo19(
; SM35: ld.global.nc.f32
define void @foo19(float* noalias readonly %from, float* %to, i32 %n) {
start:
  br label %body

body:
  %idx = phi i32 [ 0, %start ], [ %idx.next, %body ]
  %acc = phi float [ 0.0, %start ], [ %acc.next, %body ]
  %elt.addr = getelementptr inbounds float, float* %from, i32 %idx
  %elt = load float, float* %elt.addr, align 4
  %acc.next = fadd float %elt, %acc
  %idx.next = add nsw i32 %idx, 1
  %finished = icmp eq i32 %idx.next, %n
  br i1 %finished, label %done, label %body

done:
  store float %acc.next, float* %to
  ret void
}

;-----------------------------------------------------------------------
; Negative cases: functions that are not annotated as kernels.
;-----------------------------------------------------------------------

; This test captures the case of a non-kernel function. Without
; interprocedural analysis we do not know, inside a non-kernel function,
; that the parameter points to global memory, nor that the pointed-to memory
; is never written to for the duration of the kernel. Either reason alone
; rules out a cached load, so a generic ld is expected on both targets.
; SM20-LABEL: notkernel(
; SM20: ld.f32
; SM35-LABEL: notkernel(
; SM35: ld.f32
define void @notkernel(float* noalias readonly %from, float* %to) {
  %val = load float, float* %from
  store float %val, float* %to
  ret void
}

; Same as @notkernel, except that the parameter is explicitly in the global
; address space (addrspace(1)). We still do not know that the memory is never
; written to for the duration of the kernel, so the load must stay a plain
; ld.global. This case does not currently come up normally since we do not
; infer that pointers are global interprocedurally as of 2015-08-05.
; SM20-LABEL: notkernel2(
; SM20: ld.global.f32
; SM35-LABEL: notkernel2(
; SM35: ld.global.f32
define void @notkernel2(float addrspace(1)* noalias readonly %from, float* %to) {
  %val = load float, float addrspace(1)* %from
  store float %val, float* %to
  ret void
}

; Kernel annotations: @foo1 through @foo19 are marked as kernels. @notkernel
; and @notkernel2 are intentionally left out of this list.
!nvvm.annotations = !{!1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19}
!1 = !{void (float*, float*)* @foo1, !"kernel", i32 1}
!2 = !{void (double*, double*)* @foo2, !"kernel", i32 1}
!3 = !{void (i16*, i16*)* @foo3, !"kernel", i32 1}
!4 = !{void (i32*, i32*)* @foo4, !"kernel", i32 1}
!5 = !{void (i64*, i64*)* @foo5, !"kernel", i32 1}
!6 = !{void (i128*, i128*)* @foo6, !"kernel", i32 1}
!7 = !{void (<2 x i8>*, <2 x i8>*)* @foo7, !"kernel", i32 1}
!8 = !{void (<2 x i16>*, <2 x i16>*)* @foo8, !"kernel", i32 1}
!9 = !{void (<2 x i32>*, <2 x i32>*)* @foo9, !"kernel", i32 1}
!10 = !{void (<2 x i64>*, <2 x i64>*)* @foo10, !"kernel", i32 1}
!11 = !{void (<2 x float>*, <2 x float>*)* @foo11, !"kernel", i32 1}
!12 = !{void (<2 x double>*, <2 x double>*)* @foo12, !"kernel", i32 1}
!13 = !{void (<4 x i8>*, <4 x i8>*)* @foo13, !"kernel", i32 1}
!14 = !{void (<4 x i16>*, <4 x i16>*)* @foo14, !"kernel", i32 1}
!15 = !{void (<4 x i32>*, <4 x i32>*)* @foo15, !"kernel", i32 1}
!16 = !{void (<4 x float>*, <4 x float>*)* @foo16, !"kernel", i32 1}
!17 = !{void (<4 x double>*, <4 x double>*)* @foo17, !"kernel", i32 1}
!18 = !{void (float**, float**)* @foo18, !"kernel", i32 1}
!19 = !{void (float*, float*, i32)* @foo19, !"kernel", i32 1}