; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 | FileCheck %s
; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=nvptx64-nvidia-cuda -mcpu=sm_40 | FileCheck %s -check-prefix=NOVECTOR

; @fusion contains two scalar half-precision chains
; (load -> fmul fast -> fadd fast -> store) that read from %arg1 and write to
; %arg at element indices %tmp6 and %tmp7.
;
; The first RUN line (-mcpu=sm_70, default check prefix) expects the two chains
; to be merged into a single <2 x half> load / fmul / fadd / store sequence.
; The second RUN line (-mcpu=sm_40, NOVECTOR prefix) expects the scalar code to
; be left as written.
; NOTE(review): presumably the difference comes from the target's cost model
; for f16 vectors on these two architectures - confirm against the NVPTX
; target hooks before relying on that explanation.
define void @fusion(i8* noalias nocapture align 256 dereferenceable(19267584) %arg, i8* noalias nocapture readonly align 256 dereferenceable(19267584) %arg1, i32 %arg2, i32 %arg3) local_unnamed_addr #0 {
; CHECK-LABEL: @fusion(
; CHECK-NEXT:    [[TMP:%.*]] = shl nuw nsw i32 [[ARG2:%.*]], 6
; CHECK-NEXT:    [[TMP4:%.*]] = or i32 [[TMP]], [[ARG3:%.*]]
; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 2
; CHECK-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP6]], 1
; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[ARG1:%.*]] to half*
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8* [[ARG:%.*]] to half*
; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP7]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half* [[TMP11]] to <2 x half>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half>* [[TMP1]], align 8
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x half> [[TMP2]], <half 0xH5380, half 0xH5380>
; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x half> [[TMP3]], <half 0xH57F0, half 0xH57F0>
; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP7]]
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast half* [[TMP16]] to <2 x half>*
; CHECK-NEXT:    store <2 x half> [[TMP4]], <2 x half>* [[TMP5]], align 8
; CHECK-NEXT:    ret void
;
; NOVECTOR-LABEL: @fusion(
; NOVECTOR-NEXT:    [[TMP:%.*]] = shl nuw nsw i32 [[ARG2:%.*]], 6
; NOVECTOR-NEXT:    [[TMP4:%.*]] = or i32 [[TMP]], [[ARG3:%.*]]
; NOVECTOR-NEXT:    [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 2
; NOVECTOR-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
; NOVECTOR-NEXT:    [[TMP7:%.*]] = or i64 [[TMP6]], 1
; NOVECTOR-NEXT:    [[TMP10:%.*]] = bitcast i8* [[ARG1:%.*]] to half*
; NOVECTOR-NEXT:    [[TMP11:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP6]]
; NOVECTOR-NEXT:    [[TMP12:%.*]] = load half, half* [[TMP11]], align 8
; NOVECTOR-NEXT:    [[TMP13:%.*]] = fmul fast half [[TMP12]], 0xH5380
; NOVECTOR-NEXT:    [[TMP14:%.*]] = fadd fast half [[TMP13]], 0xH57F0
; NOVECTOR-NEXT:    [[TMP15:%.*]] = bitcast i8* [[ARG:%.*]] to half*
; NOVECTOR-NEXT:    [[TMP16:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP6]]
; NOVECTOR-NEXT:    store half [[TMP14]], half* [[TMP16]], align 8
; NOVECTOR-NEXT:    [[TMP17:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP7]]
; NOVECTOR-NEXT:    [[TMP18:%.*]] = load half, half* [[TMP17]], align 2
; NOVECTOR-NEXT:    [[TMP19:%.*]] = fmul fast half [[TMP18]], 0xH5380
; NOVECTOR-NEXT:    [[TMP20:%.*]] = fadd fast half [[TMP19]], 0xH57F0
; NOVECTOR-NEXT:    [[TMP21:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP7]]
; NOVECTOR-NEXT:    store half [[TMP20]], half* [[TMP21]], align 2
; NOVECTOR-NEXT:    ret void
;
  %tmp = shl nuw nsw i32 %arg2, 6
  %tmp4 = or i32 %tmp, %arg3
  %tmp5 = shl nuw nsw i32 %tmp4, 2
  %tmp6 = zext i32 %tmp5 to i64
  ; %tmp5 was shifted left by 2, so bit 0 of %tmp6 is clear and the `or` below
  ; produces %tmp6 + 1, i.e. the element immediately after %tmp6.
  %tmp7 = or i64 %tmp6, 1
  %tmp10 = bitcast i8* %arg1 to half*
  %tmp11 = getelementptr inbounds half, half* %tmp10, i64 %tmp6
  %tmp12 = load half, half* %tmp11, align 8
  %tmp13 = fmul fast half %tmp12, 0xH5380
  %tmp14 = fadd fast half %tmp13, 0xH57F0
  %tmp15 = bitcast i8* %arg to half*
  %tmp16 = getelementptr inbounds half, half* %tmp15, i64 %tmp6
  store half %tmp14, half* %tmp16, align 8
  %tmp17 = getelementptr inbounds half, half* %tmp10, i64 %tmp7
  %tmp18 = load half, half* %tmp17, align 2
  %tmp19 = fmul fast half %tmp18, 0xH5380
  %tmp20 = fadd fast half %tmp19, 0xH57F0
  %tmp21 = getelementptr inbounds half, half* %tmp15, i64 %tmp7
  store half %tmp20, half* %tmp21, align 2
  ret void
}

attributes #0 = { nounwind }