// RUN: hlo_to_llvm_ir %s | FileCheck %s

// Lit test: the HLO module at the bottom of this file is lowered to LLVM IR
// and the textual output is matched against the expectations below.
// The expectations cover, in order: the entry block (thread/block id reads and
// tile bound computation), a loop that stores loaded elements into the
// addrspace(3) tile @b.tile0, a call to llvm.nvvm.barrier0, and a second loop
// that loads from the tile with the two tile indices swapped and stores to
// the output buffer.

// CHECK-LABEL: entry:
// CHECK: %[[VAL_0:.*]] = alloca i32, align 4
// CHECK: %[[VAL_1:.*]] = alloca i32, align 4
// CHECK: %[[VAL_2:.*]] = getelementptr inbounds i8, i8* %[[VAL_3:.*]], i64 0
// CHECK: %[[VAL_4:.*]] = bitcast i8* %[[VAL_2]] to [100 x [200 x float]]*
// CHECK: %[[VAL_5:.*]] = getelementptr inbounds i8, i8* %[[VAL_6:.*]], i64 0
// CHECK: %[[VAL_7:.*]] = bitcast i8* %[[VAL_5]] to [200 x [100 x float]]*
// CHECK: %[[VAL_8:.*]] = bitcast [100 x [200 x float]]* %[[VAL_4]] to [1 x [100 x [200 x float]]]*
// CHECK: %[[VAL_9:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !2
// CHECK: %[[VAL_10:.*]] = urem i32 %[[VAL_9]], 32
// CHECK: %[[VAL_11:.*]] = udiv i32 %[[VAL_9]], 32
// CHECK: %[[VAL_12:.*]] = urem i32 %[[VAL_9]], 32
// CHECK: %[[VAL_13:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !3
// CHECK: %[[VAL_14:.*]] = udiv i32 %[[VAL_13]], 1
// CHECK: %[[VAL_15:.*]] = urem i32 %[[VAL_14]], 4
// CHECK: %[[VAL_16:.*]] = udiv i32 %[[VAL_13]], 4
// CHECK: %[[VAL_17:.*]] = urem i32 %[[VAL_16]], 7
// CHECK: %[[VAL_18:.*]] = udiv i32 %[[VAL_13]], 28
// CHECK: %[[VAL_19:.*]] = mul i32 %[[VAL_18]], 1
// CHECK: %[[VAL_20:.*]] = icmp eq i32 %[[VAL_17]], 6
// CHECK: %[[VAL_21:.*]] = select i1 %[[VAL_20]], i32 8, i32 32
// CHECK: %[[VAL_22:.*]] = icmp eq i32 %[[VAL_15]], 3
// CHECK: %[[VAL_23:.*]] = select i1 %[[VAL_22]], i32 4, i32 32
// CHECK: %[[VAL_24:.*]] = mul i32 %[[VAL_17]], 32
// CHECK: %[[VAL_25:.*]] = mul i32 %[[VAL_15]], 32
// CHECK: %[[VAL_26:.*]] = mul i32 %[[VAL_10]], 1
// CHECK: %[[VAL_27:.*]] = add i32 %[[VAL_24]], %[[VAL_26]]
// CHECK: %[[VAL_28:.*]] = sub i32 %[[VAL_23]], %[[VAL_11]]
// CHECK: %[[VAL_29:.*]] = add i32 %[[VAL_28]], 4
// CHECK: %[[VAL_30:.*]] = add i32 %[[VAL_29]], -1
// CHECK: %[[VAL_31:.*]] = udiv i32 %[[VAL_30]], 4
// CHECK: store i32 0, i32* %[[VAL_1]], align 4
// CHECK: br label %[[VAL_32:.*]]
// CHECK: input_y_in_tile.loop_header: ; preds = %[[VAL_33:.*]], %[[VAL_34:.*]]
// CHECK: %[[VAL_35:.*]] = load i32, i32* %[[VAL_1]], align 4
// CHECK: %[[VAL_36:.*]] = icmp uge i32 %[[VAL_35]], %[[VAL_31]]
// CHECK: br i1 %[[VAL_36]], label %[[VAL_37:.*]], label %[[VAL_38:.*]]
// CHECK: input_y_in_tile.loop_body: ; preds = %[[VAL_32]]
// CHECK: %[[VAL_39:.*]] = add nuw nsw i32 %[[VAL_35]], 1
// CHECK: store i32 %[[VAL_39]], i32* %[[VAL_1]], align 4
// CHECK: %[[VAL_40:.*]] = icmp eq i32 %[[VAL_35]], 0
// CHECK: %[[VAL_41:.*]] = mul i32 %[[VAL_35]], 4
// CHECK: %[[VAL_42:.*]] = add i32 %[[VAL_11]], %[[VAL_41]]
// CHECK: %[[VAL_43:.*]] = add i32 %[[VAL_25]], %[[VAL_42]]
// CHECK: %[[VAL_44:.*]] = add i32 0, %[[VAL_26]]
// CHECK: %[[VAL_45:.*]] = add i32 %[[VAL_27]], 0
// CHECK: %[[VAL_46:.*]] = icmp ult i32 %[[VAL_44]], %[[VAL_21]]
// CHECK: br i1 %[[VAL_46]], label %[[VAL_47:.*]], label %[[VAL_33]]
// CHECK: input_x_in_tile-after: ; preds = %[[VAL_47]], %[[VAL_38]]
// CHECK: br label %[[VAL_32]], !llvm.loop !4
// CHECK: input_y_in_tile.loop_exit: ; preds = %[[VAL_32]]
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: %[[VAL_48:.*]] = mul i32 %[[VAL_10]], 1
// CHECK: %[[VAL_49:.*]] = add i32 %[[VAL_25]], %[[VAL_48]]
// CHECK: %[[VAL_50:.*]] = sub i32 %[[VAL_21]], %[[VAL_11]]
// CHECK: %[[VAL_51:.*]] = add i32 %[[VAL_50]], 4
// CHECK: %[[VAL_52:.*]] = add i32 %[[VAL_51]], -1
// CHECK: %[[VAL_53:.*]] = udiv i32 %[[VAL_52]], 4
// CHECK: store i32 0, i32* %[[VAL_0]], align 4
// CHECK: br label %[[VAL_54:.*]]
// CHECK: output_y_in_tile.loop_header: ; preds = %[[VAL_55:.*]], %[[VAL_37]]
// CHECK: %[[VAL_56:.*]] = load i32, i32* %[[VAL_0]], align 4
// CHECK: %[[VAL_57:.*]] = icmp uge i32 %[[VAL_56]], %[[VAL_53]]
// CHECK: br i1 %[[VAL_57]], label %[[VAL_58:.*]], label %[[VAL_59:.*]]
// CHECK: output_y_in_tile.loop_body: ; preds = %[[VAL_54]]
// CHECK: %[[VAL_60:.*]] = add nuw nsw i32 %[[VAL_56]], 1
// CHECK: store i32 %[[VAL_60]], i32* %[[VAL_0]], align 4
// CHECK: %[[VAL_61:.*]] = icmp eq i32 %[[VAL_56]], 0
// CHECK: %[[VAL_62:.*]] = mul i32 %[[VAL_56]], 4
// CHECK: %[[VAL_63:.*]] = add i32 %[[VAL_11]], %[[VAL_62]]
// CHECK: %[[VAL_64:.*]] = add i32 %[[VAL_24]], %[[VAL_63]]
// CHECK: %[[VAL_65:.*]] = add i32 0, %[[VAL_48]]
// CHECK: %[[VAL_66:.*]] = add i32 %[[VAL_49]], 0
// CHECK: %[[VAL_67:.*]] = icmp ult i32 %[[VAL_65]], %[[VAL_23]]
// CHECK: br i1 %[[VAL_67]], label %[[VAL_68:.*]], label %[[VAL_55]]
// CHECK: output_x_in_tile-after: ; preds = %[[VAL_68]], %[[VAL_59]]
// CHECK: br label %[[VAL_54]], !llvm.loop !6
// CHECK: output_y_in_tile.loop_exit: ; preds = %[[VAL_54]]
// CHECK: ret void
// CHECK: input_x_in_tile-true: ; preds = %[[VAL_38]]
// CHECK: %[[VAL_69:.*]] = getelementptr inbounds [1 x [100 x [200 x float]]], [1 x [100 x [200 x float]]]* %[[VAL_8]], i32 0, i32 0, i32 %[[VAL_43]], i32 %[[VAL_45]]
// CHECK: %[[VAL_70:.*]] = load float, float* %[[VAL_69]], align 4, !invariant.load !7
// CHECK: %[[VAL_71:.*]] = getelementptr [32 x [33 x float]], [32 x [33 x float]] addrspace(3)* @b.tile0, i32 0, i32 %[[VAL_42]], i32 %[[VAL_44]]
// CHECK: store float %[[VAL_70]], float addrspace(3)* %[[VAL_71]], align 4
// CHECK: br label %[[VAL_33]]
// CHECK: output_x_in_tile-true: ; preds = %[[VAL_59]]
// CHECK: %[[VAL_72:.*]] = getelementptr [32 x [33 x float]], [32 x [33 x float]] addrspace(3)* @b.tile0, i64 0, i32 %[[VAL_65]], i32 %[[VAL_63]]
// CHECK: %[[VAL_73:.*]] = load float, float addrspace(3)* %[[VAL_72]], align 4
// CHECK: %[[VAL_74:.*]] = bitcast [200 x [100 x float]]* %[[VAL_7]] to [1 x [200 x [100 x float]]]*
// CHECK: %[[VAL_75:.*]] = getelementptr inbounds [1 x [200 x [100 x float]]], [1 x [200 x [100 x float]]]* %[[VAL_74]], i32 0, i32 0, i32 %[[VAL_64]], i32 %[[VAL_66]]
// CHECK: store float %[[VAL_73]], float* %[[VAL_75]], align 4
// CHECK: br label %[[VAL_55]]

// Module under test: copies a 100x200 f32 parameter from layout {1,0} to an
// array of the same logical shape with layout {0,1}.
HloModule Test

ENTRY main {
  a = f32[100, 200]{1,0} parameter(0)
  ROOT b = f32[100, 200]{0,1} copy(a)
}