; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=armv8.6a-arm-none-eabi -mattr=+bf16,+neon,+fullfp16 < %s | FileCheck %s
; FIXME: Remove fullfp16 once bfloat arguments and returns lowering stops
; depending on it.

; Codegen checks for creating, splatting, combining, splitting, and lane
; get/set of <4 x bfloat> and <8 x bfloat> values. Every function below uses
; the arm_aapcs_vfpcc calling convention.

target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "armv8.6a-arm-none-eabi"

; Reinterpret an i64 argument as <4 x bfloat>; a single core-to-D-register
; move is expected.
define arm_aapcs_vfpcc <4 x bfloat> @test_vcreate_bf16(i64 %a) {
; CHECK-LABEL: test_vcreate_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov d0, r0, r1
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast i64 %a to <4 x bfloat>
  ret <4 x bfloat> %0
}

; Splat a scalar bfloat across all four lanes of a <4 x bfloat>.
define arm_aapcs_vfpcc <4 x bfloat> @test_vdup_n_bf16(bfloat %v) {
; CHECK-LABEL: test_vdup_n_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    @ kill: def $s0 killed $s0 def $d0
; CHECK-NEXT:    vdup.16 d0, d0[0]
; CHECK-NEXT:    bx lr
entry:
  %vecinit.i = insertelement <4 x bfloat> undef, bfloat %v, i32 0
  %vecinit3.i = shufflevector <4 x bfloat> %vecinit.i, <4 x bfloat> undef, <4 x i32> zeroinitializer
  ret <4 x bfloat> %vecinit3.i
}

; Splat a scalar bfloat across all eight lanes of an <8 x bfloat>.
define arm_aapcs_vfpcc <8 x bfloat> @test_vdupq_n_bf16(bfloat %v) {
; CHECK-LABEL: test_vdupq_n_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    @ kill: def $s0 killed $s0 def $d0
; CHECK-NEXT:    vdup.16 q0, d0[0]
; CHECK-NEXT:    bx lr
entry:
  %vecinit.i = insertelement <8 x bfloat> undef, bfloat %v, i32 0
  %vecinit7.i = shufflevector <8 x bfloat> %vecinit.i, <8 x bfloat> undef, <8 x i32> zeroinitializer
  ret <8 x bfloat> %vecinit7.i
}

; Splat lane 1 of a <4 x bfloat> into a <4 x bfloat>.
define arm_aapcs_vfpcc <4 x bfloat> @test_vdup_lane_bf16(<4 x bfloat> %v) {
; CHECK-LABEL: test_vdup_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vdup.16 d0, d0[1]
; CHECK-NEXT:    bx lr
entry:
  %lane = shufflevector <4 x bfloat> %v, <4 x bfloat> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x bfloat> %lane
}

; Splat lane 1 of a <4 x bfloat> into an <8 x bfloat>.
define arm_aapcs_vfpcc <8 x bfloat> @test_vdupq_lane_bf16(<4 x bfloat> %v) {
; CHECK-LABEL: test_vdupq_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    vdup.16 q0, d0[1]
; CHECK-NEXT:    bx lr
entry:
  %lane = shufflevector <4 x bfloat> %v, <4 x bfloat> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x bfloat> %lane
}

; Splat lane 7 of an <8 x bfloat> into a <4 x bfloat>.
define arm_aapcs_vfpcc <4 x bfloat> @test_vdup_laneq_bf16(<8 x bfloat> %v) {
; CHECK-LABEL: test_vdup_laneq_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vdup.16 d0, d1[3]
; CHECK-NEXT:    bx lr
entry:
  %lane = shufflevector <8 x bfloat> %v, <8 x bfloat> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
  ret <4 x bfloat> %lane
}

; Splat lane 7 of an <8 x bfloat> into an <8 x bfloat>.
define arm_aapcs_vfpcc <8 x bfloat> @test_vdupq_laneq_bf16(<8 x bfloat> %v) {
; CHECK-LABEL: test_vdupq_laneq_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vdup.16 q0, d1[3]
; CHECK-NEXT:    bx lr
entry:
  %lane = shufflevector <8 x bfloat> %v, <8 x bfloat> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ret <8 x bfloat> %lane
}

; Concatenate two <4 x bfloat> values into one <8 x bfloat>. The shuffle takes
; %high as its first operand, so %high fills result lanes 0-3 and %low fills
; lanes 4-7; the expected moves (d1 into the low half, d0 into the high half)
; follow from that.
; NOTE(review): the parameter names suggest the opposite lane order - confirm
; that the crossed operand order is intentional.
define arm_aapcs_vfpcc <8 x bfloat> @test_vcombine_bf16(<4 x bfloat> %low, <4 x bfloat> %high) {
; CHECK-LABEL: test_vcombine_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.f64 d16, d1
; CHECK-NEXT:    vorr d17, d0, d0
; CHECK-NEXT:    vorr q0, q8, q8
; CHECK-NEXT:    bx lr
entry:
  %shuffle.i = shufflevector <4 x bfloat> %high, <4 x bfloat> %low, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x bfloat> %shuffle.i
}

; Extract lanes 4-7 of an <8 x bfloat> as a <4 x bfloat>.
define arm_aapcs_vfpcc <4 x bfloat> @test_vget_high_bf16(<8 x bfloat> %a) {
; CHECK-LABEL: test_vget_high_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.f64 d0, d1
; CHECK-NEXT:    bx lr
entry:
  %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ret <4 x bfloat> %shuffle.i
}

; Extract lanes 0-3 of an <8 x bfloat> as a <4 x bfloat>; no instruction other
; than the return is expected.
define arm_aapcs_vfpcc <4 x bfloat> @test_vget_low_bf16(<8 x bfloat> %a) {
; CHECK-LABEL: test_vget_low_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    bx lr
entry:
  %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x bfloat> %shuffle.i
}

; Extract even lane 6 of an <8 x bfloat>; a plain S-register move is expected.
define arm_aapcs_vfpcc bfloat @test_vgetq_lane_bf16_even(<8 x bfloat> %v) {
; CHECK-LABEL: test_vgetq_lane_bf16_even:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.f32 s0, s3
; CHECK-NEXT:    bx lr
entry:
  %0 = extractelement <8 x bfloat> %v, i32 6
  ret bfloat %0
}

; Extract odd lane 7 of an <8 x bfloat>; a vmovx.f16 is expected.
define arm_aapcs_vfpcc bfloat @test_vgetq_lane_bf16_odd(<8 x bfloat> %v) {
; CHECK-LABEL: test_vgetq_lane_bf16_odd:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovx.f16 s0, s3
; CHECK-NEXT:    bx lr
entry:
  %0 = extractelement <8 x bfloat> %v, i32 7
  ret bfloat %0
}

; Extract even lane 2 of a <4 x bfloat>; a plain S-register move is expected.
define arm_aapcs_vfpcc bfloat @test_vget_lane_bf16_even(<4 x bfloat> %v) {
; CHECK-LABEL: test_vget_lane_bf16_even:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.f32 s0, s1
; CHECK-NEXT:    bx lr
entry:
  %0 = extractelement <4 x bfloat> %v, i32 2
  ret bfloat %0
}

; Extract odd lane 1 of a <4 x bfloat>; a vmovx.f16 is expected.
define arm_aapcs_vfpcc bfloat @test_vget_lane_bf16_odd(<4 x bfloat> %v) {
; CHECK-LABEL: test_vget_lane_bf16_odd:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovx.f16 s0, s0
; CHECK-NEXT:    bx lr
entry:
  %0 = extractelement <4 x bfloat> %v, i32 1
  ret bfloat %0
}

; Insert a scalar bfloat into lane 1 of a <4 x bfloat>.
define arm_aapcs_vfpcc <4 x bfloat> @test_vset_lane_bf16(bfloat %a, <4 x bfloat> %v) {
; CHECK-LABEL: test_vset_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov.16 d1[1], r0
; CHECK-NEXT:    vorr d0, d1, d1
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <4 x bfloat> %v, bfloat %a, i32 1
  ret <4 x bfloat> %0
}

; Insert a scalar bfloat into lane 7 of an <8 x bfloat>.
define arm_aapcs_vfpcc <8 x bfloat> @test_vsetq_lane_bf16(bfloat %a, <8 x bfloat> %v) {
; CHECK-LABEL: test_vsetq_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov.16 d3[3], r0
; CHECK-NEXT:    vorr q0, q1, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <8 x bfloat> %v, bfloat %a, i32 7
  ret <8 x bfloat> %0
}