; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefix=GCN -check-prefix=VI %s

; GCN-LABEL: {{^}}fcmp_f16_lt
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_lt(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp olt half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_lt_abs:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]

; SI: v_cvt_f32_f16_e64 v[[A_F32:[0-9]+]], |v[[A_F16]]|
; SI: v_cvt_f32_f16_e64 v[[B_F32:[0-9]+]], |v[[B_F16]]|

; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_lt_f16_e64 s{{\[[0-9]+:[0-9]+\]}}, |v[[A_F16]]|, |v[[B_F16]]|

; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_lt_abs(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %a.abs = call half @llvm.fabs.f16(half %a.val)
  %b.abs = call half @llvm.fabs.f16(half %b.val)
  %r.val = fcmp olt half %a.abs, %b.abs
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_eq
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_eq_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_eq_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_eq(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp oeq half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_le
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_le_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_le_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_le(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp ole half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_gt
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_gt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_gt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_gt(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp ogt half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_lg
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_lg_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_lg_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_lg(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp one half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_ge
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_ge_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_ge_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_ge(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp oge half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_o
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_o_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_o_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_o(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp ord half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_u
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_u_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_u_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_u(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp uno half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_nge
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_nge_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_nge_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_nge(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp ult half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_nlg
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_nlg_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_nlg_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_nlg(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp ueq half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_ngt
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_ngt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_ngt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_ngt(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp ule half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_nle
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_nle_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_nle_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_nle(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp ugt half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_neq
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_neq_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_neq_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_neq(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp une half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_f16_nlt
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_f16_nlt(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp uge half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_lt:
; SI: v_cmp_lt_f32_e32 vcc,
; SI: v_cmp_lt_f32_e32 vcc,

; VI: v_cmp_lt_f16_e32 vcc,
; VI: v_cmp_lt_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_lt(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp olt <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_eq
; SI: v_cmp_eq_f32_e32 vcc,
; SI: v_cmp_eq_f32_e32 vcc,

; VI: v_cmp_eq_f16_e32 vcc,
; VI: v_cmp_eq_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_eq(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp oeq <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_le:
; SI: v_cmp_le_f32_e32 vcc
; SI: v_cmp_le_f32_e32 vcc
; VI: v_cmp_le_f16_e32 vcc
; VI: v_cmp_le_f16_e32 vcc
define amdgpu_kernel void @fcmp_v2f16_le(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp ole <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_gt:
; SI: v_cmp_gt_f32_e32 vcc,
; SI: v_cmp_gt_f32_e32 vcc,

; VI: v_cmp_gt_f16_e32 vcc,
; VI: v_cmp_gt_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_gt(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp ogt <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_lg:
; SI: v_cmp_lg_f32_e32 vcc,
; SI: v_cmp_lg_f32_e32 vcc,

; VI: v_cmp_lg_f16_e32 vcc,
; VI: v_cmp_lg_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_lg(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp one <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_ge:
; SI: v_cmp_ge_f32_e32 vcc,
; SI: v_cmp_ge_f32_e32 vcc,

; VI: v_cmp_ge_f16_e32 vcc,
; VI: v_cmp_ge_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_ge(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp oge <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_o:
; SI: v_cmp_o_f32_e32 vcc,
; SI: v_cmp_o_f32_e32 vcc,

; VI: v_cmp_o_f16_e32 vcc,
; VI: v_cmp_o_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_o(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp ord <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_u:
; SI: v_cmp_u_f32_e32 vcc,
; SI: v_cmp_u_f32_e32 vcc,

; VI: v_cmp_u_f16_e32 vcc,
; VI: v_cmp_u_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_u(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp uno <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_nge
; SI: v_cmp_nge_f32_e32 vcc,
; SI: v_cmp_nge_f32_e32 vcc,

; VI: v_cmp_nge_f16_e32 vcc,
; VI: v_cmp_nge_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_nge(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp ult <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_nlg
; SI: v_cmp_nlg_f32_e32 vcc
; SI: v_cmp_nlg_f32_e32 vcc

; VI: v_cmp_nlg_f16_e32 vcc
; VI: v_cmp_nlg_f16_e32 vcc
define amdgpu_kernel void @fcmp_v2f16_nlg(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp ueq <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_ngt
; SI: v_cmp_ngt_f32_e32 vcc,
; SI: v_cmp_ngt_f32_e32 vcc,

; VI: v_cmp_ngt_f16_e32 vcc,
; VI: v_cmp_ngt_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_ngt(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp ule <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_nle
; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}

; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @fcmp_v2f16_nle(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp ugt <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_neq
; SI: v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}

; VI: v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @fcmp_v2f16_neq(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp une <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fcmp_v2f16_nlt
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; GCN-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]

; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_1]], v[[B_F32_1]]
; VI-DAG: v_cmp_nlt_f16_e32 vcc, v[[B_V2_F16]], v[[A_V2_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]

; VI: v_cmp_nlt_f16_e32 vcc, v[[B_F16_1]], v[[A_F16_1]]
; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
; GCN: s_endpgm
define amdgpu_kernel void @fcmp_v2f16_nlt(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp uge <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}

declare half @llvm.fabs.f16(half) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }