from peachpy import *
from peachpy.x86_64 import *


def fp16_alt_xmm_to_fp32_xmm(xmm_half):
    """Emit AVX code widening four 16-bit half floats to four 32-bit floats.

    The instructions are issued through PeachPy in the order written below.
    Only the low four 16-bit lanes of ``xmm_half`` are consumed (both unpack
    steps are low-half unpacks).

    Every non-zero exponent is rebiased the same way; there is no Inf/NaN
    special-casing in this sequence.
    NOTE(review): that presumably corresponds to the "alternative"
    half-precision format suggested by the function name -- confirm.

    Args:
        xmm_half: XMM register holding the half-precision inputs in its low
            four 16-bit lanes. It is only read, never written.

    Returns:
        A freshly allocated XMM register holding the four converted
        single-precision values.
    """
    # Memory operands used by the sequence.
    sign_bit = Constant.float32x4(-0.0)           # 0x80000000 per lane
    exponent_rebias = Constant.uint32x4(0x38000000)  # (127 - 15) << 23
    quarter_hi16 = Constant.uint16x8(0x3E80)      # upper 16 bits of float 0.25
    quarter = Constant.float32x4(0.25)
    smallest_normal = Constant.uint32x4(0x00800000)  # fp32 exponent field == 1

    zeros = XMMRegister()
    VPXOR(zeros, zeros, zeros)

    # Interleave with zero so each 32-bit lane equals (half << 16).
    half_hi = XMMRegister()
    VPUNPCKLWD(half_hi, zeros, xmm_half)

    # x + x == x << 1: pushes the sign bit out of each 16-bit lane.
    half_x2 = XMMRegister()
    VPADDW(half_x2, xmm_half, xmm_half)

    # Same trick on the widened lanes: (half << 17) with the sign discarded.
    magnitude_x2 = XMMRegister()
    VPADDD(magnitude_x2, half_hi, half_hi)

    # Keep only bit 31, which already sits where the fp32 sign belongs.
    sign_only = XMMRegister()
    VANDPS(sign_only, half_hi, sign_bit)

    # Net effect: exponent+mantissa of the half shifted left by 13, i.e.
    # aligned with the fp32 exponent/mantissa fields.
    magnitude = XMMRegister()
    VPSRLD(magnitude, magnitude_x2, 4)

    # Normalized path: adjust the exponent bias from 15 to 127.
    normal_bits = XMMRegister()
    VPADDD(normal_bits, magnitude, exponent_rebias)

    # Denormal path: place (mantissa << 1) into the low mantissa bits of
    # 0.25f, then subtract 0.25f so the FPU performs the renormalization.
    denormal_bits = XMMRegister()
    VPUNPCKLWD(denormal_bits, half_x2, quarter_hi16)
    VSUBPS(denormal_bits, denormal_bits, quarter)

    threshold = XMMRegister()
    VMOVDQA(threshold, smallest_normal)

    # All-ones in lanes whose half exponent field was zero (zero/denormal).
    is_denormal = XMMRegister()
    VPCMPGTD(is_denormal, threshold, magnitude)

    # Take the denormal result where the mask is set, else the normal one.
    unsigned_result = XMMRegister()
    VBLENDVPS(unsigned_result, normal_bits, denormal_bits, is_denormal)

    # Re-attach the sign.
    xmm_float = XMMRegister()
    VORPS(xmm_float, unsigned_result, sign_only)

    return xmm_float