.globl _start

_start:
	# Purpose: exercise the fldcw ("load x87 FPU control word")
	#   instruction in a variety of encodings.
	# Most x86 processors count fldcw as one instruction in the
	#   retired_instruction performance counter, but Pentium 4
	#   systems count it as two, which can affect BBV results
	#   on such a system.
	# fldcw is most often used to set the rounding mode when doing
	#   floating point to integer conversions.
	#
	# The encoding is "d9 /5", i.e. the bit pattern
	#   1101 1001 xx10 1yyy
	#   xx  = the ModRM "mod" field (00, 01 or 10, selecting the
	#         kind of offset that follows)
	#   yyy = the ModRM register field (the base register)

	# The x87 constant loads below share the d9 opcode byte with
	#   fldcw and differ only in the second byte, so a test that
	#   is not explicit enough would report them as false positives.
	# Each one pushes a value on the x87 register stack; none of
	#   these seven values is popped again in this section.
similar:
	fld1				# d9 e8   push +1.0
	fldl2t				# d9 e9   push log2(10)
	fldl2e				# d9 ea   push log2(e)
	fldpi				# d9 eb   push pi
	fldlg2				# d9 ec   push log10(2)
	fldln2				# d9 ed   push ln(2)
	fldz				# d9 ee   push +0.0

	# check some varied ways of calling fldcw

	# offset on stack
stack:
	# fldcw with an rsp-relative memory operand.  The current
	#   control word is stored into a scratch slot on the stack and
	#   loaded straight back, so the FPU settings are unchanged.
	subq	$8, %rsp		# reserve 8 bytes of scratch space
	fnstcw	2(%rsp)			# scratch slot = current control word
	fldcw	2(%rsp)			# reload the same control word
	addq	$8, %rsp		# release the scratch space

	# 64-bit register
sixtyfour_reg:
	# fldcw through each of rax, rbx, rcx and rdx used as a 64-bit
	#   base register with no displacement.  Every load reads back
	#   the value stored by the fnstcw, so the control word itself
	#   never changes.
	fnstcw	cw			# cw = current control word

	movq	$cw, %rax
	fldcw	(%rax)			# base register rax

	movq	$cw, %rbx
	fldcw	(%rbx)			# base register rbx

	movq	$cw, %rcx
	fldcw	(%rcx)			# base register rcx

	movq	$cw, %rdx
	fldcw	(%rdx)			# base register rdx

	# 32-bit register
	
	# Note!  The assembler that comes with SuSE 9.1
	#        cannot assemble 32-bit fldcw on 64-bit systems
	#        Hence the need to hand-code them
	
	
thirtytwo_reg:
	# fldcw through eax, ebx, ecx and edx used as 32-bit base
	#   registers with no displacement.  The instructions are
	#   hand-coded as raw bytes:
	#     0x67   address-size override prefix (32-bit addressing)
	#     0xd9   opcode
	#     ModRM  00 101 rrr  (mod = 00, /5, rrr = base register)
	# NOTE(review): only the low 32 bits of the address of cw are
	#   used here, so this presumably relies on cw being linked
	#   below 4GB -- confirm against the link settings.
	fnstcw	cw			# cw = current control word

	movl	$cw, %eax
	.byte	0x67, 0xd9, 0x28	# fldcw 0(%eax)

	movl	$cw, %ebx
	.byte	0x67, 0xd9, 0x2b	# fldcw 0(%ebx)

	movl	$cw, %ecx
	.byte	0x67, 0xd9, 0x29	# fldcw 0(%ecx)

	movl	$cw, %edx
	.byte	0x67, 0xd9, 0x2a	# fldcw 0(%edx)

	# register + 8-bit offset
eight_bit:
	# fldcw through 32-bit base registers plus an 8-bit displacement
	#   (ModRM mod = 01, displacement byte 0x20 = 32).  The base is
	#   biased to cw - 32 so that 32(base) addresses cw again.
	# Hand-coded bytes: 0x67 prefix, 0xd9 opcode, ModRM, disp8.
	movl	$cw, %eax
	subl	$32, %eax			# eax = address of cw - 32
	.byte	0x67, 0xd9, 0x68, 0x20		# fldcw 32(%eax)

	movl	%eax, %ebx			# same biased base in ebx
	.byte	0x67, 0xd9, 0x6b, 0x20		# fldcw 32(%ebx)

	movl	%eax, %ecx			# same biased base in ecx
	.byte	0x67, 0xd9, 0x69, 0x20		# fldcw 32(%ecx)

	movl	%eax, %edx			# same biased base in edx
	.byte	0x67, 0xd9, 0x6a, 0x20		# fldcw 32(%edx)
	
	
	# register + 32-bit offset
thirtytwo_bit:
	# fldcw through 32-bit base registers plus a 32-bit displacement
	#   (ModRM mod = 10).  The displacement is 30000 = 0x00007530,
	#   stored little-endian as 0x30 0x75 0x00 0x00.  The base is
	#   biased to cw - 30000 so that 30000(base) addresses cw again.
	# Hand-coded bytes: 0x67 prefix, 0xd9 opcode, ModRM, disp32.
	movl	$cw, %eax
	subl	$30000, %eax		# eax = address of cw - 30000

	# fldcw 30000(%eax)		eax + 32-bit displacement
	.byte	0x67, 0xd9, 0xa8, 0x30, 0x75, 0x00, 0x00

	movl	%eax, %ebx		# same biased base in ebx

	# fldcw 30000(%ebx)		ebx + 32-bit displacement
	.byte	0x67, 0xd9, 0xab, 0x30, 0x75, 0x00, 0x00

	movl	%eax, %ecx		# same biased base in ecx

	# fldcw 30000(%ecx)		ecx + 32-bit displacement
	.byte	0x67, 0xd9, 0xa9, 0x30, 0x75, 0x00, 0x00

	movl	%eax, %edx		# same biased base in edx

	# fldcw 30000(%edx)		edx + 32-bit displacement
	.byte	0x67, 0xd9, 0xaa, 0x30, 0x75, 0x00, 0x00
	
	# check an fp/integer conversion
	# in a loop to give a bigger count

	# rcx = iteration count, decremented by the loop instruction
	movq	$1024, %rcx
big_loop:
	# One iteration converts the double 3.0 to a 32-bit integer with
	#   the rounding mode temporarily forced to round-toward-zero,
	#   then restores the original control word.  Two fldcw
	#   instructions execute per iteration.
	fldl	three			# push the double 3.0 on the fp stack
	fnstcw	saved_cw		# saved_cw = current control word
	movzwl	saved_cw, %eax		# eax = control word, zero extended
	movb	$0x0c, %ah		# replace the high byte with 0x0c:
					#   rounding control = round to zero
	movw	%ax, cw			# cw = modified control word
	fldcw	cw			# activate the new rounding mode
	fistpl	result			# result = top of stack as integer,
					#   and pop it
	fldcw	saved_cw		# restore the original control word

	loop	big_loop		# decrement rcx, repeat while nonzero

	# Sanity check: the conversion must have produced 3.
	movl	result, %ebx		# load result, upper half of rbx = 0
	cmpq	$3, %rbx
	je	exit			# as expected: skip the error message
	
print_error:
	# Reached when the converted result is not 3.  Writes the error
	#   message to stdout and then terminates with exit status 1.
	# The exit is done here rather than by falling through to the
	#   status-0 exit path, so that a failed check is also visible
	#   in the exit status of the process.
	mov	$1,%rax			# write syscall
	mov	$1,%rdi			# stdout
	mov	$error,%rsi		# string
	mov	$22,%rdx		# length of string (without the NUL)
	syscall

	mov	$1,%rdi			# return 1 (failure)
	mov	$60,%rax		# SYSCALL_EXIT
	syscall
	
exit:
	# Terminate the process with exit status 0.
	xorq	%rdi, %rdi		# rdi = 0, the exit status
	movq	$60, %rax		# SYSCALL_EXIT
	syscall
	

.data
	# Control word as it was before big_loop changed the rounding mode
saved_cw:
	.long	0
	# Scratch control word that the fldcw tests load from
cw:
	.long	0
	# Integer produced by the fistpl conversion
result:
	.long	0
	# The double-precision value 3.0 (0x4008000000000000),
	#   stored as two 32-bit halves, low half first
three:
	.long	0x00000000, 0x40080000
	# Message printed when the conversion result is wrong
error:
	.asciz  "Error!  Wrong result!\n"