; Altirra BASIC - Misc math module
; Copyright (C) 2014 Avery Lee, All Rights Reserved.
;
; Copying and distribution of this file, with or without modification,
; are permitted in any medium without royalty provided the copyright
; notice and this notice are preserved.  This file is offered as-is,
; without any warranty.

;===========================================================================
;FCOMP		Floating point compare routine.
;
; Inputs:
;	FR0
;	FR1
;
; Outputs:
;	Z, C set for comparison result like SBC
;
.proc fcomp
		;check for sign difference
		lda		fr1
		tax
		eor		fr0
		bpl		signs_same

		;signs are different
		cpx		fr0
xit:
		rts
		
signs_same:
		;check for both values being zero, as only signexp and first
		;mantissa byte are guaranteed to be $00 in that case
		txa
		ora		fr0
		beq		xit
		
		;compare signexp and mantissa bytes in order
		ldx		#$80-6
loop:
		lda		fr0+6-$80,x
		cmp		fr1+6-$80,x
		bne		diff
		inx
		bpl		loop
		lda		#0
		rts
		
diff:
		;okay, we've confirmed that the numbers are different, but the
		;carry flag may be going the wrong way if the numbers are
		;negative... so let's fix that.
		ror
		eor		fr0
		sec
		rol
		rts
.endp

;===========================================================================
.proc	fld1
		ldx		#<const_one
.def :MathLoadConstFR0 = *
		ldy		#>const_table
		jmp		fp_fld0r
.endp

;===========================================================================
.proc	MathFloor
		;if exponent is > $44 then there can be no decimals
		lda		fr0
		and		#$7f
		cmp		#$45
		bcs		done
		
		;if exponent is < $40 then we have zero or -1
		cmp		#$40
		bcs		not_tiny
		lda		fr0
		bmi		neg_tiny
		
		;positive... load 0
		jmp		fp_zfr0
		
neg_tiny:
		;negative... load -1
		ldx		#<const_negone
		jmp		MathLoadConstFR0
done:
		rts
		
not_tiny:
		;ok... using the exponent, compute the first digit offset we should
		;check
		adc		#$3b		;note: carry is set coming in
		tax
		
		;check digit pairs until we find a non-zero fractional digit pair,
		;zeroing as we go
		lda		#0
		tay
zero_loop:
		ora		fr0+6-$80,x
		sty		fr0+6-$80,x
		inx
		bpl		zero_loop
		
		;skip rounding if it was already integral
		tay
		beq		done
		
neg_round:
		;check if we have a negative number; if so, we need to add one
		lda		fr0
		bpl		done
		
		;subtract one to round down
		jsr		MathLoadOneFR1
		jmp		fp_fsub
		
.endp

;===========================================================================
; Extract sign from FR0 into funScratch1 and take abs(FR0).
;
.proc MathSplitSign
		lda		fr0
		sta		funScratch1
		and		#$7f
		sta		fr0
		rts
.endp

;===========================================================================
.proc MathByteToFP
		ldx		#0
.def :MathWordToFP = *
		stx		fr0+1
.def :MathWordToFP_FR0Hi_A = *
		sta		fr0
		jmp		fp_ifp
.endp

;===========================================================================
.proc MathLoadOneFR1
		ldx		#<const_one
.def :MathLoadConstFR1 = *
		ldy		#>const_one
		jmp		fp_fld1r
.endp

;===========================================================================
.proc MathStoreFR0_FPSCR
		rep		#$10
		ldx		#fpscr
		jsr		MathStoreFR0
		sep		#$10
		rts
.endp

;===========================================================================
.proc MathLoadFR1_FPSCR
		ldx		#<fpscr
.def :MathLoadFR1_Page5 = *
		ldy		#>fpscr
		jmp		fp_fld1r
.endp

;===========================================================================
.proc MathLoadFR0
		rep		#$20
		mva		0,x fr0
		mva		2,x fr0+2
		mva		4,x fr0+4
		sep		#$20
		rts
.endp

;===========================================================================
.proc MathLoadFR1
		rep		#$20
		mva		0,x fr1
		mva		2,x fr1+2
		mva		4,x fr1+4
		sep		#$20
		rts
.endp

;===========================================================================
.proc MathStoreFR0
		rep		#$20
		mva		fr0 0,x
		mva		fr0+2 2,x
		mva		fr0+4 4,x
		sep		#$20
		rts
.endp

;===========================================================================
.proc MathCopyFR0ToFR1
		rep		#$20
		mva		fr0 fr1
		mva		fr0+2 fr1+2
		mva		fr0+4 fr1+4
		sep		#$20
		rts
.endp

;==========================================================================
; AFP [D800]	Convert ASCII string at INBUFF[CIX] to FR0
;
.proc fp_afp
dotflag = fr2
xinvert = fr2+1
cix0 = fr2+2
sign = fr2+3
digit2 = fr2+4

	;skip initial spaces
	jsr		fp_skpspc

	;init FR0 and one extra mantissa byte
	lda		#$7f
	sta		fr0
	sta		digit2
	
	ldx		#fr0+1
	jsr		fp_zf1

	;clear decimal flag
	sta		dotflag
	sta		sign
	
	;check for sign
	ldy		cix
	lda		(inbuff),y
	cmp		#'+'
	beq		isplus
	cmp		#'-'
	bne		postsign
	ror		sign
isplus:
	iny
postsign:	
	sty		cix0

	;skip leading zeroes
	lda		#'0'
	jsr		fp_skipchar
	
	;check if next char is a dot, indicating mantissa <1
	lda		(inbuff),y
	cmp		#'.'
	bne		not_tiny
	iny
	
	;set dot flag
	ror		dotflag
	
	;skip zeroes and adjust exponent
	lda		#'0'
tiny_denorm_loop:
	cmp		(inbuff),y
	bne		tiny_denorm_loop_exit
	dec		fr0
	iny
	bne		tiny_denorm_loop
tiny_denorm_loop_exit:
	
not_tiny:

	;grab digits left of decimal point
	ldx		#1
nextdigit:
	lda		(inbuff),y
	cmp		#'E'
	beq		isexp
	iny
	cmp		#'.'
	beq		isdot
	eor		#'0'
	cmp		#10
	bcs		termcheck
	
	;write digit if we haven't exceeded digit count
	cpx		#6
	bcs		afterwrite
	
	bit		digit2
	bpl		writehi

	;clear second digit flag
	dec		digit2
	
	;merge in low digit
	ora		fr0,x
	sta		fr0,x
	
	;advance to next byte
	inx
	bne		afterwrite
	
writehi:
	;set second digit flag
	inc		digit2
	
	;shift digit to high nibble and write
	asl
	asl
	asl
	asl
	sta		fr0,x

afterwrite:
	;adjust digit exponent if we haven't seen a dot yet
	bit		dotflag
	smi:inc	fr0
	
	;go back for more
	jmp		nextdigit
	
isdot:
	lda		dotflag
	bne		termcheck
	
	;set the dot flag and loop back for more
	ror		dotflag
	bne		nextdigit

termcheck:
	dey
	cpy		cix0
	beq		err_carryset
term:
	;stash offset
	sty		cix

term_rollback_exp:
	;divide digit exponent by two and merge in sign
	rol		sign
	ror		fr0
	
	;check if we need a one digit shift
	bcs		nodigitshift

	;shift right one digit
	ldx		#4
digitshift:
	lsr		fr0+1
	ror		fr0+2
	ror		fr0+3
	ror		fr0+4
	ror		fr0+5
	dex
	bne		digitshift

nodigitshift:
	jmp		fp_normalize

err_carryset:
	rts

isexp:
	cpy		cix0
	beq		err_carryset
	
	;save off this point as a fallback in case we don't actually have
	;exponential notation
	sty		cix

	;check for sign
	ldx		#0
	iny
	lda		(inbuff),y
	cmp		#'+'
	beq		isexpplus
	cmp		#'-'
	bne		postexpsign
	dex						;x=$ff
isexpplus:
	iny
postexpsign:
	stx		xinvert

	;pull up to two exponent digits -- check first digit
	jsr		fp_isdigit_y
	iny
	bcs		term_rollback_exp
	
	;stash first digit
	tax
	
	;check for another digit
	jsr		fp_isdigit_y
	bcs		notexpzero2
	iny

	adc		fp_mul10,x
	tax
notexpzero2:
	txa
	
	;zero is not a valid exponent
	beq		term_rollback_exp
	
	;check if mantissa is zero -- if so, don't bias
;	ldx		fr0+1
;	beq		term
	
	;apply sign to exponent
	eor		xinvert
	rol		xinvert

	;bias digit exponent
	adc		fr0
	sta		fr0
expterm:
	jmp		term

.endp

;==========================================================================
.proc fp_carryup
round_loop:
	adc		0,x
	sta		0,x
dec_entry:
	dex
	lda		#0
	bcs		round_loop
	rts
.endp

;==========================================================================
.proc fp_tab_lo_100
	:10 dta <[100*#]
.endp

;==========================================================================
.proc fp_fasc
dotcntr = fr2
expflg = fr2+1
absexp = fr2+2
expval = fr2+3
	jsr		fp_ldbufa
	ldy		#0
	sty		expval

	;check if number is zero
	ldx		fr0
	bne		notzero
	
	lda		#$b0
	sta		(inbuff),y
	rts
	
notzero:
	;check if number is negative
	bpl		ispos
	lda		#'-'
	sta		(inbuff),y
	iny
ispos:

	;compute digit offset to place dot
	;  0.001 (10.0E-04) = 3E 10 00 00 00 00 -> -1
	;   0.01 ( 1.0E-02) = 3F 01 00 00 00 00 -> 1
	;    0.1 (10.0E-02) = 3F 10 00 00 00 00 -> 1
	;    1.0 ( 1.0E+00) = 40 01 00 00 00 00 -> 3
	;   10.0 (10.0E+00) = 40 10 00 00 00 00 -> 3
	;  100.0 ( 1.0E+02) = 40 01 00 00 00 00 -> 5
	; 1000.0 (10.0E+02) = 40 10 00 00 00 00 -> 5
	txa
	asl
	sec
	sbc		#125

	;check if we should go to exponential form (exp >= 10 or <=-3)
	bmi		exp
	cmp		#12
	bcc		noexp

exp:
	;compute and stash explicit exponent
	sec
	sbc		#2
	sta		expval
	
	;reset dot counter
	lda		#2

noexp:
	sta		dotcntr
	
	;set up for 5 mantissa bytes
	ldx		#$80-5
	
	;check if number is less than 1.0 and init dot counter
	cmp		#2
	bcs		not_tiny
	
	;insert a sixth mantissa byte
	mva		#0 fr0
	inc		dotcntr
	inc		dotcntr
	dex
not_tiny:
	
	;check if number begins with a leading zero, and if so, skip high digit
	lda		fr0+6-$80,x
	cmp		#$10
	bcs		digitloop
	lda		#$fe
	and		expval
	sta		expval
	bne		writelow
	dec		dotcntr
	bcc		writelow

	;write out mantissa digits
digitloop:
	dec		dotcntr
	bne		no_hidot
	lda		#'.'
	sta		(inbuff),y
	iny
no_hidot:

	;write out high digit
	lda		fr0+6-$80,x
	lsr
	lsr
	lsr
	lsr
	ora		#$30
	sta		(inbuff),y
	iny
	
writelow:
	;write out low digit
	dec		dotcntr
	bne		no_lodot
	lda		#'.'
	sta		(inbuff),y
	iny
no_lodot:
	
	lda		fr0+6-$80,x
	and		#$0f
	ora		#$30
	sta		(inbuff),y
	iny

	;next digit
	inx
	bpl		digitloop

	;skip trim if dot hasn't been written
	lda		dotcntr
	bpl		skip_zero_trim	
	
	;trim off leading zeroes
	lda		#'0'
lzloop:
	dey
	cmp		(inbuff),y
	beq		lzloop

	;trim off dot
	lda		(inbuff),y
	cmp		#'.'
	bne		no_trailing_dot

skip_zero_trim:
	dey
	lda		(inbuff),y
no_trailing_dot:

	;check if we have an exponent to deal with
	ldx		expval
	beq		noexp2
	
	;print an 'E'
	lda		#'E'
	iny
	sta		(inbuff),y
	
	;check for a negative exponent
	txa
	bpl		exppos
	eor		#$ff
	tax
	inx
	lda		#'-'
	dta		{bit $0100}
exppos:
	lda		#'+'
expneg:
	iny
	sta		(inbuff),y
	
	;print tens digit, if any
	txa
	sec
	ldx		#$2f
tensloop:
	inx
	sbc		#10
	bcs		tensloop
	pha
	txa
	iny
	sta		(inbuff),y
	pla
	adc		#$3a
	iny
noexp2:
	;set high bit on last char
	ora		#$80
	sta		(inbuff),y
	rts
.endp

;==========================================================================
fp_mul10:
	dta		0,10,20,30,40,50,60,70,80,90

;==========================================================================
; IFP [D9AA]	Convert 16-bit integer at FR0 to FP
;
; !NOTE! Cannot use FR2/FR3 -- MAC/65 requires that $DE-DF be preserved.
;
.proc fp_ifp
	sed

	ldx		#fr0+2
	ldy		#5
	jsr		fp_zfl
	
	ldy		#16
byteloop:
	;shift out binary bit
	asl		fr0
	rol		fr0+1
	
	;shift in BCD bit
	lda		fr0+4
	adc		fr0+4
	sta		fr0+4
	lda		fr0+3
	adc		fr0+3
	sta		fr0+3
	rol		fr0+2
	
	dey
	bne		byteloop
	
	lda		#$43
	sta		fr0

	jmp		fp_normalize_cld
.endp

;==========================================================================
; FPI [D9D2]	Convert FR0 to 16-bit integer at FR0 with rounding
;
; This cannot overwrite FR1. Darg relies on being able to stash a value
; there across a call to FPI in its startup.
;
.nowarn .proc fp_fpi
_acc0 = fr2
_acc1 = fr2+1
	
	;error out if it's guaranteed to be too big or negative (>999999)
	lda		fr0
	cmp		#$43
	bcs		err

	;zero number if it's guaranteed to be too small (<0.01)
	sbc		#$3f-1			;!!- carry is clear
	bcc		fp_zfr0

	tax
	
	;clear temp accum and set up rounding
	lda		#0
	ldy		fr0+1,x
	cpy		#$50
	rol						;!! - clears carry too
	sta		fr0
	lda		#0

	;check for [0.01, 1)
	dex
	bmi		done

	;convert ones/tens digit pair to binary (one result byte: 0-100)
	lda		fr0+1,x
	jsr		fp_dectobin
	adc		fr0
	adc		fp_dectobin_tab,y
	clc
	sta		fr0
	lda		#0

	;check if we're done
	dex
	bmi		done

	;convert hundreds/thousands digit pair to binary (two result bytes: 0-10000)
	lda		fr0+1,x
	jsr		fp_dectobin
	lda		fr0
	adc		fp_tab_lo_1000,y
	sta		fr0
	lda		fp_tab_hi_1000,y
	adc		#0
	pha
	lda		fr0+1,x
	and		#$0f
	tay
	lda		fr0
	adc		fp_tab_lo_100,y
	sta		fr0
	pla
	adc		fp_tab_hi_100,y

	;check if we're done
	dex
	bmi		done

	;convert ten thousands digit pair to binary (two result bytes: 0-100000, overflow possible)
	ldy		fr0+1,x
	cpy		#$07
	bcs		err
	tax
	tya
	asl
	asl
	asl
	asl
	adc		fr0
	sta		fr0
	txa
	adc		fp_tab_hi_10000-1,y

done:
	;move result back to FR0, with rounding
	sta		fr0+1
err:
	rts
.endp

;==========================================================================
; ZFR0 [DA44]	Zero FR0
; ZF1 [DA46]	Zero float at (X)
; ZFL [DA48]	Zero float at (X) with length Y (UNDOCUMENTED)
;
fp_zfr0:
	ldx		#fr0
fp_zf1:
	ldy		#6
fp_zfl:
	lda		#0
zflloop:
	sta		0,x
	inx
	dey
	bne		zflloop
	rts

;==========================================================================
; LDBUFA [DA51]	Set LBUFF to #INBUFF (UNDOCUMENTED)
;
fp_ldbufa:
	mwa		#lbuff inbuff
	rts

;==========================================================================
; FSUB [DA60]	Subtract FR1 from FR0; FR1 is altered
; FADD [DA66]	Add FR1 to FR0; FR1 is altered
fp_fadd = fp_fsub._fadd
.proc fp_fsub

_diffmode = fr1

	;toggle sign on FR1
	lda		fr1
	eor		#$80
	sta		fr1
	
	;fall through to FADD
_fadd:
	;if fr1 is zero, we're done
	lda		fr1
	beq		sum_xit
	
	;if fr0 is zero, swap
	lda		fr0
	beq		swap

	;compute difference in exponents, ignoring sign
	lda		fr1			;load fr1 sign
	eor		fr0			;compute fr0 ^ fr1 signs
	and		#$80		;mask to just sign
	tax
	eor		fr1			;flip fr1 sign to match fr0
	clc
	sbc		fr0			;compute difference in exponents - 1
	bcc		noswap
	
	;swap FR0 and FR1
swap:
	jsr		fp_swap
	
	;loop back and retry
	bmi		_fadd
	
noswap:	
	;A = FR1 - FR0 - 1
	;X = add/sub flag

	;compute positions for add/subtract	
	adc		#6			;A = (FR1) - (FR0) + 6   !! carry is clear coming in
	tay
	
	;check if FR1 is too small in magnitude to matter
	bmi		sum_xit
	
	;jump to decimal mode and prepare for add/sub loops
	sed

	;check if we are doing a sum or a difference
	cpx		#$80
	ldx		#5
	bcs		do_subtract
	
	;set up rounding
	lda		#0
	cpy		#5
	bcs		add_no_round
	lda		fr1+1,y
add_no_round:
	cmp		#$50
		
	;add mantissas
	tya
	beq		post_add_loop
add_loop:
	lda		fr1,y
	adc		fr0,x
	sta		fr0,x
	dex
	dey
	bne		add_loop
post_add_loop:
		
	;check if we had a carry out
	bcc		sum_xit
	
	;carry it up
	bcs		sum_carryloop_start
sum_carryloop:
	lda		fr0+1,x
	adc		#0
	sta		fr0+1,x
	bcc		sum_xit
sum_carryloop_start:
	dex
	bpl		sum_carryloop

	jsr		fp_carry_expup
	
sum_xit:
	;exit decimal mode
	;normalize if necessary and exit (needed for borrow, as well to check over/underflow)
	jmp		fp_normalize_cld

do_subtract:
	;subtract FR0 and FR1 mantissas (!! carry is set coming in)
	sty		fr1
	bcs		sub_loop_entry
sub_loop:
	lda		fr0,x
	sbc		fr1+1,y
	sta		fr0,x
	dex
sub_loop_entry:
	dey
	bpl		sub_loop
	jmp		fp_fsub_cont
.endp

;==========================================================================
; Entry:
;	A = BCD value
;	P.D = clear
;
; Exit:
;	A = binary value
;	Y = modified
;
.proc fp_dectobin
	pha
	lsr
	lsr
	lsr
	lsr
	tay
	pla
	clc
	rts
.endp

;==========================================================================
; FMUL [DADB]:	Multiply FR0 * FR1 -> FR0
;
fp_fld1r_const_fmul:
	ldy		#>fpconst_ten
fp_fld1r_fmul:
	jsr		fp_fld1r
.proc fp_fmul

	;We use FR0:FR3 as a double-precision accumulator, and copy the
	;original multiplicand value in FR0 to FR1. The multiplier in
	;FR1 is converted to binary digit pairs into FR2.
	
_offset = _fr3+5
_offset2 = fr2

	;if FR0 is zero, we're done
	lda		fr0
	beq		xit
	
	;if FR1 is zero, zero FR0 and exit
	lda		fr1
	bne		nonzero
	clc
	jmp		fp_zfr0

nonzero:
	
	;move fr0 to fr2
	jsr		fp_fmul_fr0_to_binfr2
	
	;compute new exponent and stash
	lda		fr1
	clc
	jsr		fp_adjust_exponent.fmul_entry
	
	sta		fr0

	jmp		fp_fmul_innerloop

xit:
	clc
	rts	
.endp

.proc fp_adjust_exponent
fdiv_entry:
	lda		fr1
	eor		#$7f
	sec
fmul_entry:
	;stash modified exp1
	tax
	
	;compute new sign
	eor		fr0
	and		#$80
	sta		fr1
	
	;merge exponents
	txa
	adc		fr0
	tax
	eor		fr1
	
	;check for underflow/overflow
	cmp		#128-49
	bcc		underflow_overflow
	
	cmp		#128+49
	bcs		underflow_overflow
	
	;rebias exponent
	txa
	sec
	sbc		#$40
	rts
	
underflow_overflow:
	pla
	pla
	jmp		fp_zfr0
.endp

;==========================================================================
	.pages 1	;optimized by fp_fld1r_const_fmul
	
fpconst_ten:
	.fl		10

fpconst_ln10:
	.fl		2.3025850929940456840179914546844

	.endpg
;==========================================================================
; FDIV [DB28]	Divide FR0 / FR1 -> FR0
;
; Compatibility:
;	- It is important that FDIV rounds if FADD/FMUL do. Otherwise, some
;	  forms of square root computation can have a slight error on integers,
;	  which breaks TICKTOCK.BAS.
;
.proc fp_fdiv
_digit = _fr3+1
_index = _fr3+2
	;check if dividend is zero
	lda		fr0
	beq		ok
	
	;check if divisor is zero
	lda		fr1
	beq		err
	
	ldx		#fr2
	jsr		fp_zf1
	lda		#$50
	sta		fr2+6
	
	;compute new exponent
	jsr		fp_adjust_exponent.fdiv_entry
	
	jsr		fp_fdiv_init	
	sec

digitloop:
	;just keep going if we're accurate
	lda		fr0
	ora		fr0+1
	beq		nextdigit
	
	;check if we should either divide or add based on current sign (stored in carry)
	bcc		incloop

decloop:
	;increment quotient mantissa byte
	lda		_digit
	ldx		_index
uploop:
	adc		fr2+7-$80,x
	sta		fr2+7-$80,x
	lda		#0
	dex
	bcs		uploop

	;subtract mantissas
	jsr		fp_fastsub5
	lda		fr0
	sbc		#0
	sta		fr0

	;keep going until we underflow
	bcs		decloop
	bcc		nextdigit
	
incloop:
	;decrement quotient mantissa byte
	lda		#0
	sbc		_digit
	ldx		_index
downloop:
	adc		fr2+7-$80,x
	sta		fr2+7-$80,x
	lda		#$99
	dex
	bcc		downloop
	
	;add mantissas
	ldx		#fr0
	jsr		fp_fastadd6_fr1
	
	;keep going until we overflow
	bcc		incloop	
	
nextdigit:
	;shift dividend (make sure to save carry state)
	jsr		fp_fr0_shl4
	
	;next digit
	lda		_digit
	eor		#$09
	sta		_digit
	beq		digitloop
	
	;next quo byte
	inc		_index
	bpl		digitloop
	
	;move back to fr0
	ldx		#fr2-1
	ldy		_fr3
	lda		fr2
	bne		no_normstep
	inx
	dey
no_normstep:
	sty		0,x
	jsr		fp_fld0r_zp

	cld
ok:
	clc
	rts
err:
	sec
	rts
.endp

;==========================================================================
; SKPSPC [DBA1]	Increment CIX while INBUFF[CIX] is a space
fp_skpspc:
	lda		#' '
	ldy		cix
fp_skipchar:
skpspc_loop:
	cmp		(inbuff),y
	bne		skpspc_xit
	iny
	bne		skpspc_loop
skpspc_xit:
	sty		cix
	rts

;==========================================================================
; ISDIGT [DBAF]	Check if INBUFF[CIX] is a digit (UNDOCUMENTED)
fp_isdigt = _isdigt
.proc _isdigt
	ldy		cix
.def :fp_isdigit_y = *
	lda		(inbuff),y
	sec
	sbc		#'0'
	cmp		#10
	rts
.endp

;==========================================================================
.proc fp_fastadd6_fr1			;$36 bytes
	clc
	lda		5,x
	adc		fr1+5
	sta		5,x
	lda		4,x
	adc		fr1+4
	sta		4,x
	lda		3,x
	adc		fr1+3
	sta		3,x
	lda		2,x
	adc		fr1+2
	sta		2,x
	lda		1,x
	adc		fr1+1
	sta		1,x
	lda		0,x
	adc		fr1+0
	sta		0,x
	rts
.endp

;==========================================================================
.proc fp_fastsub5				;$20 bytes
	sec
	lda		fr0+5
	sbc		fr1+5
	sta		fr0+5
	lda		fr0+4
	sbc		fr1+4
	sta		fr0+4
	lda		fr0+3
	sbc		fr1+3
	sta		fr0+3
	lda		fr0+2
	sbc		fr1+2
	sta		fr0+2
	lda		fr0+1
	sbc		fr1+1
	sta		fr0+1
	rts
.endp

;==========================================================================
; NORMALIZE [DC00]	Normalize FR0 (UNDOCUMENTED)
fp_normalize_cld:
	cld
fp_normalize:
normalize .proc
	ldy		#5
normloop:
	lda		fr0
	and		#$7f
	beq		underflow2
	
	ldx		fr0+1
	beq		need_norm

	;Okay, we're done normalizing... check if the exponent is in bounds.
	;It needs to be within +/-48 to be valid. If the exponent is <-49,
	;we set it to zero; otherwise, we mark overflow.
	
	cmp		#64-49
	bcc		underflow
	cmp		#64+49
	rts
	
need_norm:
	dec		fr0
	ldx		#$80-5
normloop2:
	mva		fr0+7-$80,x fr0+6-$80,x
	inx
	bpl		normloop2
	stz		fr0+6
	dey
	bne		normloop
	
	;Hmm, we shifted out everything... must be zero; reset exponent. This
	;is critical since Atari Basic depends on the exponent being zero for
	;a zero result.
	sty		fr0
	sty		fr0+1
xit:
	clc
	rts
	
underflow2:
	clc
underflow:
	jmp		fp_zfr0
	
.endp

;==========================================================================
; HELPER ROUTINES
;==========================================================================

.proc fp_fdiv_init		
	sta		_fr3

	ldx		#0
	stx		fr0
	stx		fr1
	
	;check if dividend begins with a leading zero digit -- if so, shift it left 4
	;and begin with the tens digit
	lda		fr1+1
	cmp		#$10
	bcs		start_with_ones

	ldy		#4
bitloop:
	asl		fr1+5
	rol		fr1+4
	rol		fr1+3
	rol		fr1+2
	rol		fr1+1
	dey
	bne		bitloop

	ldx		#$09
	
start_with_ones:

	stx		fp_fdiv._digit
	sed

	ldx		#$80-7
	stx		fp_fdiv._index
	rts
.endp

;--------------------------------------------------------------------------
.proc fp_fsub_cont
	;check if we had a borrow
	bcs		sub_xit
	bcc		borrow_loop_start

	;propagate borrow up
borrow_loop:
	lda		fr0+1,x
	sbc		#0
	sta		fr0+1,x
	bcs		sub_xit
borrow_loop_start:
	dex
	bpl		borrow_loop

	ldx		#5
	sec
diff_borrow:
	lda		#0
	sbc		fr0,x
	sta		fr0,x
	dex
	bne		diff_borrow
	lda		#$80
	eor		fr0
	sta		fr0
sub_xit:

norm_loop:
	;Check if the exponent is in bounds.
	;It needs to be within +/-48 to be valid. If the exponent is <-49,
	;we set it to zero. Overflow isn't possible as this is the mantissa
	;subtraction path.
	lda		fr0
	and		#$7f
	cmp		#64-49
	bcc		underflow
	
	ldx		fr0+1
	beq		need_norm

	;check if we need to round, i.e.:
	; 2.00000000
	;-0.000000005
	;load rounding byte offset
	ldx		fr1
	cpx		#4
	bcs		no_round
	lda		fr1+2,x
	cmp		#$50
	bcs		round_up
no_round:

	clc
	cld
	rts
	
need_norm:
	ldx		#$80-4
scan_loop:
	dec		fr0
	ldy		fr0+6-$80,x
	bne		found_pos
	inx
	bpl		scan_loop
	
	;hmm... mantissa is all zero.
underflow2:
	clc
underflow:
	cld
	jmp		fp_zfr0
	
found_pos:
	;shift up mantissa
	ldy		#0
shift_loop:
	mva		fr0+6-$80,x fr0+1,y
	iny
	inx
	bpl		shift_loop
	
	;clear remaining mantissa bytes
	ldx		#0
clear_loop:
	stx		fr0+1,y+
	cpy		#6
	bne		clear_loop
	
	;check if we need to round
	
	
	;if not, loop back to check the exponent and exit
;	bcc		norm_loop
	beq		norm_loop
	
round_up:
	;jump back into fadd code to carry up and exit
	ldx		#5
	jmp		fp_fsub.sum_carryloop
.endp

;--------------------------------------------------------------------------
.proc fp_fmul_innerloop
_offset = _fr3+5
_offset2 = fr2

	inc		fr0

	;clear accumulator through to exponent byte of fr1
	ldx		#fr0+1
	ldy		#12
	jsr		fp_zfl

	;set up for 7 bits per digit pair (0-99 in 0-127)
	ldx		#7
	stx		_offset
	sed

	;set rounding byte, assuming renormalize needed (fr0+2 through fr0+6)
	lda		#$50
	sta		fr0+7

	;begin outer loop -- this is where we process one _bit_ out of each
	;multiplier byte in FR2's mantissa (note that this is inverted in that
	;it is bytes-in-bits instead of bits-in-bytes)
offloop:

	;begin inner loop -- here we process the same bit in each multiplier
	;byte, going from byte 5 down to byte 1
	ldx		#fr0+5
offloop2:
	;shift a bit out of fr1 mantissa
	lsr		fr2-fr0,x
	bcc		noadd
			
	;add fr1 to fr0 at offset	
	jsr		fp_fastadd6_fr1
	
	;check if we have a carry out to the upper bytes
	bcc		no_carry
	stx		_offset2
	jsr		fp_carryup.dec_entry
	ldx		_offset2
no_carry:
	
noadd:
	;go back for next byte
	dex
	cpx		#fr0
	bne		offloop2

	;double fr1
	clc
	jsr		fp_fastdbl_fr1

	;loop back until all mantissa bytes finished
	dec		_offset
	bne		offloop
	
	;check if no renormalize is needed, and if so, re-add new rounding
	lda		fr0+1
	beq		renorm_needed

	lda		#$50
	ldx		#fr0+6
	jsr		fp_carryup

renorm_needed:
	;all done
	jmp		fp_normalize_cld
.endp

;==========================================================================
.proc fp_fmul_fr0_to_binfr2		;$15 bytes
	ldx		#4
loop:
	lda		fr0+1,x
	lsr
	lsr
	lsr
	lsr
	tay
	clc
	lda		fr0+1,x
	adc		fp_dectobin_tab,y
	sta		fr2+1,x
	dex
	bpl		loop
	rts
.endp

;==========================================================================
.proc fp_tab_lo_1000
	:10 dta <[1000*#]
.endp

.proc fp_tab_hi_1000
	:10 dta >[1000*#]
.endp

.proc fp_tab_hi_100
	:10 dta >[100*#]
.endp

.proc fp_tab_hi_10000
	:6 dta >[10000*[#+1]]
.endp

;==========================================================================
; PLYEVL [DD40]	Eval polynomial at (X:Y) with A coefficients using FR0
;
fp_plyevl_10:
	lda		#10
.nowarn .proc fp_plyevl
	;stash arguments
	stx		fptr2
	sty		fptr2+1
	sta		_fpcocnt
	
	;copy FR0 -> PLYARG
	ldx		#<plyarg
	ldy		#>plyarg
	jsr		fp_fst0r
	
	jsr		fp_zfr0
	
loop:
	;load next coefficient and increment coptr
	lda		fptr2
	tax
	clc
	adc		#6
	sta		fptr2
	ldy		fptr2+1
	scc:inc	fptr2+1
	jsr		fp_fld1r

	;add coefficient to acc
	jsr		fp_fadd
	bcs		xit

	dec		_fpcocnt
	beq		xit
	
	;copy PLYARG -> FR1
	;multiply accumulator by Z and continue
	ldx		#<plyarg
	ldy		#>plyarg	
	jsr		fp_fld1r_fmul
	bcc		loop
xit:
	rts
.endp

;==========================================================================
.proc fp_swap
	ldx		#5
swaploop:
	lda		fr0,x
	ldy		fr1,x
	sta		fr1,x
	sty		fr0,x
	dex
	bpl		swaploop
	rts
.endp

;==========================================================================
; FLD0R [DD89]	Load FR0 from (X:Y)
; FLD0P [DD8D]	Load FR0 from (FLPTR)
;
fp_fld0r_zp:
	ldy		#0
fp_fld0r:
	stx		flptr
	sty		flptr+1
fp_fld0p:
	ldy		#5
fld0ploop:
	lda		(flptr),y
	sta		fr0,y
	dey
	bpl		fld0ploop
	rts

;==========================================================================
; FLD1R [DD98]	Load FR1 from (X:Y)
; FLD1P [DD9C]	Load FR1 from (FLPTR)
;
fp_fld1r:
	stx		flptr
	sty		flptr+1
fp_fld1p:
	ldy		#5
fld1ploop:
	lda		(flptr),y
	sta		fr1,y
	dey
	bpl		fld1ploop
	rts

;==========================================================================
; FST0R [DDA7]	Store FR0 to (X:Y)
; FST0P [DDAB]	Store FR0 to (FLPTR)
;
fp_fst0r:
	stx		flptr
	sty		flptr+1
fp_fst0p:
	ldy		#5
fst0ploop:
	lda		fr0,y
	sta		(flptr),y
	dey
	bpl		fst0ploop
	rts

;==========================================================================
; FMOVE [DDB6]	Move FR0 to FR1
;
.proc fp_fmove
	ldx		#5
fmoveloop:
	lda		fr0,x
	sta		fr1,x
	dex
	bpl		fmoveloop
	rts
.endp

;==========================================================================
; EXP [DDC0]	Compute e^x
; EXP10 [DDCC]	Compute 10^x
;
fp_exp10 = fp_exp._exp10
.proc fp_exp
	ldx		#<fpconst_log10_e
	ldy		#>fpconst_log10_e
	jsr		fp_fld1r		;we could use fp_fld1r_fmul, but then we have a hole :(
	jsr		fp_fmul
	bcs		err2
_exp10:
	;stash sign and compute abs
	asl		fr0
	ror		_fptemp1
	lsr		fr0

	ldy		#0
	
	;check for |exp| >= 100 which would guarantee over/underflow
	sec
	sbc		#$40
	bcc		abs_ok
	bne		abs_too_big
	
	;|exp|>=1, so split it into integer/fraction
	lda		fr0+1
	jsr		fp_dectobin
	adc		fp_dectobin_tab,y
	tay

	dec		fr0
	ldx		#$80-4
frac_loop:
	lda		fr0+6-$80,x
	sta		fr0+5-$80,x
	inx
	bpl		frac_loop
	stz		fr0+5
	bne		abs_ok
	
abs_too_big:
	;okay, the |x| is too big... check if the original was negative.
	;if so, zero and exit, otherwise error.
	lda		_fptemp1
	bpl		err2
	clc
	jmp		fp_zfr0
		
abs_ok:
	;stash integer portion of exponent
	sty		_fptemp0
		
	;compute approximation z = 10^y
	ldx		#<coeff
	ldy		#>coeff
	jsr		fp_plyevl_10
	
	;tweak exponent
	lsr		_fptemp0
	
	;scale by 10 if necessary
	bcc		even
	ldx		#<fpconst_ten
	jsr		fp_fld1r_const_fmul
	bcs		abs_too_big
even:

	;bias exponent
	lda		_fptemp0
	adc		fr0
	cmp		#64+49
	bcs		err2
	sta		fr0
	
	;check if we should invert
	rol		_fptemp1
	bcc		xit2
	
	jsr		fp_fmove
	ldx		#<fp_one
	ldy		#>fp_one
	jsr		fp_fld0r
	jmp		fp_fdiv

err2:
xit2:
	rts
	
coeff:		;Minimax polynomial for 10^x over 0 <= x < 1
	.fl		 0.0146908308
	.fl		-0.002005331171
	.fl		 0.0919452045
	.fl		 0.1921383884
	.fl		 0.5447325197
	.fl		 1.17018250
	.fl		 2.03478581
	.fl		 2.65094494
	.fl		 2.30258512
	.fl		 1
.endp	

;==========================================================================
.proc fp_fastdbl_fr1
	lda		fr1+5
	adc		fr1+5
	sta		fr1+5
	lda		fr1+4
	adc		fr1+4
	sta		fr1+4
	lda		fr1+3
	adc		fr1+3
	sta		fr1+3
	lda		fr1+2
	adc		fr1+2
	sta		fr1+2
	lda		fr1+1
	adc		fr1+1
	sta		fr1+1
	lda		fr1+0
	adc		fr1+0
	sta		fr1+0
xit:
	rts
.endp

;==========================================================================
fpconst_log10_e:
	.fl		0.43429448190325182765112891891661

;==========================================================================
; REDRNG [DE95]	Reduce range via y = (x-C)/(x+C) (undocumented)
;
; X:Y = pointer to C argument
;
fp_redrng = _redrng
.proc _redrng
	stx		fptr2
	sty		fptr2+1
	jsr		fp_fld1r
	ldx		#<fpscr
	ldy		#>fpscr
	jsr		fp_fst0r
	jsr		fp_fadd
	bcs		fail
	ldx		#<plyarg
	ldy		#>plyarg
	jsr		fp_fst0r
	ldx		#<fpscr
	ldy		#>fpscr
	jsr		fp_fld0r
	ldx		fptr2
	ldy		fptr2+1
	jsr		fp_fld1r
	jsr		fp_fsub
	bcs		fail
	ldx		#<plyarg
	ldy		#>plyarg
	jsr		fp_fld1r
	jmp		fp_fdiv
	
fail = fp_fastdbl_fr1.xit
.endp

;==========================================================================
; LOG [DECD]	Compute ln x
; LOG10 [DED1]	Compute log10 x
;
fp_log10 = fp_log._log10
.proc fp_log
	lsr		_fptemp1
	bpl		entry
_log10:
	sec
	ror		_fptemp1
entry:
	;throw error on negative number
	lda		fr0
	bmi		err
	
	;stash exponentx2 - 128
	asl
	eor		#$80
	sta		_fptemp0
	
	;raise error if argument is zero
	lda		fr0+1
	beq		err
	
	;reset exponent so we are in 1 <= z < 100
	ldx		#$40
	stx		fr0
	
	;split into three ranges based on mantissa:
	;  1/sqrt(10) <= x < 1:            [31, 99] divide by 100
	;  sqrt(10)/100 <= x < 1/sqrt(10): [ 3, 30] divide by 10
	;  0 < x < sqrt(10)/100:           [ 1,  2] leave as-is
	
	cmp		#$03
	bcc		post_range_adjust
	cmp		#$31
	bcc		mid_range

	;increase result by 1 (equivalent to *10 input)
	inc		_fptemp0
	bne		adjust_exponent
	
mid_range:
	;multiply by 10
	ldx		#<fpconst_ten
	jsr		fp_fld1r_const_fmul
	bcs		err2

adjust_exponent:
	;increase result by 1 (equivalent to *10 input)
	inc		_fptemp0
	
	;divide fraction by 100
	dec		fr0
	
post_range_adjust:
	;at this point, we have 0.30 <= z <= 3; apply y = (z-1)/(z+1) transform
	;so we can use a faster converging series... this reduces y to
	;0 <= y < 0.81
	ldx		#<fp_one
	ldy		#>fp_one
	jsr		fp_redrng
	
	;stash y so we can later multiply it back in
	ldx		#<fpscr
	ldy		#>fpscr
	jsr		fp_fst0r
	
	;square the value so we compute a series on y^2n
	jsr		fp_fmove
	jsr		fp_fmul
	
	;do polynomial expansion
	ldx		#<fpconst_log10coeff
	ldy		#>fpconst_log10coeff
	jsr		fp_plyevl_10
	bcs		err2
	
	;multiply back in so we have series on y^(2n+1)
	ldx		#<fpscr
	ldy		#>fpscr
	jsr		fp_fld1r_fmul
	
	;stash
	jsr		fp_fmove
	
	;convert exponent adjustment back to float (signed)
	lda		#0
	sta		fr0+1
	ldx		_fptemp0
	bpl		expadj_positive
	sec
	sbc		_fptemp0
	tax
expadj_positive:
	stx		fr0
	jsr		fp_ifp
	
	;merge (cannot fail)
	asl		fr0
	asl		_fptemp0
	ror		fr0
	jsr		fp_fadd
	
	;scale if doing log
	bit		_fptemp1
	bmi		xit2
	
	ldx		#<fpconst_ln10
	jmp		fp_fld1r_const_fmul

err:
	sec
xit2:
err2:
	rts
.endp

;==========================================================================
.proc fp_fr0_shl4				;$14 bytes
	php
	ldx		#4
bitloop:
	asl		fr0+5
	rol		fr0+4
	rol		fr0+3
	rol		fr0+2
	rol		fr0+1
	rol		fr0
	dex
	bne		bitloop
	plp
	rts
.endp

;==========================================================================
.proc fp_carry_expup
	;adjust exponent
	inc		fr0

	;shift down FR0
	ldx		#4
sum_shiftloop:
	lda		fr0,x
	sta		fr0+1,x
	dex
	bne		sum_shiftloop
	
	;add a $01 at the top
	inx
	stx		fr0+1
	rts
.endp

;==========================================================================
; HALF (used by Atari BASIC)
;
fpconst_half:
	.fl		0.5
	
;==========================================================================
; log10(x) coefficients
;
; LOG10 computes:
;							-0.30 <= z <= 3.0
;	y = (z-1)/(z+1)			-0.54 <= y <= 0.5
;	x = y^2					0 <= x <= 0.29
;	log10(z) = f(x)*y
;
; Therefore:
;	f(x) = log10((1+y)/(1-y))/y
;
fpconst_log10coeff:		;Maclaurin series expansion for log10((z-1)/(z+1))
	.fl		 0.2026227154
	.fl		-0.0732044921
	.fl		 0.1060983564
	.fl		 0.0560417329
	.fl		 0.0804188407
	.fl		 0.0963916015
	.fl		 0.1240896135
	.fl		 0.1737176646
	.fl		 0.2895296558
	.fl		 0.8685889638

;==========================================================================
; Arctangent coefficients
;
; The 11 coefficients here form a power series approximation
; f(x^2) ~= atn(x)/x. This is not an official feature of the math pack but
; is relied upon by BASIC.
;
; We used to use the coefficients from Abramowitz & Stegun 4.4.49 here, but
; there seems to be an error there such that the result falls far short
; of the specified 2x10^-8 accuracy over 0<=x<=1 at x=1. Instead, we now
; use a custom minimax polynomial for f(y)=atn(sqrt(y))/sqrt(y) where y=x^2.
;
fpconst_atncoef:	;coefficients for atn(x)/x ~= f(x^2)
			;see Abramowitz & Stegun 4.4.49
		
	.fl		 0.001112075881		;x**10*1.11207588057982e-3
	.fl		-0.007304087520		;x**9*-7.30408751951452e-3
	.fl		 0.0224965573		;x**8*2.24965572957342e-2
	.fl		-0.0446185172		;x**7*-4.46185172165888e-2
	.fl		 0.0673463245		;x**6*6.73463245104305e-2
	.fl		-0.0880690664		;x**5*-8.80690663570546e-2
	.fl		 0.1105667499		;x**4*1.10566749879313e-1
	.fl		-0.1427949312		;x**3*-1.42794931245212e-1
	.fl		 0.1999963060		;x**2*1.99996306023439e-1
	.fl		-0.3333332472		;x**1*-3.33333247188074e-1
									;x**0*9.99999999667198e-1
fp_one:
	.fl		1.0				;also an arctan coeff
fp_pi4:	;pi/4 - needed by Atari Basic ATN()
	.fl		0.78539816339744830961566084581988
	
fp_dectobin_tab:
	:10 dta	<[-6*#]
