shithub: qk1

--- a/asm_draw.h

+++ /dev/null

@@ -1,132 +1,0 @@

-//

-// asm_draw.h

-//

-// Include file for asm drawing routines.

-//

-//

-// !!! note that this file must match the corresponding C structures at all

-// times !!!

-//

-// !!! if this is changed, it must be changed in r_local.h too !!!

-#define	NEAR_CLIP	0.01

-// !!! if this is changed, it must be changed in r_local.h too !!!

-#define	CYCLE	128

-// espan_t structure

-// !!! if this is changed, it must be changed in r_shared.h too !!!

-#define espan_t_u    	0

-#define espan_t_v	    4

-#define espan_t_count   8

-#define espan_t_pnext	12

-#define espan_t_size    16

-// sspan_t structure

-// !!! if this is changed, it must be changed in d_local.h too !!!

-#define sspan_t_u    	0

-#define sspan_t_v	    4

-#define sspan_t_count   8

-#define sspan_t_size    12

-// spanpackage_t structure

-// !!! if this is changed, it must be changed in d_polyset.c too !!!

-#define spanpackage_t_pdest				0

-#define spanpackage_t_pz				4

-#define spanpackage_t_count				8

-#define spanpackage_t_ptex				12

-#define spanpackage_t_sfrac				16

-#define spanpackage_t_tfrac				20

-#define spanpackage_t_light				24

-#define spanpackage_t_zi				28

-#define spanpackage_t_size				32

-// edge_t structure

-// !!! if this is changed, it must be changed in r_shared.h too !!!

-#define et_u			0

-#define et_u_step		4

-#define et_prev			8

-#define et_next			12

-#define et_surfs		16

-#define et_nextremove	20

-#define et_nearzi		24

-#define et_owner		28

-#define et_size			32

-// surf_t structure

-// !!! if this is changed, it must be changed in r_shared.h too !!!

-#define SURF_T_SHIFT	6

-#define st_next			0

-#define st_prev			4

-#define st_spans		8

-#define st_key			12

-#define st_last_u		16

-#define st_spanstate	20

-#define st_flags		24

-#define st_data			28

-#define st_entity		32

-#define st_nearzi		36

-#define st_insubmodel	40

-#define st_d_ziorigin	44

-#define st_d_zistepu	48

-#define st_d_zistepv	52

-#define st_pad			56

-#define st_size			64

-// clipplane_t structure

-// !!! if this is changed, it must be changed in r_local.h too !!!

-#define cp_normal		0

-#define cp_dist			12

-#define cp_next			16

-#define cp_leftedge		20

-#define cp_rightedge	21

-#define cp_reserved		22

-#define cp_size			24

-// medge_t structure

-// !!! if this is changed, it must be changed in model.h too !!!

-#define me_v				0

-#define me_cachededgeoffset	4

-#define me_size				8

-// mvertex_t structure

-// !!! if this is changed, it must be changed in model.h too !!!

-#define mv_position		0

-#define mv_size			12

-// refdef_t structure

-// !!! if this is changed, it must be changed in render.h too !!!

-#define rd_vrect					0

-#define rd_aliasvrect				20

-#define rd_vrectright				40

-#define rd_vrectbottom				44

-#define rd_aliasvrectright			48

-#define rd_aliasvrectbottom			52

-#define rd_vrectrightedge			56

-#define rd_fvrectx					60

-#define rd_fvrecty					64

-#define rd_fvrectx_adj				68

-#define rd_fvrecty_adj				72

-#define rd_vrect_x_adj_shift20		76

-#define rd_vrectright_adj_shift20	80

-#define rd_fvrectright_adj			84

-#define rd_fvrectbottom_adj			88

-#define rd_fvrectright				92

-#define rd_fvrectbottom				96

-#define rd_horizontalFieldOfView	100

-#define rd_xOrigin					104

-#define rd_yOrigin					108

-#define rd_vieworg					112

-#define rd_viewangles				124

-#define rd_ambientlight				136

-#define rd_size						140

-// mtriangle_t structure

-// !!! if this is changed, it must be changed in model.h too !!!

-#define mtri_facesfront		0

-#define mtri_vertindex		4

-#define mtri_size			16	// !!! if this changes, array indexing in !!!

-								// !!! d_polysa.s must be changed to match !!!

-#define mtri_shift			4

--- a/asm_i386.h

+++ /dev/null

@@ -1,78 +1,0 @@

-#ifndef __ASM_I386__

-#define __ASM_I386__

-#ifdef ELF

-#define C(label) label

-#endif

-#ifndef ELF

-#define C(label) _##label

-#endif

-//

-// !!! note that this file must match the corresponding C structures at all

-// times !!!

-//

-// plane_t structure

-// !!! if this is changed, it must be changed in model.h too !!!

-// !!! if the size of this is changed, the array lookup in SV_HullPointContents

-//     must be changed too !!!

-#define pl_normal	0

-#define pl_dist		12

-#define pl_type		16

-#define pl_signbits	17

-#define pl_pad		18

-#define pl_size		20

-// hull_t structure

-// !!! if this is changed, it must be changed in model.h too !!!

-#define	hu_clipnodes		0

-#define	hu_planes			4

-#define	hu_firstclipnode	8

-#define	hu_lastclipnode		12

-#define	hu_clip_mins		16

-#define	hu_clip_maxs		28

-#define hu_size  			40

-// dnode_t structure

-// !!! if this is changed, it must be changed in bspfile.h too !!!

-#define	nd_planenum		0

-#define	nd_children		4

-#define	nd_mins			8

-#define	nd_maxs			20

-#define	nd_firstface	32

-#define	nd_numfaces		36

-#define nd_size			40

-// sfxcache_t structure

-// !!! if this is changed, it must be changed in sound.h too !!!

-#define sfxc_length		0

-#define sfxc_loopstart	4

-#define sfxc_speed		8

-#define sfxc_width		12

-#define sfxc_stereo		16

-#define sfxc_data		20

-// channel_t structure

-// !!! if this is changed, it must be changed in sound.h too !!!

-#define ch_sfx			0

-#define ch_leftvol		4

-#define ch_rightvol		8

-#define ch_end			12

-#define ch_pos			16

-#define ch_looping		20

-#define ch_entnum		24

-#define ch_entchannel	28

-#define ch_origin		32

-#define ch_dist_mult	44

-#define ch_master_vol	48

-#define ch_size			52

-// portable_samplepair_t structure

-// !!! if this is changed, it must be changed in sound.h too !!!

-#define psp_left		0

-#define psp_right		4

-#define psp_size		8

-#endif

--- a/block16.h

+++ /dev/null

@@ -1,123 +1,0 @@

-LEnter16_16:

-	movb	(%esi),%al

-	movb	(%esi,%ebx,),%cl

-	movb	%dh,%ah

-	addl	%ebp,%edx

-	movb	%dh,%ch

-	leal	(%esi,%ebx,2),%esi

-	movw	0x12345678(,%eax,2),%ax

-LBPatch0:

-	addl	%ebp,%edx

-	movw	%ax,(%edi)

-	movw	0x12345678(,%ecx,2),%cx

-LBPatch1:

-	movw	%cx,2(%edi)

-	addl	$0x4,%edi

-	movb	(%esi),%al

-	movb	(%esi,%ebx,),%cl

-	movb	%dh,%ah

-	addl	%ebp,%edx

-	movb	%dh,%ch

-	leal	(%esi,%ebx,2),%esi

-	movw	0x12345678(,%eax,2),%ax

-LBPatch2:

-	addl	%ebp,%edx

-	movw	%ax,(%edi)

-	movw	0x12345678(,%ecx,2),%cx

-LBPatch3:

-	movw	%cx,2(%edi)

-	addl	$0x4,%edi

-	movb	(%esi),%al

-	movb	(%esi,%ebx,),%cl

-	movb	%dh,%ah

-	addl	%ebp,%edx

-	movb	%dh,%ch

-	leal	(%esi,%ebx,2),%esi

-	movw	0x12345678(,%eax,2),%ax

-LBPatch4:

-	addl	%ebp,%edx

-	movw	%ax,(%edi)

-	movw	0x12345678(,%ecx,2),%cx

-LBPatch5:

-	movw	%cx,2(%edi)

-	addl	$0x4,%edi

-	movb	(%esi),%al

-	movb	(%esi,%ebx,),%cl

-	movb	%dh,%ah

-	addl	%ebp,%edx

-	movb	%dh,%ch

-	leal	(%esi,%ebx,2),%esi

-	movw	0x12345678(,%eax,2),%ax

-LBPatch6:

-	addl	%ebp,%edx

-	movw	%ax,(%edi)

-	movw	0x12345678(,%ecx,2),%cx

-LBPatch7:

-	movw	%cx,2(%edi)

-	addl	$0x4,%edi

-LEnter8_16:

-	movb	(%esi),%al

-	movb	(%esi,%ebx,),%cl

-	movb	%dh,%ah

-	addl	%ebp,%edx

-	movb	%dh,%ch

-	leal	(%esi,%ebx,2),%esi

-	movw	0x12345678(,%eax,2),%ax

-LBPatch8:

-	addl	%ebp,%edx

-	movw	%ax,(%edi)

-	movw	0x12345678(,%ecx,2),%cx

-LBPatch9:

-	movw	%cx,2(%edi)

-	addl	$0x4,%edi

-	movb	(%esi),%al

-	movb	(%esi,%ebx,),%cl

-	movb	%dh,%ah

-	addl	%ebp,%edx

-	movb	%dh,%ch

-	leal	(%esi,%ebx,2),%esi

-	movw	0x12345678(,%eax,2),%ax

-LBPatch10:

-	addl	%ebp,%edx

-	movw	%ax,(%edi)

-	movw	0x12345678(,%ecx,2),%cx

-LBPatch11:

-	movw	%cx,2(%edi)

-	addl	$0x4,%edi

-LEnter4_16:

-	movb	(%esi),%al

-	movb	(%esi,%ebx,),%cl

-	movb	%dh,%ah

-	addl	%ebp,%edx

-	movb	%dh,%ch

-	leal	(%esi,%ebx,2),%esi

-	movw	0x12345678(,%eax,2),%ax

-LBPatch12:

-	addl	%ebp,%edx

-	movw	%ax,(%edi)

-	movw	0x12345678(,%ecx,2),%cx

-LBPatch13:

-	movw	%cx,2(%edi)

-	addl	$0x4,%edi

-LEnter2_16:

-	movb	(%esi),%al

-	movb	(%esi,%ebx,),%cl

-	movb	%dh,%ah

-	addl	%ebp,%edx

-	movb	%dh,%ch

-	leal	(%esi,%ebx,2),%esi

-	movw	0x12345678(,%eax,2),%ax

-LBPatch14:

-	addl	%ebp,%edx

-	movw	%ax,(%edi)

-	movw	0x12345678(,%ecx,2),%cx

-LBPatch15:

-	movw	%cx,2(%edi)

-	addl	$0x4,%edi

--- a/d_draw.s

+++ /dev/null

@@ -1,1018 +1,0 @@

-//

-// d_draw.s

-// x86 assembly-language horizontal 8-bpp span-drawing code.

-//

-#include "asm_i386.h"

-#include "quakeasm.h"

-#include "asm_draw.h"

-#include "d_ifacea.h"

-#ifdef	id386

-//----------------------------------------------------------------------

-// 8-bpp horizontal span drawing code for polygons, with no transparency.

-//

-// Assumes there is at least one span in pspans, and that every span

-// contains at least one pixel

-//----------------------------------------------------------------------

-	.text

-// out-of-line, rarely-needed clamping code

-LClampHigh0:

-	movl	C(bbextents),%esi

-	jmp		LClampReentry0

-LClampHighOrLow0:

-	jg		LClampHigh0

-	xorl	%esi,%esi

-	jmp		LClampReentry0

-LClampHigh1:

-	movl	C(bbextentt),%edx

-	jmp		LClampReentry1

-LClampHighOrLow1:

-	jg		LClampHigh1

-	xorl	%edx,%edx

-	jmp		LClampReentry1

-LClampLow2:

-	movl	$2048,%ebp

-	jmp		LClampReentry2

-LClampHigh2:

-	movl	C(bbextents),%ebp

-	jmp		LClampReentry2

-LClampLow3:

-	movl	$2048,%ecx

-	jmp		LClampReentry3

-LClampHigh3:

-	movl	C(bbextentt),%ecx

-	jmp		LClampReentry3

-LClampLow4:

-	movl	$2048,%eax

-	jmp		LClampReentry4

-LClampHigh4:

-	movl	C(bbextents),%eax

-	jmp		LClampReentry4

-LClampLow5:

-	movl	$2048,%ebx

-	jmp		LClampReentry5

-LClampHigh5:

-	movl	C(bbextentt),%ebx

-	jmp		LClampReentry5

-#define pspans	4+16

-	.align 4

-.globl C(D_DrawSpans8)

-C(D_DrawSpans8):

-	pushl	%ebp				// preserve caller's stack frame

-	pushl	%edi

-	pushl	%esi				// preserve register variables

-	pushl	%ebx

-//

-// set up scaled-by-8 steps, for 8-long segments; also set up cacheblock

-// and span list pointers

-//

-// TODO: any overlap from rearranging?

-	flds	C(d_sdivzstepu)

-	fmuls	fp_8

-	movl	C(cacheblock),%edx

-	flds	C(d_tdivzstepu)

-	fmuls	fp_8

-	movl	pspans(%esp),%ebx	// point to the first span descriptor

-	flds	C(d_zistepu)

-	fmuls	fp_8

-	movl	%edx,pbase			// pbase = cacheblock

-	fstps	zi8stepu

-	fstps	tdivz8stepu

-	fstps	sdivz8stepu

-LSpanLoop:

-//

-// set up the initial s/z, t/z, and 1/z on the FP stack, and generate the

-// initial s and t values

-//

-// FIXME: pipeline FILD?

-	fildl	espan_t_v(%ebx)

-	fildl	espan_t_u(%ebx)

-	fld		%st(1)			// dv | du | dv

-	fmuls	C(d_sdivzstepv)	// dv*d_sdivzstepv | du | dv

-	fld		%st(1)			// du | dv*d_sdivzstepv | du | dv

-	fmuls	C(d_sdivzstepu)	// du*d_sdivzstepu | dv*d_sdivzstepv | du | dv

-	fld		%st(2)			// du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv

-	fmuls	C(d_tdivzstepu)	// du*d_tdivzstepu | du*d_sdivzstepu |

-							//  dv*d_sdivzstepv | du | dv

-	fxch	%st(1)			// du*d_sdivzstepu | du*d_tdivzstepu |

-							//  dv*d_sdivzstepv | du | dv

-	faddp	%st(0),%st(2)	// du*d_tdivzstepu |

-							//  du*d_sdivzstepu + dv*d_sdivzstepv | du | dv

-	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |

-							//  du*d_tdivzstepu | du | dv

-	fld		%st(3)			// dv | du*d_sdivzstepu + dv*d_sdivzstepv |

-							//  du*d_tdivzstepu | du | dv

-	fmuls	C(d_tdivzstepv)	// dv*d_tdivzstepv |

-							//  du*d_sdivzstepu + dv*d_sdivzstepv |

-							//  du*d_tdivzstepu | du | dv

-	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |

-							//  dv*d_tdivzstepv | du*d_tdivzstepu | du | dv

-	fadds	C(d_sdivzorigin)	// sdivz = d_sdivzorigin + dv*d_sdivzstepv +

-							//  du*d_sdivzstepu; stays in %st(2) at end

-	fxch	%st(4)			// dv | dv*d_tdivzstepv | du*d_tdivzstepu | du |

-							//  s/z

-	fmuls	C(d_zistepv)		// dv*d_zistepv | dv*d_tdivzstepv |

-							//  du*d_tdivzstepu | du | s/z

-	fxch	%st(1)			// dv*d_tdivzstepv |  dv*d_zistepv |

-							//  du*d_tdivzstepu | du | s/z

-	faddp	%st(0),%st(2)	// dv*d_zistepv |

-							//  dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z

-	fxch	%st(2)			// du | dv*d_tdivzstepv + du*d_tdivzstepu |

-							//  dv*d_zistepv | s/z

-	fmuls	C(d_zistepu)		// du*d_zistepu |

-							//  dv*d_tdivzstepv + du*d_tdivzstepu |

-							//  dv*d_zistepv | s/z

-	fxch	%st(1)			// dv*d_tdivzstepv + du*d_tdivzstepu |

-							//  du*d_zistepu | dv*d_zistepv | s/z

-	fadds	C(d_tdivzorigin)	// tdivz = d_tdivzorigin + dv*d_tdivzstepv +

-							//  du*d_tdivzstepu; stays in %st(1) at end

-	fxch	%st(2)			// dv*d_zistepv | du*d_zistepu | t/z | s/z

-	faddp	%st(0),%st(1)	// dv*d_zistepv + du*d_zistepu | t/z | s/z

-	flds	fp_64k			// fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z

-	fxch	%st(1)			// dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z

-	fadds	C(d_ziorigin)		// zi = d_ziorigin + dv*d_zistepv +

-							//  du*d_zistepu; stays in %st(0) at end

-							// 1/z | fp_64k | t/z | s/z

-//

-// calculate and clamp s & t

-//

-	fdivr	%st(0),%st(1)	// 1/z | z*64k | t/z | s/z

-//

-// point %edi to the first pixel in the span

-//

-	movl	C(d_viewbuffer),%ecx

-	movl	espan_t_v(%ebx),%eax

-	movl	%ebx,pspantemp	// preserve spans pointer

-	movl	C(tadjust),%edx

-	movl	C(sadjust),%esi

-	movl	C(d_scantable)(,%eax,4),%edi	// v * screenwidth

-	addl	%ecx,%edi

-	movl	espan_t_u(%ebx),%ecx

-	addl	%ecx,%edi				// pdest = &pdestspan[scans->u];

-	movl	espan_t_count(%ebx),%ecx

-//

-// now start the FDIV for the end of the span

-//

-	cmpl	$8,%ecx

-	ja		LSetupNotLast1

-	decl	%ecx

-	jz		LCleanup1		// if only one pixel, no need to start an FDIV

-	movl	%ecx,spancountminus1

-// finish up the s and t calcs

-	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

-	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z

-	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z

-	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z

-	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z

-	fxch	%st(1)			// s | t | 1/z | t/z | s/z

-	fistpl	s				// 1/z | t | t/z | s/z

-	fistpl	t				// 1/z | t/z | s/z

-	fildl	spancountminus1

-	flds	C(d_tdivzstepu)	// C(d_tdivzstepu) | spancountminus1

-	flds	C(d_zistepu)		// C(d_zistepu) | C(d_tdivzstepu) | spancountminus1

-	fmul	%st(2),%st(0)	// C(d_zistepu)*scm1 | C(d_tdivzstepu) | scm1

-	fxch	%st(1)			// C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1

-	fmul	%st(2),%st(0)	// C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1

-	fxch	%st(2)			// scm1 | C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1

-	fmuls	C(d_sdivzstepu)	// C(d_sdivzstepu)*scm1 | C(d_zistepu)*scm1 |

-							//  C(d_tdivzstepu)*scm1

-	fxch	%st(1)			// C(d_zistepu)*scm1 | C(d_sdivzstepu)*scm1 |

-							//  C(d_tdivzstepu)*scm1

-	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1

-	fxch	%st(1)			// C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1

-	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1

-	faddp	%st(0),%st(3)

-	flds	fp_64k

-	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to

-							//  overlap

-	jmp		LFDIVInFlight1

-LCleanup1:

-// finish up the s and t calcs

-	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

-	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z

-	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z

-	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z

-	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z

-	fxch	%st(1)			// s | t | 1/z | t/z | s/z

-	fistpl	s				// 1/z | t | t/z | s/z

-	fistpl	t				// 1/z | t/z | s/z

-	jmp		LFDIVInFlight1

-	.align	4

-LSetupNotLast1:

-// finish up the s and t calcs

-	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

-	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z

-	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z

-	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z

-	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z

-	fxch	%st(1)			// s | t | 1/z | t/z | s/z

-	fistpl	s				// 1/z | t | t/z | s/z

-	fistpl	t				// 1/z | t/z | s/z

-	fadds	zi8stepu

-	fxch	%st(2)

-	fadds	sdivz8stepu

-	fxch	%st(2)

-	flds	tdivz8stepu

-	faddp	%st(0),%st(2)

-	flds	fp_64k

-	fdiv	%st(1),%st(0)	// z = 1/1/z

-							// this is what we've gone to all this trouble to

-							//  overlap

-LFDIVInFlight1:

-	addl	s,%esi

-	addl	t,%edx

-	movl	C(bbextents),%ebx

-	movl	C(bbextentt),%ebp

-	cmpl	%ebx,%esi

-	ja		LClampHighOrLow0

-LClampReentry0:

-	movl	%esi,s

-	movl	pbase,%ebx

-	shll	$16,%esi

-	cmpl	%ebp,%edx

-	movl	%esi,sfracf

-	ja		LClampHighOrLow1

-LClampReentry1:

-	movl	%edx,t

-	movl	s,%esi					// sfrac = scans->sfrac;

-	shll	$16,%edx

-	movl	t,%eax					// tfrac = scans->tfrac;

-	sarl	$16,%esi

-	movl	%edx,tfracf

-//

-// calculate the texture starting address

-//

-	sarl	$16,%eax

-	movl	C(cachewidth),%edx

-	imull	%edx,%eax				// (tfrac >> 16) * cachewidth

-	addl	%ebx,%esi

-	addl	%eax,%esi				// psource = pbase + (sfrac >> 16) +

-									//           ((tfrac >> 16) * cachewidth);

-//

-// determine whether last span or not

-//

-	cmpl	$8,%ecx

-	jna		LLastSegment

-//

-// not the last segment; do full 8-wide segment

-//

-LNotLastSegment:

-//

-// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to

-// get there

-//

-// pick up after the FDIV that was left in flight previously

-	fld		%st(0)			// duplicate it

-	fmul	%st(4),%st(0)	// s = s/z * z

-	fxch	%st(1)

-	fmul	%st(3),%st(0)	// t = t/z * z

-	fxch	%st(1)

-	fistpl	snext

-	fistpl	tnext

-	movl	snext,%eax

-	movl	tnext,%edx

-	movb	(%esi),%bl	// get first source texel

-	subl	$8,%ecx		// count off this segments' pixels

-	movl	C(sadjust),%ebp

-	movl	%ecx,counttemp	// remember count of remaining pixels

-	movl	C(tadjust),%ecx

-	movb	%bl,(%edi)	// store first dest pixel

-	addl	%eax,%ebp

-	addl	%edx,%ecx

-	movl	C(bbextents),%eax

-	movl	C(bbextentt),%edx

-	cmpl	$2048,%ebp

-	jl		LClampLow2

-	cmpl	%eax,%ebp

-	ja		LClampHigh2

-LClampReentry2:

-	cmpl	$2048,%ecx

-	jl		LClampLow3

-	cmpl	%edx,%ecx

-	ja		LClampHigh3

-LClampReentry3:

-	movl	%ebp,snext

-	movl	%ecx,tnext

-	subl	s,%ebp

-	subl	t,%ecx

-//

-// set up advancetable

-//

-	movl	%ecx,%eax

-	movl	%ebp,%edx

-	sarl	$19,%eax			// tstep >>= 16;

-	jz		LZero

-	sarl	$19,%edx			// sstep >>= 16;

-	movl	C(cachewidth),%ebx

-	imull	%ebx,%eax

-	jmp		LSetUp1

-LZero:

-	sarl	$19,%edx			// sstep >>= 16;

-	movl	C(cachewidth),%ebx

-LSetUp1:

-	addl	%edx,%eax			// add in sstep

-								// (tstep >> 16) * cachewidth + (sstep >> 16);

-	movl	tfracf,%edx

-	movl	%eax,advancetable+4	// advance base in t

-	addl	%ebx,%eax			// ((tstep >> 16) + 1) * cachewidth +

-								//  (sstep >> 16);

-	shll	$13,%ebp			// left-justify sstep fractional part

-	movl	sfracf,%ebx

-	shll	$13,%ecx			// left-justify tstep fractional part

-	movl	%eax,advancetable	// advance extra in t

-	movl	%ecx,tstep

-	addl	%ecx,%edx			// advance tfrac fractional part by tstep frac

-	sbbl	%ecx,%ecx			// turn tstep carry into -1 (0 if none)

-	addl	%ebp,%ebx			// advance sfrac fractional part by sstep frac

-	adcl	advancetable+4(,%ecx,4),%esi	// point to next source texel

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx

-	movb	(%esi),%al

-	addl	%ebp,%ebx

-	movb	%al,1(%edi)

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx

-	movb	%al,2(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx

-	movb	%al,3(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi

-//

-// start FDIV for end of next segment in flight, so it can overlap

-//

-	movl	counttemp,%ecx

-	cmpl	$8,%ecx			// more than one segment after this?

-	ja		LSetupNotLast2	// yes

-	decl	%ecx

-	jz		LFDIVInFlight2	// if only one pixel, no need to start an FDIV

-	movl	%ecx,spancountminus1

-	fildl	spancountminus1

-	flds	C(d_zistepu)		// C(d_zistepu) | spancountminus1

-	fmul	%st(1),%st(0)	// C(d_zistepu)*scm1 | scm1

-	flds	C(d_tdivzstepu)	// C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1

-	fmul	%st(2),%st(0)	// C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1

-	fxch	%st(1)			// C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 | scm1

-	faddp	%st(0),%st(3)	// C(d_tdivzstepu)*scm1 | scm1

-	fxch	%st(1)			// scm1 | C(d_tdivzstepu)*scm1

-	fmuls	C(d_sdivzstepu)	// C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1

-	fxch	%st(1)			// C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1

-	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1

-	flds	fp_64k			// 64k | C(d_sdivzstepu)*scm1

-	fxch	%st(1)			// C(d_sdivzstepu)*scm1 | 64k

-	faddp	%st(0),%st(4)	// 64k

-	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to

-							//  overlap

-	jmp		LFDIVInFlight2

-	.align	4

-LSetupNotLast2:

-	fadds	zi8stepu

-	fxch	%st(2)

-	fadds	sdivz8stepu

-	fxch	%st(2)

-	flds	tdivz8stepu

-	faddp	%st(0),%st(2)

-	flds	fp_64k

-	fdiv	%st(1),%st(0)	// z = 1/1/z

-							// this is what we've gone to all this trouble to

-							//  overlap

-LFDIVInFlight2:

-	movl	%ecx,counttemp

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx

-	movb	%al,4(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx

-	movb	%al,5(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx

-	movb	%al,6(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	$8,%edi

-	movl	%edx,tfracf

-	movl	snext,%edx

-	movl	%ebx,sfracf

-	movl	tnext,%ebx

-	movl	%edx,s

-	movl	%ebx,t

-	movl	counttemp,%ecx		// retrieve count

-//

-// determine whether last span or not

-//

-	cmpl	$8,%ecx				// are there multiple segments remaining?

-	movb	%al,-1(%edi)

-	ja		LNotLastSegment		// yes

-//

-// last segment of scan

-//

-LLastSegment:

-//

-// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to

-// get there. The number of pixels left is variable, and we want to land on the

-// last pixel, not step one past it, so we can't run into arithmetic problems

-//

-	testl	%ecx,%ecx

-	jz		LNoSteps		// just draw the last pixel and we're done

-// pick up after the FDIV that was left in flight previously

-	fld		%st(0)			// duplicate it

-	fmul	%st(4),%st(0)	// s = s/z * z

-	fxch	%st(1)

-	fmul	%st(3),%st(0)	// t = t/z * z

-	fxch	%st(1)

-	fistpl	snext

-	fistpl	tnext

-	movb	(%esi),%al		// load first texel in segment

-	movl	C(tadjust),%ebx

-	movb	%al,(%edi)		// store first pixel in segment

-	movl	C(sadjust),%eax

-	addl	snext,%eax

-	addl	tnext,%ebx

-	movl	C(bbextents),%ebp

-	movl	C(bbextentt),%edx

-	cmpl	$2048,%eax

-	jl		LClampLow4

-	cmpl	%ebp,%eax

-	ja		LClampHigh4

-LClampReentry4:

-	movl	%eax,snext

-	cmpl	$2048,%ebx

-	jl		LClampLow5

-	cmpl	%edx,%ebx

-	ja		LClampHigh5

-LClampReentry5:

-	cmpl	$1,%ecx			// don't bother

-	je		LOnlyOneStep	// if two pixels in segment, there's only one step,

-							//  of the segment length

-	subl	s,%eax

-	subl	t,%ebx

-	addl	%eax,%eax		// convert to 15.17 format so multiply by 1.31

-	addl	%ebx,%ebx		//  reciprocal yields 16.48

-	imull	reciprocal_table-8(,%ecx,4) // sstep = (snext - s) / (spancount-1)

-	movl	%edx,%ebp

-	movl	%ebx,%eax

-	imull	reciprocal_table-8(,%ecx,4) // tstep = (tnext - t) / (spancount-1)

-LSetEntryvec:

-//

-// set up advancetable

-//

-	movl	entryvec_table(,%ecx,4),%ebx

-	movl	%edx,%eax

-	movl	%ebx,jumptemp		// entry point into code for RET later

-	movl	%ebp,%ecx

-	sarl	$16,%edx			// tstep >>= 16;

-	movl	C(cachewidth),%ebx

-	sarl	$16,%ecx			// sstep >>= 16;

-	imull	%ebx,%edx

-	addl	%ecx,%edx			// add in sstep

-								// (tstep >> 16) * cachewidth + (sstep >> 16);

-	movl	tfracf,%ecx

-	movl	%edx,advancetable+4	// advance base in t

-	addl	%ebx,%edx			// ((tstep >> 16) + 1) * cachewidth +

-								//  (sstep >> 16);

-	shll	$16,%ebp			// left-justify sstep fractional part

-	movl	sfracf,%ebx

-	shll	$16,%eax			// left-justify tstep fractional part

-	movl	%edx,advancetable	// advance extra in t

-	movl	%eax,tstep

-	movl	%ecx,%edx

-	addl	%eax,%edx

-	sbbl	%ecx,%ecx

-	addl	%ebp,%ebx

-	adcl	advancetable+4(,%ecx,4),%esi

-	jmp		*jumptemp			// jump to the number-of-pixels handler

-//----------------------------------------

-LNoSteps:

-	movb	(%esi),%al		// load first texel in segment

-	subl	$7,%edi			// adjust for hardwired offset

-	jmp		LEndSpan

-LOnlyOneStep:

-	subl	s,%eax

-	subl	t,%ebx

-	movl	%eax,%ebp

-	movl	%ebx,%edx

-	jmp		LSetEntryvec

-//----------------------------------------

-.globl	Entry2_8

-Entry2_8:

-	subl	$6,%edi		// adjust for hardwired offsets

-	movb	(%esi),%al

-	jmp		LLEntry2_8

-//----------------------------------------

-.globl	Entry3_8

-Entry3_8:

-	subl	$5,%edi		// adjust for hardwired offsets

-	addl	%eax,%edx

-	movb	(%esi),%al

-	sbbl	%ecx,%ecx

-	addl	%ebp,%ebx

-	adcl	advancetable+4(,%ecx,4),%esi

-	jmp		LLEntry3_8

-//----------------------------------------

-.globl	Entry4_8

-Entry4_8:

-	subl	$4,%edi		// adjust for hardwired offsets

-	addl	%eax,%edx

-	movb	(%esi),%al

-	sbbl	%ecx,%ecx

-	addl	%ebp,%ebx

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	jmp		LLEntry4_8

-//----------------------------------------

-.globl	Entry5_8

-Entry5_8:

-	subl	$3,%edi		// adjust for hardwired offsets

-	addl	%eax,%edx

-	movb	(%esi),%al

-	sbbl	%ecx,%ecx

-	addl	%ebp,%ebx

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	jmp		LLEntry5_8

-//----------------------------------------

-.globl	Entry6_8

-Entry6_8:

-	subl	$2,%edi		// adjust for hardwired offsets

-	addl	%eax,%edx

-	movb	(%esi),%al

-	sbbl	%ecx,%ecx

-	addl	%ebp,%ebx

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	jmp		LLEntry6_8

-//----------------------------------------

-.globl	Entry7_8

-Entry7_8:

-	decl	%edi		// adjust for hardwired offsets

-	addl	%eax,%edx

-	movb	(%esi),%al

-	sbbl	%ecx,%ecx

-	addl	%ebp,%ebx

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	jmp		LLEntry7_8

-//----------------------------------------

-.globl	Entry8_8

-Entry8_8:

-	addl	%eax,%edx

-	movb	(%esi),%al

-	sbbl	%ecx,%ecx

-	addl	%ebp,%ebx

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx

-	movb	%al,1(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-LLEntry7_8:

-	sbbl	%ecx,%ecx

-	movb	%al,2(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-LLEntry6_8:

-	sbbl	%ecx,%ecx

-	movb	%al,3(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-LLEntry5_8:

-	sbbl	%ecx,%ecx

-	movb	%al,4(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-LLEntry4_8:

-	sbbl	%ecx,%ecx

-	movb	%al,5(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi

-LLEntry3_8:

-	movb	%al,6(%edi)

-	movb	(%esi),%al

-LLEntry2_8:

-LEndSpan:

-//

-// clear s/z, t/z, 1/z from FP stack

-//

-	fstp %st(0)

-	fstp %st(0)

-	fstp %st(0)

-	movl	pspantemp,%ebx				// restore spans pointer

-	movl	espan_t_pnext(%ebx),%ebx	// point to next span

-	testl	%ebx,%ebx			// any more spans?

-	movb	%al,7(%edi)

-	jnz		LSpanLoop			// more spans

-	popl	%ebx				// restore register variables

-	popl	%esi

-	popl	%edi

-	popl	%ebp				// restore the caller's stack frame

-	ret

-//----------------------------------------------------------------------

-// 8-bpp horizontal span z drawing code for polygons, with no transparency.

-//

-// Assumes there is at least one span in pzspans, and that every span

-// contains at least one pixel

-//----------------------------------------------------------------------

-	.text

-// z-clamp on a non-negative gradient span; entered from LFSpanLoop when the span's starting 1/z compares above float_point5 (z nearer than 2)

-LClamp:

-	movl	$0x40000000,%edx	// %edx = izi, forced to the clamp value (presumably 0.5 scaled by 2^31 -- confirm against Float2ToThe31nd)

-	xorl	%ebx,%ebx	// %ebx = izistep = 0, so the clamped span is written with a constant z; NOTE(review): nothing visible between LFSpanLoop and LFSpanDone reloads %ebx, so later spans in the list appear to keep a zero step -- confirm

-	fstp	%st(0)	// discard the 1/z value still on the FP stack (the fcoms in LFSpanLoop does not pop; the unclamped path pops it with fistpl)

-	jmp		LZDraw	// rejoin the common draw code with %ecx = count and %edi = pdest already set up

-// z-clamp on a negative gradient span; entered from LFNegSpanLoop when the span's starting 1/z compares above float_point5 (z nearer than 2)

-LClampNeg:

-	movl	$0x40000000,%edx	// %edx = izi, forced to the clamp value (presumably 0.5 scaled by 2^31 -- confirm against Float2ToThe31nd)

-	xorl	%ebx,%ebx	// %ebx = izistep = 0, so the clamped span is written with a constant z; NOTE(review): nothing visible between LFNegSpanLoop and LFNegSpanDone reloads %ebx, so later spans in the list appear to keep a zero step -- confirm

-	fstp	%st(0)	// discard the 1/z value still on the FP stack (the fcoms in LFNegSpanLoop does not pop; the unclamped path pops it with fistpl)

-	jmp		LZDrawNeg	// rejoin the common draw code with %ecx = count and %edi = pdest already set up

-#define pzspans	4+16

-.globl C(D_DrawZSpans)

-C(D_DrawZSpans):

-	pushl	%ebp				// preserve caller's stack frame

-	pushl	%edi

-	pushl	%esi				// preserve register variables

-	pushl	%ebx

-	flds	C(d_zistepu)

-	movl	C(d_zistepu),%eax

-	movl	pzspans(%esp),%esi

-	testl	%eax,%eax

-	jz		LFNegSpan

-	fmuls	Float2ToThe31nd

-	fistpl	izistep		// note: we are relying on FP exceptions being turned

-						// off here to avoid range problems

-	movl	izistep,%ebx	// remains loaded for all spans

-LFSpanLoop:

-// set up the initial 1/z value

-	fildl	espan_t_v(%esi)

-	fildl	espan_t_u(%esi)

-	movl	espan_t_v(%esi),%ecx

-	movl	C(d_pzbuffer),%edi

-	fmuls	C(d_zistepu)

-	fxch	%st(1)

-	fmuls	C(d_zistepv)

-	fxch	%st(1)

-	fadds	C(d_ziorigin)

-	imull	C(d_zrowbytes),%ecx

-	faddp	%st(0),%st(1)

-// clamp if z is nearer than 2 (1/z > 0.5)

-	fcoms	float_point5

-	addl	%ecx,%edi

-	movl	espan_t_u(%esi),%edx

-	addl	%edx,%edx				// word count

-	movl	espan_t_count(%esi),%ecx

-	addl	%edx,%edi				// pdest = &pdestspan[scans->u];

-	pushl	%esi		// preserve spans pointer

-	fnstsw	%ax

-	testb	$0x45,%ah

-	jz		LClamp

-	fmuls	Float2ToThe31nd

-	fistpl	izi			// note: we are relying on FP exceptions being turned

-						// off here to avoid problems when the span is closer

-						// than 1/(2**31)

-	movl	izi,%edx

-// at this point:

-// %ebx = izistep

-// %ecx = count

-// %edx = izi

-// %edi = pdest

-LZDraw:

-// do a single pixel up front, if necessary to dword align the destination

-	testl	$2,%edi

-	jz		LFMiddle

-	movl	%edx,%eax

-	addl	%ebx,%edx

-	shrl	$16,%eax

-	decl	%ecx

-	movw	%ax,(%edi)

-	addl	$2,%edi

-// do the middle of the span, a pair of aligned dwords at a time

-LFMiddle:

-	pushl	%ecx

-	shrl	$1,%ecx				// count / 2

-	jz		LFLast				// no aligned dwords to do

-	shrl	$1,%ecx				// (count / 2) / 2

-	jnc		LFMiddleLoop		// even number of aligned dwords to do

-	movl	%edx,%eax

-	addl	%ebx,%edx

-	shrl	$16,%eax

-	movl	%edx,%esi

-	addl	%ebx,%edx

-	andl	$0xFFFF0000,%esi

-	orl		%esi,%eax

-	movl	%eax,(%edi)

-	addl	$4,%edi

-	andl	%ecx,%ecx

-	jz		LFLast

-LFMiddleLoop:

-	movl	%edx,%eax

-	addl	%ebx,%edx

-	shrl	$16,%eax

-	movl	%edx,%esi

-	addl	%ebx,%edx

-	andl	$0xFFFF0000,%esi

-	orl		%esi,%eax

-	movl	%edx,%ebp

-	movl	%eax,(%edi)

-	addl	%ebx,%edx

-	shrl	$16,%ebp

-	movl	%edx,%esi

-	addl	%ebx,%edx

-	andl	$0xFFFF0000,%esi

-	orl		%esi,%ebp

-	movl	%ebp,4(%edi)	// FIXME: eliminate register contention

-	addl	$8,%edi

-	decl	%ecx

-	jnz		LFMiddleLoop

-LFLast:

-	popl	%ecx			// retrieve count

-	popl	%esi			// retrieve span pointer

-// do the last, unaligned pixel, if there is one

-	andl	$1,%ecx			// is there an odd pixel left to do?

-	jz		LFSpanDone		// no

-	shrl	$16,%edx

-	movw	%dx,(%edi)		// do the final pixel's z

-LFSpanDone:

-	movl	espan_t_pnext(%esi),%esi

-	testl	%esi,%esi

-	jnz		LFSpanLoop

-	jmp		LFDone

-LFNegSpan:

-	fmuls	FloatMinus2ToThe31nd

-	fistpl	izistep		// note: we are relying on FP exceptions being turned

-						// off here to avoid range problems

-	movl	izistep,%ebx	// remains loaded for all spans

-LFNegSpanLoop:

-// set up the initial 1/z value

-	fildl	espan_t_v(%esi)

-	fildl	espan_t_u(%esi)

-	movl	espan_t_v(%esi),%ecx

-	movl	C(d_pzbuffer),%edi

-	fmuls	C(d_zistepu)

-	fxch	%st(1)

-	fmuls	C(d_zistepv)

-	fxch	%st(1)

-	fadds	C(d_ziorigin)

-	imull	C(d_zrowbytes),%ecx

-	faddp	%st(0),%st(1)

-// clamp if z is nearer than 2 (1/z > 0.5)

-	fcoms	float_point5

-	addl	%ecx,%edi

-	movl	espan_t_u(%esi),%edx

-	addl	%edx,%edx				// word count

-	movl	espan_t_count(%esi),%ecx

-	addl	%edx,%edi				// pdest = &pdestspan[scans->u];

-	pushl	%esi		// preserve spans pointer

-	fnstsw	%ax

-	testb	$0x45,%ah

-	jz		LClampNeg

-	fmuls	Float2ToThe31nd

-	fistpl	izi			// note: we are relying on FP exceptions being turned

-						// off here to avoid problems when the span is closer

-						// than 1/(2**31)

-	movl	izi,%edx

-// at this point:

-// %ebx = izistep

-// %ecx = count

-// %edx = izi

-// %edi = pdest

-LZDrawNeg:

-// do a single pixel up front, if necessary to dword align the destination

-	testl	$2,%edi

-	jz		LFNegMiddle

-	movl	%edx,%eax

-	subl	%ebx,%edx

-	shrl	$16,%eax

-	decl	%ecx

-	movw	%ax,(%edi)

-	addl	$2,%edi

-// do the middle of the span, a pair of aligned dwords at a time

-LFNegMiddle:

-	pushl	%ecx

-	shrl	$1,%ecx				// count / 2

-	jz		LFNegLast			// no aligned dwords to do

-	shrl	$1,%ecx				// (count / 2) / 2

-	jnc		LFNegMiddleLoop		// even number of aligned dwords to do

-	movl	%edx,%eax

-	subl	%ebx,%edx

-	shrl	$16,%eax

-	movl	%edx,%esi

-	subl	%ebx,%edx

-	andl	$0xFFFF0000,%esi

-	orl		%esi,%eax

-	movl	%eax,(%edi)

-	addl	$4,%edi

-	andl	%ecx,%ecx

-	jz		LFNegLast

-LFNegMiddleLoop:

-	movl	%edx,%eax

-	subl	%ebx,%edx

-	shrl	$16,%eax

-	movl	%edx,%esi

-	subl	%ebx,%edx

-	andl	$0xFFFF0000,%esi

-	orl		%esi,%eax

-	movl	%edx,%ebp

-	movl	%eax,(%edi)

-	subl	%ebx,%edx

-	shrl	$16,%ebp

-	movl	%edx,%esi

-	subl	%ebx,%edx

-	andl	$0xFFFF0000,%esi

-	orl		%esi,%ebp

-	movl	%ebp,4(%edi)	// FIXME: eliminate register contention

-	addl	$8,%edi

-	decl	%ecx

-	jnz		LFNegMiddleLoop

-LFNegLast:

-	popl	%ecx			// retrieve count

-	popl	%esi			// retrieve span pointer

-// do the last, unaligned pixel, if there is one

-	andl	$1,%ecx			// is there an odd pixel left to do?

-	jz		LFNegSpanDone	// no

-	shrl	$16,%edx

-	movw	%dx,(%edi)		// do the final pixel's z

-LFNegSpanDone:

-	movl	espan_t_pnext(%esi),%esi

-	testl	%esi,%esi

-	jnz		LFNegSpanLoop

-LFDone:

-	popl	%ebx				// restore register variables

-	popl	%esi

-	popl	%edi

-	popl	%ebp				// restore the caller's stack frame

-	ret

-#endif	// id386

--- a/d_draw16.s

+++ /dev/null

@@ -1,955 +1,0 @@

-//

-// d_draw16.s

-// x86 assembly-language horizontal 8-bpp span-drawing code, with 16-pixel

-// subdivision.

-//

-#include "asm_i386.h"

-#include "quakeasm.h"

-#include "asm_draw.h"

-#include "d_ifacea.h"

-#ifdef	id386

-//----------------------------------------------------------------------

-// 8-bpp horizontal span drawing code for polygons, with no transparency and

-// 16-pixel subdivision.

-//

-// Assumes there is at least one span in pspans, and that every span

-// contains at least one pixel

-//----------------------------------------------------------------------

-	.data

-	.text

-// out-of-line, rarely-needed clamping code; each stub jumps back to its LClampReentryN

-LClampHigh0:

-	movl	C(bbextents),%esi	// s: clamp to bbextents

-	jmp		LClampReentry0

-LClampHighOrLow0:

-	jg		LClampHigh0	// flags are still those of the cmpl preceding the ja that got here

-	xorl	%esi,%esi	// unsigned-above but not signed-greater: clamp s to 0

-	jmp		LClampReentry0

-LClampHigh1:

-	movl	C(bbextentt),%edx	// t: clamp to bbextentt

-	jmp		LClampReentry1

-LClampHighOrLow1:

-	jg		LClampHigh1	// same high/low split as LClampHighOrLow0, for t

-	xorl	%edx,%edx	// clamp t to 0

-	jmp		LClampReentry1

-LClampLow2:

-	movl	$4096,%ebp	// segment-end s (in %ebp) below 4096: clamp to 4096

-	jmp		LClampReentry2

-LClampHigh2:

-	movl	C(bbextents),%ebp	// segment-end s above bbextents: clamp to bbextents

-	jmp		LClampReentry2

-LClampLow3:

-	movl	$4096,%ecx	// segment-end t (in %ecx) below 4096: clamp to 4096

-	jmp		LClampReentry3

-LClampHigh3:

-	movl	C(bbextentt),%ecx	// segment-end t above bbextentt: clamp to bbextentt

-	jmp		LClampReentry3

-LClampLow4:

-	movl	$4096,%eax	// last-segment end s (in %eax): clamp to 4096

-	jmp		LClampReentry4

-LClampHigh4:

-	movl	C(bbextents),%eax	// last-segment end s: clamp to bbextents

-	jmp		LClampReentry4

-LClampLow5:

-	movl	$4096,%ebx	// last-segment end t (in %ebx): clamp to 4096

-	jmp		LClampReentry5

-LClampHigh5:

-	movl	C(bbextentt),%ebx	// last-segment end t: clamp to bbextentt

-	jmp		LClampReentry5

-#define pspans	4+16	// %esp offset of the pspans argument after the four pushl's below (return address + 16)

-	.align 4

-.globl C(D_DrawSpans16)

-C(D_DrawSpans16):

-	pushl	%ebp				// preserve caller's stack frame

-	pushl	%edi

-	pushl	%esi				// preserve register variables

-	pushl	%ebx

-//

-// set up scaled-by-16 steps, for 16-long segments; also set up cacheblock

-// and span list pointers

-//

-// TODO: any overlap from rearranging?

-	flds	C(d_sdivzstepu)	// d_sdivzstepu

-	fmuls	fp_16			// d_sdivzstepu*fp_16

-	movl	C(cacheblock),%edx

-	flds	C(d_tdivzstepu)	// d_tdivzstepu | sdivz16

-	fmuls	fp_16			// tdivz16 | sdivz16

-	movl	pspans(%esp),%ebx	// point to the first span descriptor

-	flds	C(d_zistepu)	// d_zistepu | tdivz16 | sdivz16

-	fmuls	fp_16			// zi16 | tdivz16 | sdivz16

-	movl	%edx,pbase			// pbase = cacheblock

-	fstps	zi16stepu		// pop the three scaled steps in reverse push order

-	fstps	tdivz16stepu

-	fstps	sdivz16stepu	// FP stack is empty again here

-LSpanLoop:

-//

-// per-span entry (%ebx = span descriptor): set up the initial s/z, t/z, and

-// 1/z on the FP stack, and generate the initial s and t values

-//

-// FIXME: pipeline FILD?

-	fildl	espan_t_v(%ebx)	// dv

-	fildl	espan_t_u(%ebx)	// du | dv

-	fld		%st(1)			// dv | du | dv

-	fmuls	C(d_sdivzstepv)	// dv*d_sdivzstepv | du | dv

-	fld		%st(1)			// du | dv*d_sdivzstepv | du | dv

-	fmuls	C(d_sdivzstepu)	// du*d_sdivzstepu | dv*d_sdivzstepv | du | dv

-	fld		%st(2)			// du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv

-	fmuls	C(d_tdivzstepu)	// du*d_tdivzstepu | du*d_sdivzstepu |

-							//  dv*d_sdivzstepv | du | dv

-	fxch	%st(1)			// du*d_sdivzstepu | du*d_tdivzstepu |

-							//  dv*d_sdivzstepv | du | dv

-	faddp	%st(0),%st(2)	// du*d_tdivzstepu |

-							//  du*d_sdivzstepu + dv*d_sdivzstepv | du | dv

-	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |

-							//  du*d_tdivzstepu | du | dv

-	fld		%st(3)			// dv | du*d_sdivzstepu + dv*d_sdivzstepv |

-							//  du*d_tdivzstepu | du | dv

-	fmuls	C(d_tdivzstepv)	// dv*d_tdivzstepv |

-							//  du*d_sdivzstepu + dv*d_sdivzstepv |

-							//  du*d_tdivzstepu | du | dv

-	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |

-							//  dv*d_tdivzstepv | du*d_tdivzstepu | du | dv

-	fadds	C(d_sdivzorigin)	// sdivz = d_sdivzorigin + dv*d_sdivzstepv +

-							//  du*d_sdivzstepu; ends up in %st(3) at end

-	fxch	%st(4)			// dv | dv*d_tdivzstepv | du*d_tdivzstepu | du |

-							//  s/z

-	fmuls	C(d_zistepv)		// dv*d_zistepv | dv*d_tdivzstepv |

-							//  du*d_tdivzstepu | du | s/z

-	fxch	%st(1)			// dv*d_tdivzstepv | dv*d_zistepv |

-							//  du*d_tdivzstepu | du | s/z

-	faddp	%st(0),%st(2)	// dv*d_zistepv |

-							//  dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z

-	fxch	%st(2)			// du | dv*d_tdivzstepv + du*d_tdivzstepu |

-							//  dv*d_zistepv | s/z

-	fmuls	C(d_zistepu)		// du*d_zistepu |

-							//  dv*d_tdivzstepv + du*d_tdivzstepu |

-							//  dv*d_zistepv | s/z

-	fxch	%st(1)			// dv*d_tdivzstepv + du*d_tdivzstepu |

-							//  du*d_zistepu | dv*d_zistepv | s/z

-	fadds	C(d_tdivzorigin)	// tdivz = d_tdivzorigin + dv*d_tdivzstepv +

-							//  du*d_tdivzstepu; ends up in %st(2) at end

-	fxch	%st(2)			// dv*d_zistepv | du*d_zistepu | t/z | s/z

-	faddp	%st(0),%st(1)	// dv*d_zistepv + du*d_zistepu | t/z | s/z

-	flds	fp_64k			// fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z

-	fxch	%st(1)			// dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z

-	fadds	C(d_ziorigin)		// zi = d_ziorigin + dv*d_zistepv +

-							//  du*d_zistepu; stays in %st(0) at end

-							// 1/z | fp_64k | t/z | s/z

-//

-// calculate and clamp s & t

-//

-	fdivr	%st(0),%st(1)	// 1/z | z*64k | t/z | s/z

-//

-// point %edi to the first pixel in the span

-//

-	movl	C(d_viewbuffer),%ecx

-	movl	espan_t_v(%ebx),%eax

-	movl	%ebx,pspantemp	// preserve spans pointer

-	movl	C(tadjust),%edx	// %edx = tadjust; t is added to it at LFDIVInFlight1

-	movl	C(sadjust),%esi	// %esi = sadjust; s is added to it at LFDIVInFlight1

-	movl	C(d_scantable)(,%eax,4),%edi	// v * screenwidth

-	addl	%ecx,%edi

-	movl	espan_t_u(%ebx),%ecx

-	addl	%ecx,%edi				// pdest = &pdestspan[scans->u];

-	movl	espan_t_count(%ebx),%ecx	// %ecx = pixel count of this span

-//

-// now start the FDIV for the end of the span

-//

-	cmpl	$16,%ecx

-	ja		LSetupNotLast1	// more than 16 pixels: first segment is a full one

-	decl	%ecx			// %ecx = count - 1 = number of steps in the only segment

-	jz		LCleanup1		// if only one pixel, no need to start an FDIV

-	movl	%ecx,spancountminus1

-// finish up the s and t calcs

-	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

-	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z

-	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z

-	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z

-	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z

-	fxch	%st(1)			// s | t | 1/z | t/z | s/z

-	fistpl	s				// 1/z | t | t/z | s/z

-	fistpl	t				// 1/z | t/z | s/z

-	fildl	spancountminus1	// spancountminus1 | 1/z | t/z | s/z (tail omitted below)

-	flds	C(d_tdivzstepu)	// C(d_tdivzstepu) | spancountminus1

-	flds	C(d_zistepu)		// C(d_zistepu) | C(d_tdivzstepu) | spancountminus1

-	fmul	%st(2),%st(0)	// C(d_zistepu)*scm1 | C(d_tdivzstepu) | scm1

-	fxch	%st(1)			// C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1

-	fmul	%st(2),%st(0)	// C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1

-	fxch	%st(2)			// scm1 | C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1

-	fmuls	C(d_sdivzstepu)	// C(d_sdivzstepu)*scm1 | C(d_zistepu)*scm1 |

-							//  C(d_tdivzstepu)*scm1

-	fxch	%st(1)			// C(d_zistepu)*scm1 | C(d_sdivzstepu)*scm1 |

-							//  C(d_tdivzstepu)*scm1

-	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1

-	fxch	%st(1)			// C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1

-	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1

-	faddp	%st(0),%st(3)	// 1/z | t/z | s/z, each advanced by scm1 steps

-	flds	fp_64k			// fp_64k | 1/z | t/z | s/z

-	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to

-							//  overlap

-	jmp		LFDIVInFlight1

-LCleanup1:

-// one-pixel span: finish up the s and t calcs; no end-of-span FDIV is started

-	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

-	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z

-	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z

-	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z

-	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z

-	fxch	%st(1)			// s | t | 1/z | t/z | s/z

-	fistpl	s				// 1/z | t | t/z | s/z

-	fistpl	t				// 1/z | t/z | s/z

-	jmp		LFDIVInFlight1	// %ecx is 0 here (the decl that branched to us)

-	.align	4

-LSetupNotLast1:

-// finish up the s and t calcs

-	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

-	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z

-	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z

-	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z

-	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z

-	fxch	%st(1)			// s | t | 1/z | t/z | s/z

-	fistpl	s				// 1/z | t | t/z | s/z

-	fistpl	t				// 1/z | t/z | s/z

-	fadds	zi16stepu		// 1/z + zi16stepu | t/z | s/z

-	fxch	%st(2)			// s/z | t/z | 1/z

-	fadds	sdivz16stepu	// s/z + sdivz16stepu | t/z | 1/z

-	fxch	%st(2)			// 1/z | t/z | s/z

-	flds	tdivz16stepu	// tdivz16stepu | 1/z | t/z | s/z

-	faddp	%st(0),%st(2)	// 1/z | t/z + tdivz16stepu | s/z

-	flds	fp_64k			// fp_64k | 1/z | t/z | s/z

-	fdiv	%st(1),%st(0)	// z = 1/(1/z), scaled by fp_64k

-							// this is what we've gone to all this trouble to

-							//  overlap

-LFDIVInFlight1:

-	addl	s,%esi			// %esi = sadjust + s

-	addl	t,%edx			// %edx = tadjust + t

-	movl	C(bbextents),%ebx

-	movl	C(bbextentt),%ebp

-	cmpl	%ebx,%esi

-	ja		LClampHighOrLow0	// unsigned compare: taken for s > bbextents and for negative s

-LClampReentry0:

-	movl	%esi,s			// save the clamped, adjusted s

-	movl	pbase,%ebx

-	shll	$16,%esi		// low 16 bits of s moved to the top of the register

-	cmpl	%ebp,%edx

-	movl	%esi,sfracf		// (movl leaves the cmpl flags intact for the ja)

-	ja		LClampHighOrLow1

-LClampReentry1:

-	movl	%edx,t			// save the clamped, adjusted t

-	movl	s,%esi					// reload full s

-	shll	$16,%edx		// low 16 bits of t moved to the top of the register

-	movl	t,%eax					// reload full t

-	sarl	$16,%esi		// s >> 16

-	movl	%edx,tfracf

-//

-// calculate the texture starting address

-//

-	sarl	$16,%eax		// t >> 16

-	movl	C(cachewidth),%edx

-	imull	%edx,%eax				// (t >> 16) * cachewidth

-	addl	%ebx,%esi

-	addl	%eax,%esi				// psource = pbase + (s >> 16) +

-									//           ((t >> 16) * cachewidth);

-//

-// determine whether last segment or not

-//

-	cmpl	$16,%ecx		// %ecx is count if count > 16, else count - 1

-	jna		LLastSegment

-//

-// not the last segment; do full 16-wide segment

-//

-LNotLastSegment:

-//

-// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to

-// get there

-//

-// pick up after the FDIV that was left in flight previously

-	fld		%st(0)			// duplicate it

-	fmul	%st(4),%st(0)	// s = s/z * z

-	fxch	%st(1)

-	fmul	%st(3),%st(0)	// t = t/z * z

-	fxch	%st(1)

-	fistpl	snext

-	fistpl	tnext			// FP stack is back to 1/z | t/z | s/z

-	movl	snext,%eax

-	movl	tnext,%edx

-	movb	(%esi),%bl	// get first source texel

-	subl	$16,%ecx		// count off this segment's pixels

-	movl	C(sadjust),%ebp

-	movl	%ecx,counttemp	// remember count of remaining pixels

-	movl	C(tadjust),%ecx

-	movb	%bl,(%edi)	// store first dest pixel

-	addl	%eax,%ebp	// %ebp = sadjust + snext

-	addl	%edx,%ecx	// %ecx = tadjust + tnext

-	movl	C(bbextents),%eax

-	movl	C(bbextentt),%edx

-	cmpl	$4096,%ebp

-	jl		LClampLow2	// signed: below 4096 (including negative)

-	cmpl	%eax,%ebp

-	ja		LClampHigh2	// above bbextents

-LClampReentry2:

-	cmpl	$4096,%ecx

-	jl		LClampLow3

-	cmpl	%edx,%ecx

-	ja		LClampHigh3	// above bbextentt

-LClampReentry3:

-	movl	%ebp,snext	// save clamped segment-end s and t

-	movl	%ecx,tnext

-	subl	s,%ebp		// %ebp = snext - s (s delta across the segment)

-	subl	t,%ecx		// %ecx = tnext - t (t delta across the segment)

-//

-// set up advancetable

-//

-	movl	%ecx,%eax

-	movl	%ebp,%edx

-	sarl	$20,%eax			// tstep = tdelta >> 20 (>>16, plus >>4 for the 16-pixel segment)

-	jz		LZero				// whole-texel t step is 0: skip the imull

-	sarl	$20,%edx			// sstep = sdelta >> 20

-	movl	C(cachewidth),%ebx

-	imull	%ebx,%eax			// tstep * cachewidth

-	jmp		LSetUp1

-LZero:

-	sarl	$20,%edx			// sstep = sdelta >> 20

-	movl	C(cachewidth),%ebx

-LSetUp1:

-	addl	%edx,%eax			// add in sstep

-								// whole tstep * cachewidth + whole sstep

-	movl	tfracf,%edx

-	movl	%eax,advancetable+4	// advance base in t

-	addl	%ebx,%eax			// (whole tstep + 1) * cachewidth +

-								//  whole sstep

-	shll	$12,%ebp			// left-justify sstep fractional part

-	movl	sfracf,%ebx

-	shll	$12,%ecx			// left-justify tstep fractional part

-	movl	%eax,advancetable	// advance extra in t

-	movl	%ecx,tstep

-	addl	%ecx,%edx			// advance tfrac fractional part by tstep frac

-	sbbl	%ecx,%ecx			// turn tstep carry into -1 (0 if none)

-	addl	%ebp,%ebx			// advance sfrac fractional part by sstep frac

-	adcl	advancetable+4(,%ecx,4),%esi	// point to next source texel

-										// (index -1 selects advancetable, 0 selects

-										//  advancetable+4; adc adds the sfrac carry)

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx

-	movb	(%esi),%al		// texel for pixel 1

-	addl	%ebp,%ebx

-	movb	%al,1(%edi)

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx

-	addl	%ebp,%ebx

-	movb	(%esi),%al		// texel for pixel 2

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx

-	movb	%al,2(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al		// texel for pixel 3

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx

-	movb	%al,3(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al		// texel for pixel 4

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx

-	movb	%al,4(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al		// texel for pixel 5

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx

-	movb	%al,5(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al		// texel for pixel 6

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx

-	movb	%al,6(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al		// texel for pixel 7

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx

-	movb	%al,7(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al		// texel for pixel 8 (stored after LFDIVInFlight2)

-	adcl	advancetable+4(,%ecx,4),%esi

-//

-// start FDIV for end of next segment in flight, so it can overlap

-//

-	movl	counttemp,%ecx

-	cmpl	$16,%ecx			// more than one segment after this?

-	ja		LSetupNotLast2	// yes

-	decl	%ecx			// %ecx = remaining pixels - 1

-	jz		LFDIVInFlight2	// if only one pixel, no need to start an FDIV

-	movl	%ecx,spancountminus1

-	fildl	spancountminus1	// scm1 | 1/z | t/z | s/z (tail omitted below)

-	flds	C(d_zistepu)		// C(d_zistepu) | spancountminus1

-	fmul	%st(1),%st(0)	// C(d_zistepu)*scm1 | scm1

-	flds	C(d_tdivzstepu)	// C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1

-	fmul	%st(2),%st(0)	// C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1

-	fxch	%st(1)			// C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 | scm1

-	faddp	%st(0),%st(3)	// C(d_tdivzstepu)*scm1 | scm1

-	fxch	%st(1)			// scm1 | C(d_tdivzstepu)*scm1

-	fmuls	C(d_sdivzstepu)	// C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1

-	fxch	%st(1)			// C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1

-	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1

-	flds	fp_64k			// 64k | C(d_sdivzstepu)*scm1

-	fxch	%st(1)			// C(d_sdivzstepu)*scm1 | 64k

-	faddp	%st(0),%st(4)	// 64k

-	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to

-							//  overlap

-	jmp		LFDIVInFlight2

-	.align	4

-LSetupNotLast2:

-	fadds	zi16stepu		// 1/z + zi16stepu | t/z | s/z

-	fxch	%st(2)			// s/z | t/z | 1/z

-	fadds	sdivz16stepu	// s/z + sdivz16stepu | t/z | 1/z

-	fxch	%st(2)			// 1/z | t/z | s/z

-	flds	tdivz16stepu	// tdivz16stepu | 1/z | t/z | s/z

-	faddp	%st(0),%st(2)	// 1/z | t/z + tdivz16stepu | s/z

-	flds	fp_64k			// fp_64k | 1/z | t/z | s/z

-	fdiv	%st(1),%st(0)	// z = 1/(1/z), scaled by fp_64k

-							// this is what we've gone to all this trouble to

-							//  overlap

-LFDIVInFlight2:

-	movl	%ecx,counttemp	// count if more than 16 pixels remain, else remaining - 1

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx		// -1 if the tfrac add carried, else 0

-	movb	%al,8(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al		// texel for pixel 9

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx

-	movb	%al,9(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al		// texel for pixel 10

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx

-	movb	%al,10(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al		// texel for pixel 11

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx

-	movb	%al,11(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al		// texel for pixel 12

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx

-	movb	%al,12(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al		// texel for pixel 13

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx

-	movb	%al,13(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al		// texel for pixel 14

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx

-	movb	%al,14(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al		// texel for pixel 15 (stored below at -1(%edi))

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	$16,%edi		// advance pdest past this segment

-	movl	%edx,tfracf

-	movl	snext,%edx

-	movl	%ebx,sfracf

-	movl	tnext,%ebx

-	movl	%edx,s			// s = snext

-	movl	%ebx,t			// t = tnext

-	movl	counttemp,%ecx		// retrieve count

-//

-// determine whether last segment or not

-//

-	cmpl	$16,%ecx				// are there multiple segments remaining?

-	movb	%al,-1(%edi)			// pixel 15 of the segment just finished

-	ja		LNotLastSegment		// yes

-//

-// last segment of scan

-//

-LLastSegment:

-//

-// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to

-// get there. The number of pixels left is variable, and we want to land on the

-// last pixel, not step one past it, so we can't run into arithmetic problems

-//

-	testl	%ecx,%ecx		// %ecx = pixels in this segment - 1

-	jz		LNoSteps		// just draw the last pixel and we're done

-// pick up after the FDIV that was left in flight previously

-	fld		%st(0)			// duplicate it

-	fmul	%st(4),%st(0)	// s = s/z * z

-	fxch	%st(1)

-	fmul	%st(3),%st(0)	// t = t/z * z

-	fxch	%st(1)

-	fistpl	snext

-	fistpl	tnext

-	movb	(%esi),%al		// load first texel in segment

-	movl	C(tadjust),%ebx

-	movb	%al,(%edi)		// store first pixel in segment

-	movl	C(sadjust),%eax

-	addl	snext,%eax		// %eax = sadjust + snext

-	addl	tnext,%ebx		// %ebx = tadjust + tnext

-	movl	C(bbextents),%ebp

-	movl	C(bbextentt),%edx

-	cmpl	$4096,%eax

-	jl		LClampLow4

-	cmpl	%ebp,%eax

-	ja		LClampHigh4

-LClampReentry4:

-	movl	%eax,snext

-	cmpl	$4096,%ebx

-	jl		LClampLow5

-	cmpl	%edx,%ebx

-	ja		LClampHigh5

-LClampReentry5:

-	cmpl	$1,%ecx			// don't bother

-	je		LOnlyOneStep	// if two pixels in segment, there's only one step,

-							//  of the segment length

-	subl	s,%eax			// %eax = snext - s

-	subl	t,%ebx			// %ebx = tnext - t

-	addl	%eax,%eax		// convert to 15.17 format so multiply by 1.31

-	addl	%ebx,%ebx		//  reciprocal yields 16.48

-	imull	reciprocal_table_16-8(,%ecx,4)	// sstep = (snext - s) /

-											//  (spancount-1); result in %edx

-	movl	%edx,%ebp		// %ebp = sstep

-	movl	%ebx,%eax

-	imull	reciprocal_table_16-8(,%ecx,4)	// tstep = (tnext - t) /

-											//  (spancount-1); result in %edx

-LSetEntryvec:

-//

-// set up advancetable; on entry %ebp = sstep, %edx = tstep, %ecx = pixels - 1

-//

-	movl	entryvec_table_16(,%ecx,4),%ebx

-	movl	%edx,%eax

-	movl	%ebx,jumptemp		// entry point for the indirect jmp below

-	movl	%ebp,%ecx

-	sarl	$16,%edx			// tstep >>= 16;

-	movl	C(cachewidth),%ebx

-	sarl	$16,%ecx			// sstep >>= 16;

-	imull	%ebx,%edx

-	addl	%ecx,%edx			// add in sstep

-								// (tstep >> 16) * cachewidth + (sstep >> 16);

-	movl	tfracf,%ecx

-	movl	%edx,advancetable+4	// advance base in t

-	addl	%ebx,%edx			// ((tstep >> 16) + 1) * cachewidth +

-								//  (sstep >> 16);

-	shll	$16,%ebp			// left-justify sstep fractional part

-	movl	sfracf,%ebx

-	shll	$16,%eax			// left-justify tstep fractional part

-	movl	%edx,advancetable	// advance extra in t

-	movl	%eax,tstep

-	movl	%ecx,%edx

-	addl	%eax,%edx			// tfrac += tstep fractional part

-	sbbl	%ecx,%ecx			// -1 if that carried, else 0

-	addl	%ebp,%ebx			// sfrac += sstep fractional part

-	adcl	advancetable+4(,%ecx,4),%esi	// step %esi to the second texel

-	jmp		*jumptemp			// jump to the number-of-pixels handler

-//----------------------------------------

-LNoSteps:

-	movb	(%esi),%al		// load first texel in segment

-	subl	$15,%edi			// adjust for hardwired offset (LEndSpan stores at 15(%edi))

-	jmp		LEndSpan

-LOnlyOneStep:

-	subl	s,%eax			// %eax = snext - s

-	subl	t,%ebx			// %ebx = tnext - t

-	movl	%eax,%ebp		// single step: sstep is the whole s delta

-	movl	%ebx,%edx		// and tstep is the whole t delta

-	jmp		LSetEntryvec

-//----------------------------------------

-.globl	Entry2_16, Entry3_16, Entry4_16, Entry5_16

-.globl	Entry6_16, Entry7_16, Entry8_16, Entry9_16

-.globl	Entry10_16, Entry11_16, Entry12_16, Entry13_16

-.globl	Entry14_16, Entry15_16, Entry16_16

-Entry2_16:

-	subl	$14,%edi		// adjust for hardwired offsets

-	movb	(%esi),%al		// texel for the last pixel, stored by LEndSpan

-	jmp		LEntry2_16

-//----------------------------------------

-Entry3_16:

-	subl	$13,%edi		// adjust for hardwired offsets

-	addl	%eax,%edx		// tfrac += tstep fraction (still in %eax from LSetEntryvec)

-	movb	(%esi),%al		// texel for the second pixel of the segment

-	sbbl	%ecx,%ecx		// -1 if the tfrac add carried, else 0

-	addl	%ebp,%ebx		// sfrac += sstep fraction

-	adcl	advancetable+4(,%ecx,4),%esi	// step to the next source texel

-	jmp		LEntry3_16

-//----------------------------------------

-Entry4_16:

-	subl	$12,%edi		// adjust for hardwired offsets

-	addl	%eax,%edx

-	movb	(%esi),%al

-	sbbl	%ecx,%ecx

-	addl	%ebp,%ebx

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx		// carry is consumed by the sbbl at LEntry4_16

-	jmp		LEntry4_16

-//----------------------------------------

-Entry5_16:

-	subl	$11,%edi		// adjust for hardwired offsets

-	addl	%eax,%edx

-	movb	(%esi),%al

-	sbbl	%ecx,%ecx

-	addl	%ebp,%ebx

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	jmp		LEntry5_16

-//----------------------------------------

-Entry6_16:

-	subl	$10,%edi		// adjust for hardwired offsets

-	addl	%eax,%edx

-	movb	(%esi),%al

-	sbbl	%ecx,%ecx

-	addl	%ebp,%ebx

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	jmp		LEntry6_16

-//----------------------------------------

-Entry7_16:

-	subl	$9,%edi		// adjust for hardwired offsets

-	addl	%eax,%edx

-	movb	(%esi),%al

-	sbbl	%ecx,%ecx

-	addl	%ebp,%ebx

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	jmp		LEntry7_16

-//----------------------------------------

-Entry8_16:

-	subl	$8,%edi		// adjust for hardwired offsets

-	addl	%eax,%edx

-	movb	(%esi),%al

-	sbbl	%ecx,%ecx

-	addl	%ebp,%ebx

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	jmp		LEntry8_16

-//----------------------------------------

-Entry9_16:

-	subl	$7,%edi		// adjust for hardwired offsets

-	addl	%eax,%edx

-	movb	(%esi),%al

-	sbbl	%ecx,%ecx

-	addl	%ebp,%ebx

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	jmp		LEntry9_16

-//----------------------------------------

-Entry10_16:

-	subl	$6,%edi		// adjust for hardwired offsets

-	addl	%eax,%edx

-	movb	(%esi),%al

-	sbbl	%ecx,%ecx

-	addl	%ebp,%ebx

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	jmp		LEntry10_16

-//----------------------------------------

-Entry11_16:

-	subl	$5,%edi		// adjust for hardwired offsets

-	addl	%eax,%edx

-	movb	(%esi),%al

-	sbbl	%ecx,%ecx

-	addl	%ebp,%ebx

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	jmp		LEntry11_16

-//----------------------------------------

-Entry12_16:

-	subl	$4,%edi		// adjust for hardwired offsets

-	addl	%eax,%edx

-	movb	(%esi),%al

-	sbbl	%ecx,%ecx

-	addl	%ebp,%ebx

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	jmp		LEntry12_16

-//----------------------------------------

-Entry13_16:

-	subl	$3,%edi		// adjust for hardwired offsets

-	addl	%eax,%edx

-	movb	(%esi),%al

-	sbbl	%ecx,%ecx

-	addl	%ebp,%ebx

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	jmp		LEntry13_16

-//----------------------------------------

-Entry14_16:

-	subl	$2,%edi		// adjust for hardwired offsets

-	addl	%eax,%edx

-	movb	(%esi),%al

-	sbbl	%ecx,%ecx

-	addl	%ebp,%ebx

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	jmp		LEntry14_16

-//----------------------------------------

-Entry15_16:

-	decl	%edi		// adjust for hardwired offsets

-	addl	%eax,%edx

-	movb	(%esi),%al

-	sbbl	%ecx,%ecx

-	addl	%ebp,%ebx

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	jmp		LEntry15_16

-Entry16_16:

-	addl	%eax,%edx		// tfrac += tstep fraction; no %edi adjust for a full 16 pixels

-	movb	(%esi),%al		// texel for pixel 1

-	sbbl	%ecx,%ecx

-	addl	%ebp,%ebx

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-	sbbl	%ecx,%ecx

-	movb	%al,1(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-LEntry15_16:

-	sbbl	%ecx,%ecx		// each LEntryN_16 expects the tfrac carry in CF and a texel in %al

-	movb	%al,2(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-LEntry14_16:

-	sbbl	%ecx,%ecx

-	movb	%al,3(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-LEntry13_16:

-	sbbl	%ecx,%ecx

-	movb	%al,4(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-LEntry12_16:

-	sbbl	%ecx,%ecx

-	movb	%al,5(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-LEntry11_16:

-	sbbl	%ecx,%ecx

-	movb	%al,6(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-LEntry10_16:

-	sbbl	%ecx,%ecx

-	movb	%al,7(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-LEntry9_16:

-	sbbl	%ecx,%ecx

-	movb	%al,8(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-LEntry8_16:

-	sbbl	%ecx,%ecx

-	movb	%al,9(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-LEntry7_16:

-	sbbl	%ecx,%ecx

-	movb	%al,10(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-LEntry6_16:

-	sbbl	%ecx,%ecx

-	movb	%al,11(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-LEntry5_16:

-	sbbl	%ecx,%ecx

-	movb	%al,12(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi

-	addl	tstep,%edx

-LEntry4_16:

-	sbbl	%ecx,%ecx

-	movb	%al,13(%edi)

-	addl	%ebp,%ebx

-	movb	(%esi),%al

-	adcl	advancetable+4(,%ecx,4),%esi	// last texture step; no further tfrac add follows

-LEntry3_16:

-	movb	%al,14(%edi)

-	movb	(%esi),%al		// texel for the final pixel, stored by LEndSpan at 15(%edi)

-LEntry2_16:

-LEndSpan:

-//

-// clear s/z, t/z, 1/z from FP stack; %al holds the span's final pixel on entry

-//

-	fstp %st(0)

-	fstp %st(0)

-	fstp %st(0)

-	movl	pspantemp,%ebx				// restore spans pointer

-	movl	espan_t_pnext(%ebx),%ebx	// point to next span

-	testl	%ebx,%ebx			// any more spans?

-	movb	%al,15(%edi)		// store the final pixel (movb leaves the testl flags intact)

-	jnz		LSpanLoop			// more spans

-	popl	%ebx				// restore register variables

-	popl	%esi

-	popl	%edi

-	popl	%ebp				// restore the caller's stack frame

-	ret

-#endif	// id386

--- a/d_ifacea.h

+++ /dev/null

@@ -1,79 +1,0 @@

-//

-// d_ifacea.h

-//

-// Include file for asm driver interface: constants and structure field

-// byte offsets used by the assembly sources.

-//

-//

-// !!! note that this file must match the corresponding C structures in

-// d_iface.h at all times !!!

-//

-// !!! if this is changed, it must be changed in r_shared.h too !!!

-#define ALIAS_ONSEAM				0x0020

-// !!! if this is changed, it must be changed in d_iface.h too !!!

-#define TURB_TEX_SIZE	64		// base turbulent texture size

-// !!! if this is changed, it must be changed in d_iface.h too !!!

-#define	CYCLE	128		// asm_draw.h defines the same name with the same value

-// !!! if this is changed, it must be changed in r_shared.h too !!!

-#define	MAXHEIGHT	1024

-// !!! if this is changed, it must be changed in quakedef.h too !!!

-#define CACHE_SIZE	32		// used to align key data structures

-// particle_t structure (byte offsets of fields)

-// !!! if this is changed, it must be changed in d_iface.h too !!!

-// driver-usable fields

-#define pt_org				0

-#define pt_color			12

-// drivers never touch the following fields

-#define pt_next				16

-#define pt_vel				20

-#define pt_ramp				32

-#define pt_die				36

-#define pt_type				40

-#define pt_size				44	// presumably sizeof(particle_t); verify against d_iface.h

-#define PARTICLE_Z_CLIP	8.0

-// finalvert_t structure (byte offsets of fields)

-// !!! if this is changed, it must be changed in d_iface.h too !!!

-#define fv_v				0	// !!! if this is moved, cases where the !!!

-								// !!! address of this field is pushed in !!!

-								// !!! d_polysa.s must be changed !!!

-#define fv_flags			24

-#define fv_reserved			28

-#define fv_size				32

-#define fv_shift			5	// 1 << fv_shift == fv_size

-// stvert_t structure (byte offsets of fields)

-// !!! if this is changed, it must be changed in modelgen.h too !!!

-#define stv_onseam	0

-#define stv_s		4

-#define stv_t		8

-#define stv_size	12

-// trivertx_t structure (byte offsets of fields)

-// !!! if this is changed, it must be changed in modelgen.h too !!!

-#define tv_v				0

-#define tv_lightnormalindex	3

-#define tv_size				4

-// affinetridesc_t structure (byte offsets of fields)

-// !!! if this is changed, it must be changed in d_iface.h too !!!

-#define atd_pskin			0

-#define atd_pskindesc		4

-#define atd_skinwidth		8

-#define atd_skinheight		12

-#define atd_ptriangles		16

-#define atd_pfinalverts		20

-#define atd_numtriangles	24

-#define atd_drawtype		28

-#define atd_seamfixupX16	32

-#define atd_size			36

--- a/d_parta.s

+++ /dev/null

@@ -1,458 +1,0 @@

-//

-// d_parta.s

-// x86 assembly-language 8-bpp particle-drawing code.

-//

-#include "asm_i386.h"

-#include "quakeasm.h"

-#include "d_ifacea.h"

-#include "asm_draw.h"

-#ifdef	id386

-//----------------------------------------------------------------------

-// 8-bpp particle drawing code.

-//----------------------------------------------------------------------

-//FIXME: comments, full optimization

-//----------------------------------------------------------------------

-// 8-bpp particle queueing code.

-//----------------------------------------------------------------------

-	.text

-#define P	12+4		// stack offset of the argument: 12 bytes of pushed registers + 4 for the return address

-	.align 4

-//----------------------------------------------------------------------

-// D_DrawParticle

-//

-// In:  P(%esp) = pointer to a particle; pt_org (3 floats) and pt_color

-//      (float) are the only fields read here.

-//

-// Transforms the particle origin into view space, rejects it when it

-// is in front of the particle z clip or outside the particle vrect,

-// computes the screen/z-buffer addresses and the particle size (pix),

-// then dispatches to a size-specific drawing routine.

-//

-// Register contract at the dispatch point (consumed by DP_1x1..DP_4x4

-// and LDefault):

-//   %eax = pix (clamped to d_pix_min..d_pix_max)

-//   %ebx = d_y_aspect_shift (zero on the jump-table path)

-//   %ch  = color

-//   %edx = address of the particle's first z-buffer entry

-//   %edi = address of the particle's first screen pixel

-//   %ebp = izi (its low 16 bits are compared against the z-buffer)

-// The x87 stack is empty again at the dispatch point; the early-out

-// paths (LPop6AndDone / LPop1AndDone) pop whatever is still on it.

-// %ebp, %edi and %ebx are saved here and restored at LDone.

-//----------------------------------------------------------------------

-.globl C(D_DrawParticle)

-C(D_DrawParticle):

-	pushl	%ebp				// preserve caller's stack frame

-	pushl	%edi				// preserve register variables

-	pushl	%ebx

-	movl	P(%esp),%edi		// %edi = particle pointer

-// FIXME: better FP overlap in general here

-// transform point

-//	VectorSubtract (p->org, r_origin, local);

-	flds	C(r_origin)

-	fsubrs	pt_org(%edi)

-	flds	pt_org+4(%edi)

-	fsubs	C(r_origin)+4

-	flds	pt_org+8(%edi)

-	fsubs	C(r_origin)+8

-	fxch	%st(2)			// local[0] | local[1] | local[2]

-//	transformed[2] = DotProduct(local, r_ppn);

-	flds	C(r_ppn)		// r_ppn[0] | local[0] | local[1] | local[2]

-	fmul	%st(1),%st(0)	// dot0 | local[0] | local[1] | local[2]

-	flds	C(r_ppn)+4	// r_ppn[1] | dot0 | local[0] | local[1] | local[2]

-	fmul	%st(3),%st(0)	// dot1 | dot0 | local[0] | local[1] | local[2]

-	flds	C(r_ppn)+8	// r_ppn[2] | dot1 | dot0 | local[0] |

-						//  local[1] | local[2]

-	fmul	%st(5),%st(0)	// dot2 | dot1 | dot0 | local[0] | local[1] | local[2]

-	fxch	%st(2)		// dot0 | dot1 | dot2 | local[0] | local[1] | local[2]

-	faddp	%st(0),%st(1) // dot0 + dot1 | dot2 | local[0] | local[1] |

-						  //  local[2]

-	faddp	%st(0),%st(1) // z | local[0] | local[1] | local[2]

-	fld		%st(0)		// z | z | local[0] | local[1] |

-						//  local[2]

-	fdivrs	float_1		// 1/z | z | local[0] | local[1] | local[2]

-	fxch	%st(1)		// z | 1/z | local[0] | local[1] | local[2]

-//	if (transformed[2] < PARTICLE_Z_CLIP)

-//		return;

-// The compare is issued here; its result is only fetched (fnstsw) a

-// few instructions later, after more FP work has been started.

-	fcomps	float_particle_z_clip	// 1/z | local[0] | local[1] | local[2]

-	fxch	%st(3)					// local[2] | local[0] | local[1] | 1/z

-	flds	C(r_pup)	// r_pup[0] | local[2] | local[0] | local[1] | 1/z

-	fmul	%st(2),%st(0)	// dot0 | local[2] | local[0] | local[1] | 1/z

-	flds	C(r_pup)+4	// r_pup[1] | dot0 | local[2] | local[0] |

-						//  local[1] | 1/z

-	fnstsw	%ax

-	testb	$1,%ah			// C0 (status word bit 8) is set when z compared below the clip value

-	jnz		LPop6AndDone	// six values are on the x87 stack here

-//	transformed[1] = DotProduct(local, r_pup);

-	fmul	%st(4),%st(0)	// dot1 | dot0 | local[2] | local[0] | local[1] | 1/z

-	flds	C(r_pup)+8	// r_pup[2] | dot1 | dot0 | local[2] |

-						//  local[0] | local[1] | 1/z

-	fmul	%st(3),%st(0)	// dot2 | dot1 | dot0 | local[2] | local[0] |

-						//  local[1] | 1/z

-	fxch	%st(2)		// dot0 | dot1 | dot2 | local[2] | local[0] |

-						//  local[1] | 1/z

-	faddp	%st(0),%st(1) // dot0 + dot1 | dot2 | local[2] | local[0] |

-						//  local[1] | 1/z

-	faddp	%st(0),%st(1) // y | local[2] | local[0] | local[1] | 1/z

-	fxch	%st(3)		// local[1] | local[2] | local[0] | y | 1/z

-//	transformed[0] = DotProduct(local, r_pright);

-	fmuls	C(r_pright)+4	// dot1 | local[2] | local[0] | y | 1/z

-	fxch	%st(2)		// local[0] | local[2] | dot1 | y | 1/z

-	fmuls	C(r_pright)	// dot0 | local[2] | dot1 | y | 1/z

-	fxch	%st(1)		// local[2] | dot0 | dot1 | y | 1/z

-	fmuls	C(r_pright)+8	// dot2 | dot0 | dot1 | y | 1/z

-	fxch	%st(2)		// dot1 | dot0 | dot2 | y | 1/z

-	faddp	%st(0),%st(1) // dot1 + dot0 | dot2 | y | 1/z

-	faddp	%st(0),%st(1)	// x | y | 1/z

-	fxch	%st(1)			// y | x | 1/z

-// project the point

-	fmul	%st(2),%st(0)	// y/z | x | 1/z

-	fxch	%st(1)			// x | y/z | 1/z

-	fmul	%st(2),%st(0)	// x/z | y/z | 1/z

-	fxch	%st(1)			// y/z | x/z | 1/z

-	fsubrs	C(ycenter)		// v | x/z | 1/z

-	fxch	%st(1)			// x/z | v | 1/z

-	fadds	C(xcenter)		// u | v | 1/z

-// FIXME: preadjust xcenter and ycenter

-	fxch	%st(1)			// v | u | 1/z

-	fadds	float_point5	// v | u | 1/z

-	fxch	%st(1)			// u | v | 1/z

-	fadds	float_point5	// u | v | 1/z

-	fxch	%st(2)			// 1/z | v | u

-	fmuls	DP_32768		// 1/z * 0x8000 | v | u

-	fxch	%st(2)			// u | v | 1/z * 0x8000

-// FIXME: use Terje's fp->int trick here?

-// FIXME: check we're getting proper rounding here

-	fistpl	DP_u			// v | 1/z * 0x8000

-	fistpl	DP_v			// 1/z * 0x8000

-	movl	DP_u,%eax

-	movl	DP_v,%edx

-// if ((v > d_vrectbottom_particle) ||

-// 	(u > d_vrectright_particle) ||

-// 	(v < d_vrecty) ||

-// 	(u < d_vrectx))

-// {

-// 	continue;

-// }

-	movl	C(d_vrectbottom_particle),%ebx

-	movl	C(d_vrectright_particle),%ecx

-	cmpl	%ebx,%edx

-	jg		LPop1AndDone	// only 1/z * 0x8000 is left on the x87 stack

-	cmpl	%ecx,%eax

-	jg		LPop1AndDone

-	movl	C(d_vrecty),%ebx

-	movl	C(d_vrectx),%ecx

-	cmpl	%ebx,%edx

-	jl		LPop1AndDone

-	cmpl	%ecx,%eax

-	jl		LPop1AndDone

-	flds	pt_color(%edi)	// color | 1/z * 0x8000

-// FIXME: use Terje's fast fp->int trick?

-	fistpl	DP_Color		// 1/z * 0x8000

-	movl	C(d_viewbuffer),%ebx

-	addl	%eax,%ebx		// %ebx = d_viewbuffer + u

-	movl	C(d_scantable)(,%edx,4),%edi		// point to the pixel

-	imull	C(d_zrowbytes),%edx		// point to the z pixel

-	leal	(%edx,%eax,2),%edx		// z entries are 2 bytes wide

-	movl	C(d_pzbuffer),%eax

-	fistpl	izi				// x87 stack is empty from here on

-	addl	%ebx,%edi

-	addl	%eax,%edx

-// pix = izi >> d_pix_shift;

-	movl	izi,%eax

-	movl	C(d_pix_shift),%ecx

-	shrl	%cl,%eax

-	movl	izi,%ebp

-// if (pix < d_pix_min)

-// 		pix = d_pix_min;

-// else if (pix > d_pix_max)

-//  	pix = d_pix_max;

-	movl	C(d_pix_min),%ebx

-	movl	C(d_pix_max),%ecx

-	cmpl	%ebx,%eax

-	jnl		LTestPixMax

-	movl	%ebx,%eax

-	jmp		LTestDone

-LTestPixMax:

-	cmpl	%ecx,%eax

-	jng		LTestDone

-	movl	%ecx,%eax

-LTestDone:

-	movb	DP_Color,%ch

-	movl	C(d_y_aspect_shift),%ebx

-	testl	%ebx,%ebx

-	jnz		LDefault		// the unrolled routines are only used without aspect scaling

-	cmpl	$4,%eax

-	ja		LDefault		// only sizes up to 4 have a table entry

-// The table is indexed by pix-1, hence the -4 bias.

-// NOTE(review): pix == 0 would index one entry before the table;

-// presumably d_pix_min is always >= 1 -- confirm where it is set.

-// The '*' marks the operand as an absolute indirect jump target, as

-// AT&T syntax requires.

-	jmp		*DP_EntryTable-4(,%eax,4)

-// 1x1: jump-table target for pix == 1; z-tests and plots a single pixel (%edx = z entry, %edi = pixel, %bp = izi, %ch = color)

-.globl	DP_1x1

-DP_1x1:

-	cmpw	%bp,(%edx)		// just one pixel to do

-	jg		LDone			// stored z > izi (signed 16-bit compare): leave the pixel alone

-	movw	%bp,(%edx)		// pz[0] = izi

-	movb	%ch,(%edi)		// pdest[0] = color

-	jmp		LDone

-// 2x2: jump-table target for pix == 2; z-tests and plots a 2x2 block (%edx = z entry, %edi = pixel, %bp = izi, %ch = color)

-.globl	DP_2x2

-DP_2x2:

-	pushl	%esi				// %esi is not saved by D_DrawParticle's prologue

-	movl	C(screenwidth),%ebx	// %ebx = byte step between screen rows

-	movl	C(d_zrowbytes),%esi	// %esi = byte step between z-buffer rows

-	cmpw	%bp,(%edx)			// row 0, column 0

-	jg		L2x2_1				// stored z > izi: skip this pixel

-	movw	%bp,(%edx)

-	movb	%ch,(%edi)

-L2x2_1:

-	cmpw	%bp,2(%edx)			// row 0, column 1 (z entries are 2 bytes apart)

-	jg		L2x2_2

-	movw	%bp,2(%edx)

-	movb	%ch,1(%edi)

-L2x2_2:

-	cmpw	%bp,(%edx,%esi,1)	// row 1, column 0

-	jg		L2x2_3

-	movw	%bp,(%edx,%esi,1)

-	movb	%ch,(%edi,%ebx,1)

-L2x2_3:

-	cmpw	%bp,2(%edx,%esi,1)	// row 1, column 1

-	jg		L2x2_4

-	movw	%bp,2(%edx,%esi,1)

-	movb	%ch,1(%edi,%ebx,1)

-L2x2_4:

-	popl	%esi

-	jmp		LDone

-// 3x3: jump-table target for pix == 3; z-tests and plots a 3x3 block (%edx = z entry, %edi = pixel, %bp = izi, %ch = color)

-.globl	DP_3x3

-DP_3x3:

-	pushl	%esi				// %esi is not saved by D_DrawParticle's prologue

-	movl	C(screenwidth),%ebx	// %ebx = byte step between screen rows

-	movl	C(d_zrowbytes),%esi	// %esi = byte step between z-buffer rows

-	cmpw	%bp,(%edx)			// row 0, column 0

-	jg		L3x3_1				// stored z > izi: skip this pixel

-	movw	%bp,(%edx)

-	movb	%ch,(%edi)

-L3x3_1:

-	cmpw	%bp,2(%edx)			// row 0, column 1

-	jg		L3x3_2

-	movw	%bp,2(%edx)

-	movb	%ch,1(%edi)

-L3x3_2:

-	cmpw	%bp,4(%edx)			// row 0, column 2

-	jg		L3x3_3

-	movw	%bp,4(%edx)

-	movb	%ch,2(%edi)

-L3x3_3:

-	cmpw	%bp,(%edx,%esi,1)	// row 1, column 0

-	jg		L3x3_4

-	movw	%bp,(%edx,%esi,1)

-	movb	%ch,(%edi,%ebx,1)

-L3x3_4:

-	cmpw	%bp,2(%edx,%esi,1)	// row 1, column 1

-	jg		L3x3_5

-	movw	%bp,2(%edx,%esi,1)

-	movb	%ch,1(%edi,%ebx,1)

-L3x3_5:

-	cmpw	%bp,4(%edx,%esi,1)	// row 1, column 2

-	jg		L3x3_6

-	movw	%bp,4(%edx,%esi,1)

-	movb	%ch,2(%edi,%ebx,1)

-L3x3_6:

-	cmpw	%bp,(%edx,%esi,2)	// row 2, column 0 (scale 2 = two row steps)

-	jg		L3x3_7

-	movw	%bp,(%edx,%esi,2)

-	movb	%ch,(%edi,%ebx,2)

-L3x3_7:

-	cmpw	%bp,2(%edx,%esi,2)	// row 2, column 1

-	jg		L3x3_8

-	movw	%bp,2(%edx,%esi,2)

-	movb	%ch,1(%edi,%ebx,2)

-L3x3_8:

-	cmpw	%bp,4(%edx,%esi,2)	// row 2, column 2

-	jg		L3x3_9

-	movw	%bp,4(%edx,%esi,2)

-	movb	%ch,2(%edi,%ebx,2)

-L3x3_9:

-	popl	%esi

-	jmp		LDone

-// 4x4: jump-table target for pix == 4; z-tests and plots a 4x4 block as two 4x2 halves (%edx = z entry, %edi = pixel, %bp = izi, %ch = color)

-.globl	DP_4x4

-DP_4x4:

-	pushl	%esi				// %esi is not saved by D_DrawParticle's prologue

-	movl	C(screenwidth),%ebx	// %ebx = byte step between screen rows

-	movl	C(d_zrowbytes),%esi	// %esi = byte step between z-buffer rows

-	cmpw	%bp,(%edx)			// row 0, column 0

-	jg		L4x4_1				// stored z > izi: skip this pixel

-	movw	%bp,(%edx)

-	movb	%ch,(%edi)

-L4x4_1:

-	cmpw	%bp,2(%edx)			// row 0, column 1

-	jg		L4x4_2

-	movw	%bp,2(%edx)

-	movb	%ch,1(%edi)

-L4x4_2:

-	cmpw	%bp,4(%edx)			// row 0, column 2

-	jg		L4x4_3

-	movw	%bp,4(%edx)

-	movb	%ch,2(%edi)

-L4x4_3:

-	cmpw	%bp,6(%edx)			// row 0, column 3

-	jg		L4x4_4

-	movw	%bp,6(%edx)

-	movb	%ch,3(%edi)

-L4x4_4:

-	cmpw	%bp,(%edx,%esi,1)	// row 1, column 0

-	jg		L4x4_5

-	movw	%bp,(%edx,%esi,1)

-	movb	%ch,(%edi,%ebx,1)

-L4x4_5:

-	cmpw	%bp,2(%edx,%esi,1)	// row 1, column 1

-	jg		L4x4_6

-	movw	%bp,2(%edx,%esi,1)

-	movb	%ch,1(%edi,%ebx,1)

-L4x4_6:

-	cmpw	%bp,4(%edx,%esi,1)	// row 1, column 2

-	jg		L4x4_7

-	movw	%bp,4(%edx,%esi,1)

-	movb	%ch,2(%edi,%ebx,1)

-L4x4_7:

-	cmpw	%bp,6(%edx,%esi,1)	// row 1, column 3

-	jg		L4x4_8

-	movw	%bp,6(%edx,%esi,1)

-	movb	%ch,3(%edi,%ebx,1)

-L4x4_8:

-	leal	(%edx,%esi,2),%edx	// advance the z pointer two rows: rows 2 and 3 reuse the addressing above

-	leal	(%edi,%ebx,2),%edi	// advance the pixel pointer two rows

-	cmpw	%bp,(%edx)			// row 2, column 0

-	jg		L4x4_9

-	movw	%bp,(%edx)

-	movb	%ch,(%edi)

-L4x4_9:

-	cmpw	%bp,2(%edx)			// row 2, column 1

-	jg		L4x4_10

-	movw	%bp,2(%edx)

-	movb	%ch,1(%edi)

-L4x4_10:

-	cmpw	%bp,4(%edx)			// row 2, column 2

-	jg		L4x4_11

-	movw	%bp,4(%edx)

-	movb	%ch,2(%edi)

-L4x4_11:

-	cmpw	%bp,6(%edx)			// row 2, column 3

-	jg		L4x4_12

-	movw	%bp,6(%edx)

-	movb	%ch,3(%edi)

-L4x4_12:

-	cmpw	%bp,(%edx,%esi,1)	// row 3, column 0

-	jg		L4x4_13

-	movw	%bp,(%edx,%esi,1)

-	movb	%ch,(%edi,%ebx,1)

-L4x4_13:

-	cmpw	%bp,2(%edx,%esi,1)	// row 3, column 1

-	jg		L4x4_14

-	movw	%bp,2(%edx,%esi,1)

-	movb	%ch,1(%edi,%ebx,1)

-L4x4_14:

-	cmpw	%bp,4(%edx,%esi,1)	// row 3, column 2

-	jg		L4x4_15

-	movw	%bp,4(%edx,%esi,1)

-	movb	%ch,2(%edi,%ebx,1)

-L4x4_15:

-	cmpw	%bp,6(%edx,%esi,1)	// row 3, column 3

-	jg		L4x4_16

-	movw	%bp,6(%edx,%esi,1)

-	movb	%ch,3(%edi,%ebx,1)

-L4x4_16:

-	popl	%esi

-	jmp		LDone

-// default case, handling any size particle (entered with %eax = pix, %edx = z entry, %edi = pixel, %bp = izi, %ch = color)

-LDefault:

-// count = pix << d_y_aspect_shift;

-	movl	%eax,%ebx

-	movl	%eax,DP_Pix			// keep pix in memory; %eax becomes the column counter

-	movb	C(d_y_aspect_shift),%cl	// only %cl is written, so the color in %ch survives

-	shll	%cl,%ebx			// %ebx = count (number of rows)

-// for ( ; count ; count--, pz += d_zwidth, pdest += screenwidth)

-// {

-// 	for (i=0 ; i<pix ; i++)

-// 	{

-// 		if (pz[i] <= izi)

-// 		{

-// 			pz[i] = izi;

-// 			pdest[i] = color;

-// 		}

-// 	}

-// }

-// Unlike the C loops above, both loops here test after the body, so they rely on count and pix being non-zero.

-LGenRowLoop:

-	movl	DP_Pix,%eax			// restart the column counter for this row

-LGenColLoop:

-	cmpw	%bp,-2(%edx,%eax,2)	// columns are visited from pix-1 down to 0; this is pz[%eax-1]

-	jg		LGSkip				// stored z > izi: skip this pixel

-	movw	%bp,-2(%edx,%eax,2)

-	movb	%ch,-1(%edi,%eax,1)	// pdest[%eax-1] = color

-LGSkip:

-	decl	%eax			// --pix

-	jnz		LGenColLoop

-	addl	C(d_zrowbytes),%edx	// next z-buffer row

-	addl	C(screenwidth),%edi	// next screen row

-	decl	%ebx			// --count

-	jnz		LGenRowLoop

-LDone:

-	popl	%ebx				// restore register variables

-	popl	%edi

-	popl	%ebp				// restore the caller's stack frame

-	ret

-LPop6AndDone:					// early out with six values on the x87 stack: pop five here, the sixth below

-	fstp	%st(0)

-	fstp	%st(0)

-	fstp	%st(0)

-	fstp	%st(0)

-	fstp	%st(0)

-LPop1AndDone:					// early out with one value left on the x87 stack

-	fstp	%st(0)

-	jmp		LDone

-#endif	// id386

--- a/d_polysa.s

+++ /dev/null

@@ -1,1723 +1,0 @@

-//

-// d_polysa.s

-// x86 assembly-language polygon model drawing code

-//

-#include "asm_i386.h"

-#include "quakeasm.h"

-#include "asm_draw.h"

-#include "d_ifacea.h"

-#ifdef	id386

-// !!! if this is changed, it must be changed in d_polyse.c too !!!

-#define DPS_MAXSPANS			MAXHEIGHT+1

-									// 1 extra for spanpackage that marks end

-// General form of SPAN_SIZE, kept for reference; the active define below is the same expression with MAXHEIGHT = 1024, CACHE_SIZE = 32 and spanpackage_t_size = 32 substituted:

-//#define	SPAN_SIZE	(((DPS_MAXSPANS + 1 + ((CACHE_SIZE - 1) / spanpackage_t_size)) + 1) * spanpackage_t_size)

-#define SPAN_SIZE (1024+1+1+1)*32

-	.data

-	.align	4

-p10_minus_p20:	.single		0		// scratch: written and re-read by D_PolysetCalcGradients

-p01_minus_p21:	.single		0		// scratch: written and re-read by D_PolysetCalcGradients

-temp0:			.single		0

-temp1:			.single		0

-Ltemp:			.single		0

-aff8entryvec_table:	.long	LDraw8, LDraw7, LDraw6, LDraw5	// table of code addresses, listed from LDraw8 down to LDraw1

-				.long	LDraw4, LDraw3, LDraw2, LDraw1

-lzistepx:		.long	0

-	.text

-	.extern C(D_PolysetSetEdgeTable)

-	.extern C(D_RasterizeAliasPolySmooth)

-//----------------------------------------------------------------------

-// affine triangle gradient calculation code

-//----------------------------------------------------------------------

-#define skinwidth	4+0		// stack offset of the skinwidth argument: 4 for the return address, nothing is pushed

-.globl C(D_PolysetCalcGradients)	// takes one stack argument, skinwidth; reads r_p0/r_p1/r_p2 and d_xdenom

-C(D_PolysetCalcGradients):			// writes r_{l,s,t,zi}step{x,y}, a_sstepxfrac, a_tstepxfrac and a_ststepxwhole

-//	p00_minus_p20 = r_p0[0] - r_p2[0];

-//	p01_minus_p21 = r_p0[1] - r_p2[1];

-//	p10_minus_p20 = r_p1[0] - r_p2[0];

-//	p11_minus_p21 = r_p1[1] - r_p2[1];

-//

-//	xstepdenominv = 1.0 / (p10_minus_p20 * p01_minus_p21 -

-//			     p00_minus_p20 * p11_minus_p21);

-//

-//	ystepdenominv = -xstepdenominv;

-	fildl	C(r_p0)+0		// r_p0[0]

-	fildl	C(r_p2)+0		// r_p2[0] | r_p0[0]

-	fildl	C(r_p0)+4		// r_p0[1] | r_p2[0] | r_p0[0]

-	fildl	C(r_p2)+4		// r_p2[1] | r_p0[1] | r_p2[0] | r_p0[0]

-	fildl	C(r_p1)+0		// r_p1[0] | r_p2[1] | r_p0[1] | r_p2[0] | r_p0[0]

-	fildl	C(r_p1)+4		// r_p1[1] | r_p1[0] | r_p2[1] | r_p0[1] |

-							//  r_p2[0] | r_p0[0]

-	fxch	%st(3)			// r_p0[1] | r_p1[0] | r_p2[1] | r_p1[1] |

-							//  r_p2[0] | r_p0[0]

-	fsub	%st(2),%st(0)	// p01_minus_p21 | r_p1[0] | r_p2[1] | r_p1[1] |

-							//  r_p2[0] | r_p0[0]

-	fxch	%st(1)			// r_p1[0] | p01_minus_p21 | r_p2[1] | r_p1[1] |

-							//  r_p2[0] | r_p0[0]

-	fsub	%st(4),%st(0)	// p10_minus_p20 | p01_minus_p21 | r_p2[1] |

-							//  r_p1[1] | r_p2[0] | r_p0[0]

-	fxch	%st(5)			// r_p0[0] | p01_minus_p21 | r_p2[1] |

-							//  r_p1[1] | r_p2[0] | p10_minus_p20

-	fsubp	%st(0),%st(4)	// p01_minus_p21 | r_p2[1] | r_p1[1] |

-							//  p00_minus_p20 | p10_minus_p20

-	fxch	%st(2)			// r_p1[1] | r_p2[1] | p01_minus_p21 |

-							//  p00_minus_p20 | p10_minus_p20

-	fsubp	%st(0),%st(1)	// p11_minus_p21 | p01_minus_p21 |

-							//  p00_minus_p20 | p10_minus_p20

-	fxch	%st(1)			// p01_minus_p21 | p11_minus_p21 |

-							//  p00_minus_p20 | p10_minus_p20

-	flds	C(d_xdenom)		// d_xdenom | p01_minus_p21 | p11_minus_p21 |

-							//  p00_minus_p20 | p10_minus_p20

-	fxch	%st(4)			// p10_minus_p20 | p01_minus_p21 | p11_minus_p21 |

-							//  p00_minus_p20 | d_xdenom

-	fstps	p10_minus_p20	// p01_minus_p21 | p11_minus_p21 |

-							//  p00_minus_p20 | d_xdenom

-	fstps	p01_minus_p21	// p11_minus_p21 | p00_minus_p20 | xstepdenominv (the loaded d_xdenom value is used as xstepdenominv from here on)

-	fxch	%st(2)			// xstepdenominv | p00_minus_p20 | p11_minus_p21

-//// ceil () for light so positive steps are exaggerated, negative steps

-//// diminished,  pushing us away from underflow toward overflow. Underflow is

-//// very visible, overflow is very unlikely, because of ambient lighting

-//	t0 = r_p0[4] - r_p2[4];

-//	t1 = r_p1[4] - r_p2[4];

-	fildl	C(r_p2)+16		// r_p2[4] | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-	fildl	C(r_p0)+16		// r_p0[4] | r_p2[4] | xstepdenominv |

-							//  p00_minus_p20 | p11_minus_p21

-	fildl	C(r_p1)+16		// r_p1[4] | r_p0[4] | r_p2[4] | xstepdenominv |

-							//  p00_minus_p20 | p11_minus_p21

-	fxch	%st(2)			// r_p2[4] | r_p0[4] | r_p1[4] | xstepdenominv |

-							//  p00_minus_p20 | p11_minus_p21

-	fld		%st(0)			// r_p2[4] | r_p2[4] | r_p0[4] | r_p1[4] |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fsubrp	%st(0),%st(2)	// r_p2[4] | t0 | r_p1[4] | xstepdenominv |

-							//  p00_minus_p20 | p11_minus_p21

-	fsubrp	%st(0),%st(2)	// t0 | t1 | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-//	r_lstepx = (int)

-//			ceil((t1 * p01_minus_p21 - t0 * p11_minus_p21) * xstepdenominv);

-//	r_lstepy = (int)

-//			ceil((t1 * p00_minus_p20 - t0 * p10_minus_p20) * ystepdenominv);

-	fld		%st(0)			// t0 | t0 | t1 | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-	fmul	%st(5),%st(0)	// t0*p11_minus_p21 | t0 | t1 | xstepdenominv |

-							//  p00_minus_p20 | p11_minus_p21

-	fxch	%st(2)			// t1 | t0 | t0*p11_minus_p21 | xstepdenominv |

-							//  p00_minus_p20 | p11_minus_p21

-	fld		%st(0)			// t1 | t1 | t0 | t0*p11_minus_p21 |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fmuls	p01_minus_p21	// t1*p01_minus_p21 | t1 | t0 | t0*p11_minus_p21 |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fxch	%st(2)			// t0 | t1 | t1*p01_minus_p21 | t0*p11_minus_p21 |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fmuls	p10_minus_p20	// t0*p10_minus_p20 | t1 | t1*p01_minus_p21 |

-							//  t0*p11_minus_p21 | xstepdenominv |

-							//  p00_minus_p20 | p11_minus_p21

-	fxch	%st(1)			// t1 | t0*p10_minus_p20 | t1*p01_minus_p21 |

-							//  t0*p11_minus_p21 | xstepdenominv |

-							//  p00_minus_p20 | p11_minus_p21

-	fmul	%st(5),%st(0)	// t1*p00_minus_p20 | t0*p10_minus_p20 |

-							//  t1*p01_minus_p21 | t0*p11_minus_p21 |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fxch	%st(2)			// t1*p01_minus_p21 | t0*p10_minus_p20 |

-							//  t1*p00_minus_p20 | t0*p11_minus_p21 |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fsubp	%st(0),%st(3)	// t0*p10_minus_p20 | t1*p00_minus_p20 |

-							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fsubrp	%st(0),%st(1)	// t1*p00_minus_p20 - t0*p10_minus_p20 |

-							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fld		%st(2)			// xstepdenominv |

-							//  t1*p00_minus_p20 - t0*p10_minus_p20 |

-							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fmuls	float_minus_1	// ystepdenominv |

-							//  t1*p00_minus_p20 - t0*p10_minus_p20 |

-							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fxch	%st(2)			// t1*p01_minus_p21 - t0*p11_minus_p21 |

-							//  t1*p00_minus_p20 - t0*p10_minus_p20 |

-							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-	fmul	%st(3),%st(0)	// (t1*p01_minus_p21 - t0*p11_minus_p21)*

-							//   xstepdenominv |

-							//  t1*p00_minus_p20 - t0*p10_minus_p20 |

-							//   ystepdenominv | xstepdenominv |

-							//   p00_minus_p20 | p11_minus_p21

-	fxch	%st(1)			// t1*p00_minus_p20 - t0*p10_minus_p20 |

-							//  (t1*p01_minus_p21 - t0*p11_minus_p21)*

-							//   xstepdenominv | ystepdenominv |

-							//   xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fmul	%st(2),%st(0)	// (t1*p00_minus_p20 - t0*p10_minus_p20)*

-							//  ystepdenominv |

-							//  (t1*p01_minus_p21 - t0*p11_minus_p21)*

-							//  xstepdenominv | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fldcw	ceil_cw			// switch the x87 control word so the two stores below round as ceil() (ceil_cw is defined elsewhere)

-	fistpl	C(r_lstepy)		// r_lstepx | ystepdenominv | xstepdenominv |

-							//  p00_minus_p20 | p11_minus_p21

-	fistpl	C(r_lstepx)		// ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-	fldcw	single_cw		// back to the control word used for the remaining stores (single_cw is defined elsewhere)

-//	t0 = r_p0[2] - r_p2[2];

-//	t1 = r_p1[2] - r_p2[2];

-	fildl	C(r_p2)+8		// r_p2[2] | ystepdenominv | xstepdenominv |

-							//  p00_minus_p20 | p11_minus_p21

-	fildl	C(r_p0)+8		// r_p0[2] | r_p2[2] | ystepdenominv |

-							//   xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fildl	C(r_p1)+8		// r_p1[2] | r_p0[2] | r_p2[2] | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fxch	%st(2)			// r_p2[2] | r_p0[2] | r_p1[2] | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fld		%st(0)			// r_p2[2] | r_p2[2] | r_p0[2] | r_p1[2] |

-							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-	fsubrp	%st(0),%st(2)	// r_p2[2] | t0 | r_p1[2] | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fsubrp	%st(0),%st(2)	// t0 | t1 | ystepdenominv | xstepdenominv |

-							//  p00_minus_p20 | p11_minus_p21

-//	r_sstepx = (int)((t1 * p01_minus_p21 - t0 * p11_minus_p21) *

-//			xstepdenominv);

-//	r_sstepy = (int)((t1 * p00_minus_p20 - t0 * p10_minus_p20) *

-//			ystepdenominv);

-	fld		%st(0)			// t0 | t0 | t1 | ystepdenominv | xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fmul	%st(6),%st(0)	// t0*p11_minus_p21 | t0 | t1 | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fxch	%st(2)			// t1 | t0 | t0*p11_minus_p21 | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fld		%st(0)			// t1 | t1 | t0 | t0*p11_minus_p21 |

-							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-	fmuls	p01_minus_p21	// t1*p01_minus_p21 | t1 | t0 | t0*p11_minus_p21 |

-							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-	fxch	%st(2)			// t0 | t1 | t1*p01_minus_p21 | t0*p11_minus_p21 |

-							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-	fmuls	p10_minus_p20	// t0*p10_minus_p20 | t1 | t1*p01_minus_p21 |

-							//  t0*p11_minus_p21 | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fxch	%st(1)			// t1 | t0*p10_minus_p20 | t1*p01_minus_p21 |

-							//  t0*p11_minus_p21 | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fmul	%st(6),%st(0)	// t1*p00_minus_p20 | t0*p10_minus_p20 |

-							//  t1*p01_minus_p21 | t0*p11_minus_p21 |

-							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-	fxch	%st(2)			// t1*p01_minus_p21 | t0*p10_minus_p20 |

-							//  t1*p00_minus_p20 | t0*p11_minus_p21 |

-							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-	fsubp	%st(0),%st(3)	// t0*p10_minus_p20 | t1*p00_minus_p20 |

-							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

-							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-	fsubrp	%st(0),%st(1)	// t1*p00_minus_p20 - t0*p10_minus_p20 |

-							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

-							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-	fmul	%st(2),%st(0)	// (t1*p00_minus_p20 - t0*p10_minus_p20)*

-							//   ystepdenominv |

-							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

-							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-	fxch	%st(1)			// t1*p01_minus_p21 - t0*p11_minus_p21 |

-							//  (t1*p00_minus_p20 - t0*p10_minus_p20)*

-							//   ystepdenominv | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fmul	%st(3),%st(0)	// (t1*p01_minus_p21 - t0*p11_minus_p21)*

-							//  xstepdenominv |

-							//  (t1*p00_minus_p20 - t0*p10_minus_p20)*

-							//  ystepdenominv | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fxch	%st(1)			// (t1*p00_minus_p20 - t0*p10_minus_p20)*

-							//  ystepdenominv |

-							//  (t1*p01_minus_p21 - t0*p11_minus_p21)*

-							//  xstepdenominv | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fistpl	C(r_sstepy)		// r_sstepx | ystepdenominv | xstepdenominv |

-							//  p00_minus_p20 | p11_minus_p21

-	fistpl	C(r_sstepx)		// ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-//	t0 = r_p0[3] - r_p2[3];

-//	t1 = r_p1[3] - r_p2[3];

-	fildl	C(r_p2)+12		// r_p2[3] | ystepdenominv | xstepdenominv |

-							//  p00_minus_p20 | p11_minus_p21

-	fildl	C(r_p0)+12		// r_p0[3] | r_p2[3] | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fildl	C(r_p1)+12		// r_p1[3] | r_p0[3] | r_p2[3] | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fxch	%st(2)			// r_p2[3] | r_p0[3] | r_p1[3] | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fld		%st(0)			// r_p2[3] | r_p2[3] | r_p0[3] | r_p1[3] |

-							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-	fsubrp	%st(0),%st(2)	// r_p2[3] | t0 | r_p1[3] | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fsubrp	%st(0),%st(2)	// t0 | t1 | ystepdenominv | xstepdenominv |

-							//  p00_minus_p20 | p11_minus_p21

-//	r_tstepx = (int)((t1 * p01_minus_p21 - t0 * p11_minus_p21) *

-//			xstepdenominv);

-//	r_tstepy = (int)((t1 * p00_minus_p20 - t0 * p10_minus_p20) *

-//			ystepdenominv);

-	fld		%st(0)			// t0 | t0 | t1 | ystepdenominv | xstepdenominv |

-							//  p00_minus_p20 | p11_minus_p21

-	fmul	%st(6),%st(0)	// t0*p11_minus_p21 | t0 | t1 | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fxch	%st(2)			// t1 | t0 | t0*p11_minus_p21 | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fld		%st(0)			// t1 | t1 | t0 | t0*p11_minus_p21 |

-							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-	fmuls	p01_minus_p21	// t1*p01_minus_p21 | t1 | t0 | t0*p11_minus_p21 |

-							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-	fxch	%st(2)			// t0 | t1 | t1*p01_minus_p21 | t0*p11_minus_p21 |

-							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-	fmuls	p10_minus_p20	// t0*p10_minus_p20 | t1 | t1*p01_minus_p21 |

-							//  t0*p11_minus_p21 | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fxch	%st(1)			// t1 | t0*p10_minus_p20 | t1*p01_minus_p21 |

-							//  t0*p11_minus_p21 | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fmul	%st(6),%st(0)	// t1*p00_minus_p20 | t0*p10_minus_p20 |

-							//  t1*p01_minus_p21 | t0*p11_minus_p21 |

-							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-	fxch	%st(2)			// t1*p01_minus_p21 | t0*p10_minus_p20 |

-							//  t1*p00_minus_p20 | t0*p11_minus_p21 |

-							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-	fsubp	%st(0),%st(3)	// t0*p10_minus_p20 | t1*p00_minus_p20 |

-							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

-							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-	fsubrp	%st(0),%st(1)	// t1*p00_minus_p20 - t0*p10_minus_p20 |

-							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

-							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-	fmul	%st(2),%st(0)	// (t1*p00_minus_p20 - t0*p10_minus_p20)*

-							//   ystepdenominv |

-							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

-							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-	fxch	%st(1)			// t1*p01_minus_p21 - t0*p11_minus_p21 |

-							//  (t1*p00_minus_p20 - t0*p10_minus_p20)*

-							//  ystepdenominv | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fmul	%st(3),%st(0)	// (t1*p01_minus_p21 - t0*p11_minus_p21)*

-							//  xstepdenominv |

-							//  (t1*p00_minus_p20 - t0*p10_minus_p20)*

-							//  ystepdenominv | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fxch	%st(1)			// (t1*p00_minus_p20 - t0*p10_minus_p20)*

-							//  ystepdenominv |

-							//  (t1*p01_minus_p21 - t0*p11_minus_p21)*

-							//  xstepdenominv | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fistpl	C(r_tstepy)		// r_tstepx | ystepdenominv | xstepdenominv |

-							//  p00_minus_p20 | p11_minus_p21

-	fistpl	C(r_tstepx)		// ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-//	t0 = r_p0[5] - r_p2[5];

-//	t1 = r_p1[5] - r_p2[5];

-	fildl	C(r_p2)+20		// r_p2[5] | ystepdenominv | xstepdenominv |

-							//  p00_minus_p20 | p11_minus_p21

-	fildl	C(r_p0)+20		// r_p0[5] | r_p2[5] | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fildl	C(r_p1)+20		// r_p1[5] | r_p0[5] | r_p2[5] | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fxch	%st(2)			// r_p2[5] | r_p0[5] | r_p1[5] | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fld		%st(0)			// r_p2[5] | r_p2[5] | r_p0[5] | r_p1[5] |

-							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  p11_minus_p21

-	fsubrp	%st(0),%st(2)	// r_p2[5] | t0 | r_p1[5] | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

-	fsubrp	%st(0),%st(2)	// t0 | t1 | ystepdenominv | xstepdenominv |

-							//  p00_minus_p20 | p11_minus_p21

-//	r_zistepx = (int)((t1 * p01_minus_p21 - t0 * p11_minus_p21) *

-//			xstepdenominv);

-//	r_zistepy = (int)((t1 * p00_minus_p20 - t0 * p10_minus_p20) *

-//			ystepdenominv);

-// This is the last gradient, so the shared terms are consumed (multiply-and-pop) instead of being preserved.

-	fld		%st(0)			// t0 | t0 | t1 | ystepdenominv | xstepdenominv |

-							//  p00_minus_p20 | p11_minus_p21

-	fmulp	%st(0),%st(6)	// t0 | t1 | ystepdenominv | xstepdenominv |

-							//  p00_minus_p20 | t0*p11_minus_p21

-	fxch	%st(1)			// t1 | t0 | ystepdenominv | xstepdenominv |

-							//  p00_minus_p20 | t0*p11_minus_p21

-	fld		%st(0)			// t1 | t1 | t0 | ystepdenominv | xstepdenominv |

-							//  p00_minus_p20 | t0*p11_minus_p21

-	fmuls	p01_minus_p21	// t1*p01_minus_p21 | t1 | t0 | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 |

-							//  t0*p11_minus_p21

-	fxch	%st(2)			// t0 | t1 | t1*p01_minus_p21 | ystepdenominv |

-							//  xstepdenominv | p00_minus_p20 |

-							//  t0*p11_minus_p21

-	fmuls	p10_minus_p20	// t0*p10_minus_p20 | t1 | t1*p01_minus_p21 |

-							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  t0*p11_minus_p21

-	fxch	%st(1)			// t1 | t0*p10_minus_p20 | t1*p01_minus_p21 |

-							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

-							//  t0*p11_minus_p21

-	fmulp	%st(0),%st(5)	// t0*p10_minus_p20 | t1*p01_minus_p21 |

-							//  ystepdenominv | xstepdenominv |

-							//  t1*p00_minus_p20 | t0*p11_minus_p21

-	fxch	%st(5)			// t0*p11_minus_p21 | t1*p01_minus_p21 |

-							//  ystepdenominv | xstepdenominv |

-							//  t1*p00_minus_p20 | t0*p10_minus_p20

-	fsubrp	%st(0),%st(1)	// t1*p01_minus_p21 - t0*p11_minus_p21 |

-							//  ystepdenominv | xstepdenominv |

-							//  t1*p00_minus_p20 | t0*p10_minus_p20

-	fxch	%st(3)			// t1*p00_minus_p20 | ystepdenominv |

-							//  xstepdenominv |

-							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

-							//  t0*p10_minus_p20

-	fsubp	%st(0),%st(4)	// ystepdenominv | xstepdenominv |

-							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

-							//  t1*p00_minus_p20 - t0*p10_minus_p20

-	fxch	%st(1)			// xstepdenominv | ystepdenominv |

-							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

-							//  t1*p00_minus_p20 - t0*p10_minus_p20

-	fmulp	%st(0),%st(2)	// ystepdenominv |

-							//  (t1*p01_minus_p21 - t0*p11_minus_p21) *

-							//  xstepdenominv |

-							//  t1*p00_minus_p20 - t0*p10_minus_p20

-	fmulp	%st(0),%st(2)	// (t1*p01_minus_p21 - t0*p11_minus_p21) *

-							//  xstepdenominv |

-							//  (t1*p00_minus_p20 - t0*p10_minus_p20) *

-							//  ystepdenominv

-	fistpl	C(r_zistepx)	// (t1*p00_minus_p20 - t0*p10_minus_p20) *

-							//  ystepdenominv

-	fistpl	C(r_zistepy)	// x87 stack is empty again

-//	a_sstepxfrac = r_sstepx << 16;

-//	a_tstepxfrac = r_tstepx << 16;

-//

-//	a_ststepxwhole = r_affinetridesc.skinwidth * (r_tstepx >> 16) +

-//			(r_sstepx >> 16);

-	movl	C(r_sstepx),%eax

-	movl	C(r_tstepx),%edx

-	shll	$16,%eax			// keep only the low 16 bits, moved to the top of the word

-	shll	$16,%edx

-	movl	%eax,C(a_sstepxfrac)

-	movl	%edx,C(a_tstepxfrac)

-	movl	C(r_sstepx),%ecx

-	movl	C(r_tstepx),%eax

-	sarl	$16,%ecx			// arithmetic shift: the whole part keeps its sign

-	sarl	$16,%eax

-	imull	skinwidth(%esp)		// one-operand form: %edx:%eax = %eax * skinwidth argument; only %eax is used

-	addl	%ecx,%eax

-	movl	%eax,C(a_ststepxwhole)

-	ret

-//----------------------------------------------------------------------

-// recursive subdivision affine triangle drawing code

-// takes three vertex pointers (lp1, lp2, lp3) on the stack; if every edge spans at most 1 pixel in x and y it returns, otherwise it splits one edge at its midpoint, optionally plots that midpoint, and recurses on the two halves

-// not C-callable because of stdcall return (the routine pops its own 12 bytes of arguments with "ret $12")

-//----------------------------------------------------------------------

-#define lp1	4+16

-#define lp2	8+16

-#define lp3	12+16

-.globl C(D_PolysetRecursiveTriangle)

-C(D_PolysetRecursiveTriangle):

-	pushl	%ebp				// preserve caller stack frame pointer

-	pushl	%esi				// preserve register variables

-	pushl	%edi

-	pushl	%ebx	// four saved registers: this is the +16 in the lp1/lp2/lp3 offsets

-//	int		*temp;

-//	int		d;

-//	int		new[6];

-//	int		i;

-//	int		z;

-//	short	*zbuf;

-	movl	lp2(%esp),%esi	// %esi = lp2

-	movl	lp1(%esp),%ebx	// %ebx = lp1

-	movl	lp3(%esp),%edi	// %edi = lp3

-//	d = lp2[0] - lp1[0];

-//	if (d < -1 || d > 1)

-//		goto split;

-	movl	0(%esi),%eax

-	movl	0(%ebx),%edx

-	movl	4(%esi),%ebp

-	subl	%edx,%eax	// %eax = lp2[0] - lp1[0]

-	movl	4(%ebx),%ecx

-	subl	%ecx,%ebp	// %ebp = lp2[1] - lp1[1], tested in the next stanza

-	incl	%eax

-	cmpl	$2,%eax	// unsigned (d+1) > 2 is equivalent to d < -1 || d > 1

-	ja		LSplit

-//	d = lp2[1] - lp1[1];

-//	if (d < -1 || d > 1)

-//		goto split;

-	movl	0(%edi),%eax	// preload lp3[0] for the next test

-	incl	%ebp

-	cmpl	$2,%ebp

-	ja		LSplit

-//	d = lp3[0] - lp2[0];

-//	if (d < -1 || d > 1)

-//		goto split2;

-	movl	0(%esi),%edx

-	movl	4(%edi),%ebp

-	subl	%edx,%eax

-	movl	4(%esi),%ecx

-	subl	%ecx,%ebp

-	incl	%eax

-	cmpl	$2,%eax

-	ja		LSplit2

-//	d = lp3[1] - lp2[1];

-//	if (d < -1 || d > 1)

-//		goto split2;

-	movl	0(%ebx),%eax	// preload lp1[0] for the next test

-	incl	%ebp

-	cmpl	$2,%ebp

-	ja		LSplit2

-//	d = lp1[0] - lp3[0];

-//	if (d < -1 || d > 1)

-//		goto split3;

-	movl	0(%edi),%edx

-	movl	4(%ebx),%ebp

-	subl	%edx,%eax

-	movl	4(%edi),%ecx

-	subl	%ecx,%ebp

-	incl	%eax

-	incl	%ebp

-	movl	%ebx,%edx	// temp = lp1, consumed at LSplit3

-	cmpl	$2,%eax

-	ja		LSplit3

-//	d = lp1[1] - lp3[1];

-//	if (d < -1 || d > 1)

-//	{

-//split3:

-//		temp = lp1;

-//		lp1 = lp3;

-//		lp3 = lp2;

-//		lp2 = temp;

-//		goto split;

-//	}

-//

-//	return;			// entire tri is filled

-//

-	cmpl	$2,%ebp

-	jna		LDone

-LSplit3:

-	movl	%edi,%ebx	// lp1 = lp3

-	movl	%esi,%edi	// lp3 = lp2

-	movl	%edx,%esi	// lp2 = temp (the original lp1)

-	jmp		LSplit

-//split2:

-LSplit2:

-//	temp = lp1;

-//	lp1 = lp2;

-//	lp2 = lp3;

-//	lp3 = temp;

-	movl	%ebx,%eax

-	movl	%esi,%ebx

-	movl	%edi,%esi

-	movl	%eax,%edi

-//split:

-LSplit:

-	subl	$24,%esp		// allocate space for a new vertex

-//// split this edge

-//	new[0] = (lp1[0] + lp2[0]) >> 1;

-//	new[1] = (lp1[1] + lp2[1]) >> 1;

-//	new[2] = (lp1[2] + lp2[2]) >> 1;

-//	new[3] = (lp1[3] + lp2[3]) >> 1;

-//	new[5] = (lp1[5] + lp2[5]) >> 1;

-	movl	8(%ebx),%eax

-	movl	8(%esi),%edx

-	movl	12(%ebx),%ecx

-	addl	%edx,%eax

-	movl	12(%esi),%edx

-	sarl	$1,%eax

-	addl	%edx,%ecx

-	movl	%eax,8(%esp)	// new[2]

-	movl	20(%ebx),%eax

-	sarl	$1,%ecx

-	movl	20(%esi),%edx

-	movl	%ecx,12(%esp)	// new[3]

-	addl	%edx,%eax

-	movl	0(%ebx),%ecx	// %ecx = lp1[0], still live for the leading-edge test below

-	movl	0(%esi),%edx

-	sarl	$1,%eax

-	addl	%ecx,%edx

-	movl	%eax,20(%esp)	// new[5]; new[4] is never written

-	movl	4(%ebx),%eax	// %eax = lp1[1], still live for the leading-edge test below

-	sarl	$1,%edx

-	movl	4(%esi),%ebp

-	movl	%edx,0(%esp)	// new[0]

-	addl	%eax,%ebp

-	sarl	$1,%ebp

-	movl	%ebp,4(%esp)	// new[1]

-//// draw the point if splitting a leading edge

-//	if (lp2[1] > lp1[1])

-//		goto nodraw;

-	cmpl	%eax,4(%esi)

-	jg		LNoDraw

-//	if ((lp2[1] == lp1[1]) && (lp2[0] < lp1[0]))

-//		goto nodraw;

-	movl	0(%esi),%edx

-	jnz		LDraw	// still testing the flags of the cmpl above (movl leaves flags alone): lp2[1] != lp1[1]

-	cmpl	%ecx,%edx

-	jl		LNoDraw

-LDraw:

-// z = new[5] >> 16;

-	movl	20(%esp),%edx

-	movl	4(%esp),%ecx

-	sarl	$16,%edx

-	movl	0(%esp),%ebp

-//	zbuf = zspantable[new[1]] + new[0];

-	movl	C(zspantable)(,%ecx,4),%eax

-//	if (z >= *zbuf)

-//	{

-	cmpw	(%eax,%ebp,2),%dx

-	jnge	LNoDraw	// signed 16-bit compare: skip when z < *zbuf

-//		int		pix;

-//

-//		*zbuf = z;

-	movw	%dx,(%eax,%ebp,2)

-//		pix = d_pcolormap[skintable[new[3]>>16][new[2]>>16]];

-	movl	12(%esp),%eax

-	sarl	$16,%eax

-	movl	8(%esp),%edx

-	sarl	$16,%edx

-	subl	%ecx,%ecx	// clear %ecx so the byte loaded into %cl is a zero-extended index

-	movl	C(skintable)(,%eax,4),%eax

-	movl	4(%esp),%ebp

-	movb	(%eax,%edx,),%cl	// skin texel

-	movl	C(d_pcolormap),%edx

-	movb	(%edx,%ecx,),%dl	// pix = d_pcolormap[texel]

-	movl	0(%esp),%ecx

-//		d_viewbuffer[d_scantable[new[1]] + new[0]] = pix;

-	movl	C(d_scantable)(,%ebp,4),%eax

-	addl	%eax,%ecx

-	movl	C(d_viewbuffer),%eax

-	movb	%dl,(%eax,%ecx,1)

-//	}

-//

-//nodraw:

-LNoDraw:

-//// recursively continue

-//	D_PolysetRecursiveTriangle (lp3, lp1, new);

-	pushl	%esp	// address of new[] (arguments are pushed right to left)

-	pushl	%ebx

-	pushl	%edi

-	call	C(D_PolysetRecursiveTriangle)	// callee pops the three arguments, so %esp points at new[] again

-//	D_PolysetRecursiveTriangle (lp3, new, lp2);

-	movl	%esp,%ebx	// %ebx = new

-	pushl	%esi

-	pushl	%ebx

-	pushl	%edi

-	call	C(D_PolysetRecursiveTriangle)

-	addl	$24,%esp	// release new[]

-LDone:

-	popl	%ebx				// restore register variables

-	popl	%edi

-	popl	%esi

-	popl	%ebp				// restore caller stack frame pointer

-	ret		$12

-//----------------------------------------------------------------------

-// 8-bpp horizontal span drawing code for affine polygons, with smooth

-// shading and no transparency; walks an array of spanpackage_t records until one has count == -999999

-//----------------------------------------------------------------------

-#define pspans	4+8

-.globl C(D_PolysetAff8Start)

-C(D_PolysetAff8Start):

-.globl C(D_PolysetDrawSpans8)

-C(D_PolysetDrawSpans8):

-	pushl	%esi				// preserve register variables

-	pushl	%ebx

-	movl	pspans(%esp),%esi	// point to the first span descriptor

-	movl	C(r_zistepx),%ecx

-	pushl	%ebp				// preserve caller's stack frame

-	pushl	%edi

-	rorl	$16,%ecx			// put high 16 bits of 1/z step in low word

-	movl	spanpackage_t_count(%esi),%edx

-	movl	%ecx,lzistepx

-LSpanLoop:

-//		lcount = d_aspancount - pspanpackage->count;

-//

-//		errorterm += erroradjustup;

-//		if (errorterm >= 0)

-//		{

-//			d_aspancount += d_countextrastep;

-//			errorterm -= erroradjustdown;

-//		}

-//		else

-//		{

-//			d_aspancount += ubasestep;

-//		}

-	movl	C(d_aspancount),%eax

-	subl	%edx,%eax	// %eax = lcount

-	movl	C(erroradjustup),%edx

-	movl	C(errorterm),%ebx

-	addl	%edx,%ebx

-	js		LNoTurnover

-	movl	C(erroradjustdown),%edx

-	movl	C(d_countextrastep),%edi

-	subl	%edx,%ebx

-	movl	C(d_aspancount),%ebp

-	movl	%ebx,C(errorterm)

-	addl	%edi,%ebp

-	movl	%ebp,C(d_aspancount)

-	jmp		LRightEdgeStepped

-LNoTurnover:

-	movl	C(d_aspancount),%edi

-	movl	C(ubasestep),%edx

-	movl	%ebx,C(errorterm)

-	addl	%edx,%edi

-	movl	%edi,C(d_aspancount)

-LRightEdgeStepped:

-	cmpl	$1,%eax

-	jl		LNextSpan	// lcount < 1: nothing to draw for this span

-	jz		LExactlyOneLong	// same compare: lcount == 1

-//

-// set up advancetable

-//

-	movl	C(a_ststepxwhole),%ecx

-	movl	C(r_affinetridesc)+atd_skinwidth,%edx

-	movl	%ecx,advancetable+4	// advance base in t

-	addl	%edx,%ecx

-	movl	%ecx,advancetable	// advance extra in t

-	movl	C(a_tstepxfrac),%ecx

-	movw	C(r_lstepx),%cx	// low word replaced by the light step; high word keeps the t fraction step

-	movl	%eax,%edx			// count

-	movl	%ecx,tstep

-	addl	$7,%edx

-	shrl	$3,%edx				// count of full and partial loops

-	movl	spanpackage_t_sfrac(%esi),%ebx

-	movw	%dx,%bx

-	movl	spanpackage_t_pz(%esi),%ecx

-	negl	%eax

-	movl	spanpackage_t_pdest(%esi),%edi

-	andl	$7,%eax		// 0->0, 1->7, 2->6, ... , 7->1

-	subl	%eax,%edi	// compensate for hardwired offsets

-	subl	%eax,%ecx

-	subl	%eax,%ecx	// subtracted twice because z-buffer entries are 2 bytes each

-	movl	spanpackage_t_tfrac(%esi),%edx

-	movw	spanpackage_t_light(%esi),%dx

-	movl	spanpackage_t_zi(%esi),%ebp

-	rorl	$16,%ebp	// put high 16 bits of 1/z in low word

-	pushl	%esi	// save the span pointer; restored by the popl after the draw loop

-	movl	spanpackage_t_ptex(%esi),%esi

-	jmp		aff8entryvec_table(,%eax,4)	// table is defined outside this view; presumably its entries are the LDraw8..LDraw1 labels -- TODO confirm

-// %bx = count of full and partial loops

-// %ebx high word = sfrac

-// %ecx = pz

-// %dx = light

-// %edx high word = tfrac

-// %esi = ptex

-// %edi = pdest

-// %ebp = 1/z

-// tstep low word = C(r_lstepx)

-// tstep high word = C(a_tstepxfrac)

-// C(a_sstepxfrac) low word = 0

-// C(a_sstepxfrac) high word = C(a_sstepxfrac)

-LDrawLoop:

-// FIXME: do we need to clamp light? We may need at least a buffer bit to

-// keep it from poking into tfrac and causing problems

-LDraw8:

-	cmpw	(%ecx),%bp

-	jl		Lp1	// pixel's 1/z is below the z-buffer value: skip the write but still step

-	xorl	%eax,%eax

-	movb	%dh,%ah	// high byte of light becomes bits 8-15 of the lookup index

-	movb	(%esi),%al	// skin texel becomes bits 0-7

-	movw	%bp,(%ecx)

-	movb	0x12345678(%eax),%al	// 0x12345678 is a placeholder displacement overwritten by D_Aff8Patch

-LPatch8:

-	movb	%al,(%edi)

-Lp1:

-	addl	tstep,%edx	// step tfrac (high word) and light (low word) together

-	sbbl	%eax,%eax	// %eax = -1 if tfrac carried, else 0

-	addl	lzistepx,%ebp

-	adcl	$0,%ebp	// fold the carry back in, since both operands were rotated by 16

-	addl	C(a_sstepxfrac),%ebx

-	adcl	advancetable+4(,%eax,4),%esi	// advancetable[0] (extra) on a t carry, else advancetable[1] (base), plus the sfrac carry

-LDraw7:

-	cmpw	2(%ecx),%bp

-	jl		Lp2

-	xorl	%eax,%eax

-	movb	%dh,%ah

-	movb	(%esi),%al

-	movw	%bp,2(%ecx)

-	movb	0x12345678(%eax),%al

-LPatch7:

-	movb	%al,1(%edi)

-Lp2:

-	addl	tstep,%edx

-	sbbl	%eax,%eax

-	addl	lzistepx,%ebp

-	adcl	$0,%ebp

-	addl	C(a_sstepxfrac),%ebx

-	adcl	advancetable+4(,%eax,4),%esi

-LDraw6:

-	cmpw	4(%ecx),%bp

-	jl		Lp3

-	xorl	%eax,%eax

-	movb	%dh,%ah

-	movb	(%esi),%al

-	movw	%bp,4(%ecx)

-	movb	0x12345678(%eax),%al

-LPatch6:

-	movb	%al,2(%edi)

-Lp3:

-	addl	tstep,%edx

-	sbbl	%eax,%eax

-	addl	lzistepx,%ebp

-	adcl	$0,%ebp

-	addl	C(a_sstepxfrac),%ebx

-	adcl	advancetable+4(,%eax,4),%esi

-LDraw5:

-	cmpw	6(%ecx),%bp

-	jl		Lp4

-	xorl	%eax,%eax

-	movb	%dh,%ah

-	movb	(%esi),%al

-	movw	%bp,6(%ecx)

-	movb	0x12345678(%eax),%al

-LPatch5:

-	movb	%al,3(%edi)

-Lp4:

-	addl	tstep,%edx

-	sbbl	%eax,%eax

-	addl	lzistepx,%ebp

-	adcl	$0,%ebp

-	addl	C(a_sstepxfrac),%ebx

-	adcl	advancetable+4(,%eax,4),%esi

-LDraw4:

-	cmpw	8(%ecx),%bp

-	jl		Lp5

-	xorl	%eax,%eax

-	movb	%dh,%ah

-	movb	(%esi),%al

-	movw	%bp,8(%ecx)

-	movb	0x12345678(%eax),%al

-LPatch4:

-	movb	%al,4(%edi)

-Lp5:

-	addl	tstep,%edx

-	sbbl	%eax,%eax

-	addl	lzistepx,%ebp

-	adcl	$0,%ebp

-	addl	C(a_sstepxfrac),%ebx

-	adcl	advancetable+4(,%eax,4),%esi

-LDraw3:

-	cmpw	10(%ecx),%bp

-	jl		Lp6

-	xorl	%eax,%eax

-	movb	%dh,%ah

-	movb	(%esi),%al

-	movw	%bp,10(%ecx)

-	movb	0x12345678(%eax),%al

-LPatch3:

-	movb	%al,5(%edi)

-Lp6:

-	addl	tstep,%edx

-	sbbl	%eax,%eax

-	addl	lzistepx,%ebp

-	adcl	$0,%ebp

-	addl	C(a_sstepxfrac),%ebx

-	adcl	advancetable+4(,%eax,4),%esi

-LDraw2:

-	cmpw	12(%ecx),%bp

-	jl		Lp7

-	xorl	%eax,%eax

-	movb	%dh,%ah

-	movb	(%esi),%al

-	movw	%bp,12(%ecx)

-	movb	0x12345678(%eax),%al

-LPatch2:

-	movb	%al,6(%edi)

-Lp7:

-	addl	tstep,%edx

-	sbbl	%eax,%eax

-	addl	lzistepx,%ebp

-	adcl	$0,%ebp

-	addl	C(a_sstepxfrac),%ebx

-	adcl	advancetable+4(,%eax,4),%esi

-LDraw1:

-	cmpw	14(%ecx),%bp

-	jl		Lp8

-	xorl	%eax,%eax

-	movb	%dh,%ah

-	movb	(%esi),%al

-	movw	%bp,14(%ecx)

-	movb	0x12345678(%eax),%al

-LPatch1:

-	movb	%al,7(%edi)

-Lp8:

-	addl	tstep,%edx

-	sbbl	%eax,%eax

-	addl	lzistepx,%ebp

-	adcl	$0,%ebp

-	addl	C(a_sstepxfrac),%ebx

-	adcl	advancetable+4(,%eax,4),%esi

-	addl	$8,%edi	// 8 pixels per pass

-	addl	$16,%ecx	// 8 two-byte z-buffer entries per pass

-	decw	%bx	// loop counter lives in the low word; the high word of %ebx is sfrac

-	jnz		LDrawLoop

-	popl	%esi				// restore spans pointer

-LNextSpan:

-	addl	$(spanpackage_t_size),%esi	// point to next span

-LNextSpanESISet:

-	movl	spanpackage_t_count(%esi),%edx

-	cmpl	$-999999,%edx		// any more spans?

-	jnz		LSpanLoop			// yes

-	popl	%edi

-	popl	%ebp				// restore the caller's stack frame

-	popl	%ebx				// restore register variables

-	popl	%esi

-	ret

-// draw a one-long span

-LExactlyOneLong:

-	movl	spanpackage_t_pz(%esi),%ecx

-	movl	spanpackage_t_zi(%esi),%ebp

-	rorl	$16,%ebp	// put high 16 bits of 1/z in low word

-	movl	spanpackage_t_ptex(%esi),%ebx

-	cmpw	(%ecx),%bp

-	jl		LNextSpan

-	xorl	%eax,%eax

-	movl	spanpackage_t_pdest(%esi),%edi

-	movb	spanpackage_t_light+1(%esi),%ah	// second byte of the light field, same role as %dh in the main loop

-	addl	$(spanpackage_t_size),%esi	// point to next span

-	movb	(%ebx),%al

-	movw	%bp,(%ecx)

-	movb	0x12345678(%eax),%al	// placeholder displacement overwritten by D_Aff8Patch

-LPatch9:

-	movb	%al,(%edi)

-	jmp		LNextSpanESISet	// %esi was already advanced above

-.globl C(D_PolysetAff8End)

-C(D_PolysetAff8End):

-#define pcolormap		4

-.globl C(D_Aff8Patch)

-C(D_Aff8Patch):

-	movl	pcolormap(%esp),%eax	// single stack argument; no registers are pushed, so it sits right above the return address

-	movl	%eax,LPatch1-4	// overwrite the 4 bytes just before each LPatchN label, i.e. the 0x12345678 displacement of the preceding "movb 0x12345678(%eax),%al"

-	movl	%eax,LPatch2-4

-	movl	%eax,LPatch3-4

-	movl	%eax,LPatch4-4

-	movl	%eax,LPatch5-4

-	movl	%eax,LPatch6-4

-	movl	%eax,LPatch7-4

-	movl	%eax,LPatch8-4

-	movl	%eax,LPatch9-4	// LPatch9 is the one-pixel-span path (LExactlyOneLong)

-	ret	// NOTE(review): this writes into the code section, so it presumably relies on that memory being writable -- confirm how the code is made writable at startup

-//----------------------------------------------------------------------

-// Alias model polygon dispatching code, combined with subdivided affine

-// triangle drawing code; reserves the span buffer on the stack, then either tail-jumps to D_DrawNonSubdiv or runs the subdividing loop below

-//----------------------------------------------------------------------

-.globl C(D_PolysetDraw)

-C(D_PolysetDraw):

-//	spanpackage_t	spans[DPS_MAXSPANS + 1 +

-//			((CACHE_SIZE - 1) / sizeof(spanpackage_t)) + 1];

-//						// one extra because of cache line pretouching

-//

-//	a_spans = (spanpackage_t *)

-//			(((intptr)&spans[0] + CACHE_SIZE - 1) & ~(CACHE_SIZE - 1));

-	subl	$(SPAN_SIZE),%esp

-	movl	%esp,%eax

-	addl	$(CACHE_SIZE - 1),%eax

-	andl	$(~(CACHE_SIZE - 1)),%eax	// round up to a CACHE_SIZE boundary

-	movl	%eax,C(a_spans)

-//	if (r_affinetridesc.drawtype)

-//		D_DrawSubdiv ();

-//	else

-//		D_DrawNonSubdiv ();

-	movl	C(r_affinetridesc)+atd_drawtype,%eax

-	testl	%eax,%eax

-	jz		C(D_DrawNonSubdiv)	// jump, not call: D_DrawNonSubdiv releases the SPAN_SIZE bytes itself and returns to our caller

-	pushl	%ebp				// preserve caller stack frame pointer

-//	lnumtriangles = r_affinetridesc.numtriangles;

-	movl	C(r_affinetridesc)+atd_numtriangles,%ebp

-	pushl	%esi				// preserve register variables

-	shll	$4,%ebp	// %ebp = numtriangles * 16, a byte offset just past the last triangle; NOTE(review): hard-codes a 16-byte mtriangle_t where D_DrawNonSubdiv uses mtri_shift

-	pushl	%ebx

-//	ptri = r_affinetridesc.ptriangles;

-	movl	C(r_affinetridesc)+atd_ptriangles,%ebx

-	pushl	%edi

-//	mtriangle_t		*ptri;

-//	finalvert_t		*pfv, *index0, *index1, *index2;

-//	int				i;

-//	int				lnumtriangles;

-//	int				s0, s1, s2;

-//	pfv = r_affinetridesc.pfinalverts;

-	movl	C(r_affinetridesc)+atd_pfinalverts,%edi

-//	for (i=0 ; i<lnumtriangles ; i++)

-//	{

-Llooptop:

-//		index0 = pfv + ptri[i].vertindex[0];

-//		index1 = pfv + ptri[i].vertindex[1];

-//		index2 = pfv + ptri[i].vertindex[2];

-	movl	mtri_vertindex-16+0(%ebx,%ebp,),%ecx	// the -16 selects the triangle just below offset %ebp, so the loop runs from the last triangle down to the first (unlike the ascending pseudo-C)

-	movl	mtri_vertindex-16+4(%ebx,%ebp,),%esi

-	shll	$(fv_shift),%ecx

-	movl	mtri_vertindex-16+8(%ebx,%ebp,),%edx

-	shll	$(fv_shift),%esi

-	addl	%edi,%ecx	// %ecx = index0

-	shll	$(fv_shift),%edx

-	addl	%edi,%esi	// %esi = index1

-	addl	%edi,%edx	// %edx = index2

-//		if (((index0->v[1]-index1->v[1]) *

-//				(index0->v[0]-index2->v[0]) -

-//				(index0->v[0]-index1->v[0])*(index0->v[1]-index2->v[1])) >= 0)

-//		{

-//			continue;

-//		}

-//

-//		d_pcolormap = &((byte *)acolormap)[index0->v[4] & 0xFF00];

-	fildl	fv_v+4(%ecx)	// i0v1

-	fildl	fv_v+4(%esi)	// i1v1 | i0v1

-	fildl	fv_v+0(%ecx)	// i0v0 | i1v1 | i0v1

-	fildl	fv_v+0(%edx)	// i2v0 | i0v0 | i1v1 | i0v1

-	fxch	%st(2)			// i1v1 | i0v0 | i2v0 | i0v1

-	fsubr	%st(3),%st(0)	// i0v1-i1v1 | i0v0 | i2v0 | i0v1

-	fildl	fv_v+0(%esi)	// i1v0 | i0v1-i1v1 | i0v0 | i2v0 | i0v1

-	fxch	%st(2)			// i0v0 | i0v1-i1v1 | i1v0 | i2v0 | i0v1

-	fsub	%st(0),%st(3)	// i0v0 | i0v1-i1v1 | i1v0 | i0v0-i2v0 | i0v1

-	fildl	fv_v+4(%edx)	// i2v1 | i0v0 | i0v1-i1v1 | i1v0 | i0v0-i2v0| i0v1

-	fxch	%st(1)			// i0v0 | i2v1 | i0v1-i1v1 | i1v0 | i0v0-i2v0| i0v1

-	fsubp	%st(0),%st(3)	// i2v1 | i0v1-i1v1 | i0v0-i1v0 | i0v0-i2v0 | i0v1

-	fxch	%st(1)			// i0v1-i1v1 | i2v1 | i0v0-i1v0 | i0v0-i2v0 | i0v1

-	fmulp	%st(0),%st(3)	// i2v1 | i0v0-i1v0 | i0v1-i1v1*i0v0-i2v0 | i0v1

-	fsubrp	%st(0),%st(3)	// i0v0-i1v0 | i0v1-i1v1*i0v0-i2v0 | i0v1-i2v1

-	movl	fv_v+16(%ecx),%eax

-	andl	$0xFF00,%eax

-	fmulp	%st(0),%st(2)	// i0v1-i1v1*i0v0-i2v0 | i0v0-i1v0*i0v1-i2v1

-	addl	C(acolormap),%eax

-	fsubp	%st(0),%st(1)	// (i0v1-i1v1)*(i0v0-i2v0)-(i0v0-i1v0)*(i0v1-i2v1)

-	movl	%eax,C(d_pcolormap)

-	fstps	Ltemp	// store the result as a single-precision float so its bit pattern can be tested with integer ops

-	movl	Ltemp,%eax

-	subl	$0x80000001,%eax	// borrow occurs for bit patterns below 0x80000001: sign bit clear, or exactly -0.0

-	jc		Lskip	// i.e. result >= 0: skip this triangle

-//		if (ptri[i].facesfront)

-//		{

-//			D_PolysetRecursiveTriangle(index0->v, index1->v, index2->v);

-	movl	mtri_facesfront-16(%ebx,%ebp,),%eax

-	testl	%eax,%eax

-	jz		Lfacesback

-	pushl	%edx

-	pushl	%esi

-	pushl	%ecx

-	call	C(D_PolysetRecursiveTriangle)	// callee pops its own arguments (ret $12), so no stack cleanup here

-	subl	$16,%ebp

-	jnz		Llooptop

-	jmp		Ldone2

-//		}

-//		else

-//		{

-Lfacesback:

-//			s0 = index0->v[2];

-//			s1 = index1->v[2];

-//			s2 = index2->v[2];

-	movl	fv_v+8(%ecx),%eax

-	pushl	%eax	// s0

-	movl	fv_v+8(%esi),%eax

-	pushl	%eax	// s1

-	movl	fv_v+8(%edx),%eax

-	pushl	%eax	// s2

-	pushl	%ecx	// index0 and index2 are saved on the stack across the call; index1 stays in %esi, which the callee preserves

-	pushl	%edx

-//			if (index0->flags & ALIAS_ONSEAM)

-//				index0->v[2] += r_affinetridesc.seamfixupX16;

-	movl	C(r_affinetridesc)+atd_seamfixupX16,%eax

-	testl	$(ALIAS_ONSEAM),fv_flags(%ecx)

-	jz		Lp11

-	addl	%eax,fv_v+8(%ecx)

-Lp11:

-//			if (index1->flags & ALIAS_ONSEAM)

-//				index1->v[2] += r_affinetridesc.seamfixupX16;

-	testl	$(ALIAS_ONSEAM),fv_flags(%esi)

-	jz		Lp12

-	addl	%eax,fv_v+8(%esi)

-Lp12:

-//			if (index2->flags & ALIAS_ONSEAM)

-//				index2->v[2] += r_affinetridesc.seamfixupX16;

-	testl	$(ALIAS_ONSEAM),fv_flags(%edx)

-	jz		Lp13

-	addl	%eax,fv_v+8(%edx)

-Lp13:

-//			D_PolysetRecursiveTriangle(index0->v, index1->v, index2->v);

-	pushl	%edx

-	pushl	%esi

-	pushl	%ecx

-	call	C(D_PolysetRecursiveTriangle)

-//			index0->v[2] = s0;

-//			index1->v[2] = s1;

-//			index2->v[2] = s2;

-	popl	%edx	// index2

-	popl	%ecx	// index0

-	popl	%eax	// s2

-	movl	%eax,fv_v+8(%edx)

-	popl	%eax	// s1

-	movl	%eax,fv_v+8(%esi)

-	popl	%eax	// s0

-	movl	%eax,fv_v+8(%ecx)

-//		}

-//	}

-Lskip:

-	subl	$16,%ebp	// step down one triangle; NOTE(review): the count is only tested here, after the body, so numtriangles is assumed to be >= 1

-	jnz		Llooptop

-Ldone2:

-	popl	%edi				// restore the caller's stack frame

-	popl	%ebx

-	popl	%esi				// restore register variables

-	popl	%ebp

-	addl	$(SPAN_SIZE),%esp	// release the span buffer reserved on entry

-	ret

-//----------------------------------------------------------------------

-// Alias model triangle left-edge scanning code; writes one spanpackage_t per scanline for "height" lines, stepping the d_* edge state after each

-//----------------------------------------------------------------------

-#define height	4+16

-.globl C(D_PolysetScanLeftEdge)

-C(D_PolysetScanLeftEdge):

-	pushl	%ebp				// preserve caller stack frame pointer

-	pushl	%esi				// preserve register variables

-	pushl	%edi

-	pushl	%ebx	// four saved registers: this is the +16 in the height offset

-	movl	height(%esp),%eax

-	movl	C(d_sfrac),%ecx

-	andl	$0xFFFF,%eax	// only the low 16 bits of height are used as the loop count

-	movl	C(d_ptex),%ebx

-	orl		%eax,%ecx	// presumably d_sfrac has its low word clear on entry, so the count occupies it cleanly -- TODO confirm against caller

-	movl	C(d_pedgespanpackage),%esi

-	movl	C(d_tfrac),%edx

-	movl	C(d_light),%edi

-	movl	C(d_zi),%ebp

-// %eax: scratch

-// %ebx: d_ptex

-// %ecx: d_sfrac in high word, count in low word

-// %edx: d_tfrac

-// %esi: d_pedgespanpackage, errorterm, scratch alternately

-// %edi: d_light

-// %ebp: d_zi

-//	do

-//	{

-LScanLoop:

-//		d_pedgespanpackage->ptex = ptex;

-//		d_pedgespanpackage->pdest = d_pdest;

-//		d_pedgespanpackage->pz = d_pz;

-//		d_pedgespanpackage->count = d_aspancount;

-//		d_pedgespanpackage->light = d_light;

-//		d_pedgespanpackage->zi = d_zi;

-//		d_pedgespanpackage->sfrac = d_sfrac << 16;

-//		d_pedgespanpackage->tfrac = d_tfrac << 16;

-	movl	%ebx,spanpackage_t_ptex(%esi)

-	movl	C(d_pdest),%eax

-	movl	%eax,spanpackage_t_pdest(%esi)

-	movl	C(d_pz),%eax

-	movl	%eax,spanpackage_t_pz(%esi)

-	movl	C(d_aspancount),%eax

-	movl	%eax,spanpackage_t_count(%esi)

-	movl	%edi,spanpackage_t_light(%esi)

-	movl	%ebp,spanpackage_t_zi(%esi)

-	movl	%ecx,spanpackage_t_sfrac(%esi)	// stored with the loop count still in the low word

-	movl	%edx,spanpackage_t_tfrac(%esi)

-// pretouch the next cache line

-	movb	spanpackage_t_size(%esi),%al	// value is discarded; this reads one byte of the following record

-//		d_pedgespanpackage++;

-	addl	$(spanpackage_t_size),%esi

-	movl	C(erroradjustup),%eax

-	movl	%esi,C(d_pedgespanpackage)	// spilled to memory because %esi is reused for errorterm below

-//		errorterm += erroradjustup;

-	movl	C(errorterm),%esi

-	addl	%eax,%esi

-	movl	C(d_pdest),%eax	// movl leaves the flags of the addl intact for the js below

-//		if (errorterm >= 0)

-//		{

-	js		LNoLeftEdgeTurnover

-//			errorterm -= erroradjustdown;

-//			d_pdest += d_pdestextrastep;

-	subl	C(erroradjustdown),%esi

-	addl	C(d_pdestextrastep),%eax

-	movl	%esi,C(errorterm)

-	movl	%eax,C(d_pdest)

-//			d_pz += d_pzextrastep;

-//			d_aspancount += d_countextrastep;

-//			d_ptex += d_ptexextrastep;

-//			d_sfrac += d_sfracextrastep;

-//			d_ptex += d_sfrac >> 16;

-//			d_sfrac &= 0xFFFF;

-//			d_tfrac += d_tfracextrastep;

-	movl	C(d_pz),%eax

-	movl	C(d_aspancount),%esi

-	addl	C(d_pzextrastep),%eax

-	addl	C(d_sfracextrastep),%ecx

-	adcl	C(d_ptexextrastep),%ebx	// the carry out of the sfrac add advances ptex by one extra texel

-	addl	C(d_countextrastep),%esi

-	movl	%eax,C(d_pz)

-	movl	C(d_tfracextrastep),%eax

-	movl	%esi,C(d_aspancount)

-	addl	%eax,%edx

-//			if (d_tfrac & 0x10000)

-//			{

-	jnc		LSkip1	// the carry out of the tfrac add plays the role of the 0x10000 test in the pseudo-C

-//				d_ptex += r_affinetridesc.skinwidth;

-//				d_tfrac &= 0xFFFF;

-	addl	C(r_affinetridesc)+atd_skinwidth,%ebx

-//			}

-LSkip1:

-//			d_light += d_lightextrastep;

-//			d_zi += d_ziextrastep;

-	addl	C(d_lightextrastep),%edi

-	addl	C(d_ziextrastep),%ebp

-//		}

-	movl	C(d_pedgespanpackage),%esi

-	decl	%ecx	// decrement the count held in the low word of %ecx

-	testl	$0xFFFF,%ecx

-	jnz		LScanLoop	// loop until the low-word count reaches zero

-	popl	%ebx

-	popl	%edi

-	popl	%esi

-	popl	%ebp

-	ret

-//		else

-//		{

-LNoLeftEdgeTurnover:

-	movl	%esi,C(errorterm)

-//			d_pdest += d_pdestbasestep;

-	addl	C(d_pdestbasestep),%eax

-	movl	%eax,C(d_pdest)

-//			d_pz += d_pzbasestep;

-//			d_aspancount += ubasestep;

-//			d_ptex += d_ptexbasestep;

-//			d_sfrac += d_sfracbasestep;

-//			d_ptex += d_sfrac >> 16;

-//			d_sfrac &= 0xFFFF;

-	movl	C(d_pz),%eax

-	movl	C(d_aspancount),%esi

-	addl	C(d_pzbasestep),%eax

-	addl	C(d_sfracbasestep),%ecx

-	adcl	C(d_ptexbasestep),%ebx	// the carry out of the sfrac add advances ptex by one extra texel

-	addl	C(ubasestep),%esi

-	movl	%eax,C(d_pz)

-	movl	%esi,C(d_aspancount)

-//			d_tfrac += d_tfracbasestep;

-	movl	C(d_tfracbasestep),%esi

-	addl	%esi,%edx

-//			if (d_tfrac & 0x10000)

-//			{

-	jnc		LSkip2

-//				d_ptex += r_affinetridesc.skinwidth;

-//				d_tfrac &= 0xFFFF;

-	addl	C(r_affinetridesc)+atd_skinwidth,%ebx

-//			}

-LSkip2:

-//			d_light += d_lightbasestep;

-//			d_zi += d_zibasestep;

-	addl	C(d_lightbasestep),%edi

-	addl	C(d_zibasestep),%ebp

-//		}

-//	} while (--height);

-	movl	C(d_pedgespanpackage),%esi

-	decl	%ecx

-	testl	$0xFFFF,%ecx

-	jnz		LScanLoop

-	popl	%ebx

-	popl	%edi

-	popl	%esi

-	popl	%ebp

-	ret

-//----------------------------------------------------------------------

-// Alias model vertex drawing code; plots each of numverts finalvert_t entries as one z-tested, colormapped pixel

-//----------------------------------------------------------------------

-#define fv			4+8

-#define	numverts	8+8

-.globl C(D_PolysetDrawFinalVerts)

-C(D_PolysetDrawFinalVerts):

-	pushl	%ebp				// preserve caller stack frame pointer

-	pushl	%ebx

-//	int		i, z;

-//	short	*zbuf;

-	movl	numverts(%esp),%ecx	// arguments are read after only two pushes, hence the +8 in fv/numverts

-	movl	fv(%esp),%ebx

-	pushl	%esi				// preserve register variables

-	pushl	%edi

-LFVLoop:

-//	for (i=0 ; i<numverts ; i++, fv++)

-//	{

-//	// valid triangle coordinates for filling can include the bottom and

-//	// right clip edges, due to the fill rule; these shouldn't be drawn

-//		if ((fv->v[0] < r_refdef.vrectright) &&

-//			(fv->v[1] < r_refdef.vrectbottom))

-//		{

-	movl	fv_v+0(%ebx),%eax	// %eax = fv->v[0], kept for the z-buffer and framebuffer indexing below

-	movl	C(r_refdef)+rd_vrectright,%edx

-	cmpl	%edx,%eax

-	jge		LNextVert

-	movl	fv_v+4(%ebx),%esi	// %esi = fv->v[1]

-	movl	C(r_refdef)+rd_vrectbottom,%edx

-	cmpl	%edx,%esi

-	jge		LNextVert

-//			zbuf = zspantable[fv->v[1]] + fv->v[0];

-	movl	C(zspantable)(,%esi,4),%edi

-//			z = fv->v[5]>>16;

-	movl	fv_v+20(%ebx),%edx

-	shrl	$16,%edx	// logical shift; only the low 16 bits (%dx) are used below

-//			if (z >= *zbuf)

-//			{

-//				int		pix;

-	cmpw	(%edi,%eax,2),%dx

-	jl		LNextVert	// signed 16-bit compare: skip when z < *zbuf

-//				*zbuf = z;

-	movw	%dx,(%edi,%eax,2)

-//				pix = skintable[fv->v[3]>>16][fv->v[2]>>16];

-	movl	fv_v+12(%ebx),%edi

-	shrl	$16,%edi

-	movl	C(skintable)(,%edi,4),%edi

-	movl	fv_v+8(%ebx),%edx

-	shrl	$16,%edx

-	movb	(%edi,%edx),%dl

-//				pix = ((byte *)acolormap)[pix + (fv->v[4] & 0xFF00)];

-	movl	fv_v+16(%ebx),%edi

-	andl	$0xFF00,%edi

-	andl	$0x00FF,%edx	// keep only the texel byte just loaded into %dl

-	addl	%edx,%edi

-	movl	C(acolormap),%edx

-	movb	(%edx,%edi,1),%dl

-//				d_viewbuffer[d_scantable[fv->v[1]] + fv->v[0]] = pix;

-	movl	C(d_scantable)(,%esi,4),%edi

-	movl	C(d_viewbuffer),%esi

-	addl	%eax,%edi

-	movb	%dl,(%esi,%edi)

-//			}

-//		}

-//	}

-LNextVert:

-	addl	$(fv_size),%ebx

-	decl	%ecx	// NOTE(review): the count is tested only after the body has run once, so numverts is assumed to be >= 1

-	jnz		LFVLoop

-	popl	%edi

-	popl	%esi

-	popl	%ebx

-	popl	%ebp

-	ret

-//----------------------------------------------------------------------

-// Alias model non-subdivided polygon dispatching code

-// for each triangle whose d_xdenom is negative, copies its three vertices to r_p0/r_p1/r_p2 and calls D_PolysetSetEdgeTable and D_RasterizeAliasPolySmooth

-// not C-callable because of stack buffer cleanup (it releases the SPAN_SIZE bytes that D_PolysetDraw reserved before jumping here)

-//----------------------------------------------------------------------

-.globl C(D_DrawNonSubdiv)

-C(D_DrawNonSubdiv):

-	pushl	%ebp				// preserve caller stack frame pointer

-	movl	C(r_affinetridesc)+atd_numtriangles,%ebp

-	pushl	%ebx

-	shll	$(mtri_shift),%ebp	// %ebp = byte offset just past the last triangle

-	pushl	%esi				// preserve register variables

-	movl	C(r_affinetridesc)+atd_ptriangles,%esi

-	pushl	%edi

-//	mtriangle_t		*ptri;

-//	finalvert_t		*pfv, *index0, *index1, *index2;

-//	int				i;

-//	int				lnumtriangles;

-//	pfv = r_affinetridesc.pfinalverts;

-//	ptri = r_affinetridesc.ptriangles;

-//	lnumtriangles = r_affinetridesc.numtriangles;

-LNDLoop:

-//	for (i=0 ; i<lnumtriangles ; i++, ptri++)

-//	{

-//		index0 = pfv + ptri->vertindex[0];

-//		index1 = pfv + ptri->vertindex[1];

-//		index2 = pfv + ptri->vertindex[2];

-	movl	C(r_affinetridesc)+atd_pfinalverts,%edi

-	movl	mtri_vertindex+0-mtri_size(%esi,%ebp,1),%ecx	// the -mtri_size selects the triangle just below offset %ebp, so triangles are visited last to first

-	shll	$(fv_shift),%ecx

-	movl	mtri_vertindex+4-mtri_size(%esi,%ebp,1),%edx

-	shll	$(fv_shift),%edx

-	movl	mtri_vertindex+8-mtri_size(%esi,%ebp,1),%ebx

-	shll	$(fv_shift),%ebx

-	addl	%edi,%ecx	// %ecx = index0

-	addl	%edi,%edx	// %edx = index1

-	addl	%edi,%ebx	// %ebx = index2

-//		d_xdenom = (index0->v[1]-index1->v[1]) *

-//				(index0->v[0]-index2->v[0]) -

-//				(index0->v[0]-index1->v[0])*(index0->v[1]-index2->v[1]);

-	movl	fv_v+4(%ecx),%eax

-	movl	fv_v+0(%ecx),%esi	// %esi is used as scratch from here on; it is reloaded at LNextTri

-	subl	fv_v+4(%edx),%eax

-	subl	fv_v+0(%ebx),%esi

-	imull	%esi,%eax

-	movl	fv_v+0(%ecx),%esi

-	movl	fv_v+4(%ecx),%edi

-	subl	fv_v+0(%edx),%esi

-	subl	fv_v+4(%ebx),%edi

-	imull	%esi,%edi

-	subl	%edi,%eax

-//		if (d_xdenom >= 0)

-//		{

-//			continue;

-	jns		LNextTri	// tests the sign of the subl result above

-//		}

-	movl	%eax,C(d_xdenom)

-	fildl	C(d_xdenom)	// integer d_xdenom is loaded onto the FPU stack; it is stored back as a float at LFacesFront

-//		r_p0[0] = index0->v[0];		// u

-//		r_p0[1] = index0->v[1];		// v

-//		r_p0[2] = index0->v[2];		// s

-//		r_p0[3] = index0->v[3];		// t

-//		r_p0[4] = index0->v[4];		// light

-//		r_p0[5] = index0->v[5];		// iz

-	movl	fv_v+0(%ecx),%eax

-	movl	fv_v+4(%ecx),%esi

-	movl	%eax,C(r_p0)+0

-	movl	%esi,C(r_p0)+4

-	movl	fv_v+8(%ecx),%eax

-	movl	fv_v+12(%ecx),%esi

-	movl	%eax,C(r_p0)+8

-	movl	%esi,C(r_p0)+12

-	movl	fv_v+16(%ecx),%eax

-	movl	fv_v+20(%ecx),%esi

-	movl	%eax,C(r_p0)+16

-	movl	%esi,C(r_p0)+20

-	fdivrs	float_1	// st(0) = float_1 / st(0); float_1 is defined elsewhere, presumably 1.0, giving 1/d_xdenom -- TODO confirm

-//		r_p1[0] = index1->v[0];

-//		r_p1[1] = index1->v[1];

-//		r_p1[2] = index1->v[2];

-//		r_p1[3] = index1->v[3];

-//		r_p1[4] = index1->v[4];

-//		r_p1[5] = index1->v[5];

-	movl	fv_v+0(%edx),%eax

-	movl	fv_v+4(%edx),%esi

-	movl	%eax,C(r_p1)+0

-	movl	%esi,C(r_p1)+4

-	movl	fv_v+8(%edx),%eax

-	movl	fv_v+12(%edx),%esi

-	movl	%eax,C(r_p1)+8

-	movl	%esi,C(r_p1)+12

-	movl	fv_v+16(%edx),%eax

-	movl	fv_v+20(%edx),%esi

-	movl	%eax,C(r_p1)+16

-	movl	%esi,C(r_p1)+20

-//		r_p2[0] = index2->v[0];

-//		r_p2[1] = index2->v[1];

-//		r_p2[2] = index2->v[2];

-//		r_p2[3] = index2->v[3];

-//		r_p2[4] = index2->v[4];

-//		r_p2[5] = index2->v[5];

-	movl	fv_v+0(%ebx),%eax

-	movl	fv_v+4(%ebx),%esi

-	movl	%eax,C(r_p2)+0

-	movl	%esi,C(r_p2)+4

-	movl	fv_v+8(%ebx),%eax

-	movl	fv_v+12(%ebx),%esi

-	movl	%eax,C(r_p2)+8

-	movl	%esi,C(r_p2)+12

-	movl	fv_v+16(%ebx),%eax

-	movl	fv_v+20(%ebx),%esi

-	movl	%eax,C(r_p2)+16

-	movl	C(r_affinetridesc)+atd_ptriangles,%edi

-	movl	%esi,C(r_p2)+20

-	movl	mtri_facesfront-mtri_size(%edi,%ebp,1),%eax

-//		if (!ptri->facesfront)

-//		{

-	testl	%eax,%eax

-	jnz		LFacesFront

-//			if (index0->flags & ALIAS_ONSEAM)

-//				r_p0[2] += r_affinetridesc.seamfixupX16;

-	movl	fv_flags(%ecx),%eax

-	movl	fv_flags(%edx),%esi

-	movl	fv_flags(%ebx),%edi

-	testl	$(ALIAS_ONSEAM),%eax

-	movl	C(r_affinetridesc)+atd_seamfixupX16,%eax	// movl leaves the flags of the testl intact for the jz below

-	jz		LOnseamDone0

-	addl	%eax,C(r_p0)+8

-LOnseamDone0:

-//			if (index1->flags & ALIAS_ONSEAM)

-// 				r_p1[2] += r_affinetridesc.seamfixupX16;

-	testl	$(ALIAS_ONSEAM),%esi

-	jz		LOnseamDone1

-	addl	%eax,C(r_p1)+8

-LOnseamDone1:

-//			if (index2->flags & ALIAS_ONSEAM)

-//				r_p2[2] += r_affinetridesc.seamfixupX16;

-	testl	$(ALIAS_ONSEAM),%edi

-	jz		LOnseamDone2

-	addl	%eax,C(r_p2)+8

-LOnseamDone2:

-//		}

-LFacesFront:

-	fstps	C(d_xdenom)	// store the quotient as a single-precision float, replacing the integer value written earlier

-//		D_PolysetSetEdgeTable ();

-//		D_RasterizeAliasPolySmooth ();

-		call	C(D_PolysetSetEdgeTable)

-		call	C(D_RasterizeAliasPolySmooth)

-LNextTri:

-		movl	C(r_affinetridesc)+atd_ptriangles,%esi	// reload ptriangles: %esi was used as scratch in the loop body

-		subl	$16,%ebp	// NOTE(review): hard-coded 16 while the indexing above uses mtri_size/mtri_shift; correct only if mtri_size == 16 -- confirm

-		jnz		LNDLoop

-//	}

-	popl	%edi

-	popl	%esi

-	popl	%ebx

-	popl	%ebp

-	addl	$(SPAN_SIZE),%esp	// release the span buffer that D_PolysetDraw reserved before jumping here

-	ret

-#endif	// id386

--- a/d_scana.s

+++ /dev/null

@@ -1,70 +1,0 @@

-//

-// d_scana.s

-// x86 assembly-language turbulent texture mapping code

-//

-#include "asm_i386.h"

-#include "quakeasm.h"

-#include "asm_draw.h"

-#include "d_ifacea.h"

-#ifdef id386

-	.data

-	.text

-//----------------------------------------------------------------------

-// turbulent texture mapping code; draws r_turb_spancount pixels at r_turb_pdest, offsetting s and t through the r_turb_turb table, and writes the advanced pointer back to r_turb_pdest

-//----------------------------------------------------------------------

-	.align 4

-.globl C(D_DrawTurbulent8Span)

-C(D_DrawTurbulent8Span):

-	pushl	%ebp				// preserve caller's stack frame pointer

-	pushl	%esi				// preserve register variables

-	pushl	%edi

-	pushl	%ebx

-	movl	C(r_turb_s),%esi	// %esi = s

-	movl	C(r_turb_t),%ecx	// %ecx = t

-	movl	C(r_turb_pdest),%edi

-	movl	C(r_turb_spancount),%ebx

-Llp:

-	movl	%ecx,%eax

-	movl	%esi,%edx

-	sarl	$16,%eax

-	movl	C(r_turb_turb),%ebp

-	sarl	$16,%edx

-	andl	$(CYCLE-1),%eax	// %eax = (t>>16) & (CYCLE-1)

-	andl	$(CYCLE-1),%edx	// %edx = (s>>16) & (CYCLE-1)

-	movl	(%ebp,%eax,4),%eax

-	movl	(%ebp,%edx,4),%edx

-	addl	%esi,%eax	// s + turb[t index]

-	sarl	$16,%eax

-	addl	%ecx,%edx	// t + turb[s index]

-	sarl	$16,%edx

-	andl	$(TURB_TEX_SIZE-1),%eax

-	andl	$(TURB_TEX_SIZE-1),%edx

-	shll	$6,%edx	// row offset = t coordinate * 64; presumably TURB_TEX_SIZE is 64 -- TODO confirm

-	movl	C(r_turb_pbase),%ebp

-	addl	%eax,%edx

-	incl	%edi

-	addl	C(r_turb_sstep),%esi

-	addl	C(r_turb_tstep),%ecx

-	movb	(%ebp,%edx,1),%dl

-	decl	%ebx

-	movb	%dl,-1(%edi)	// -1 because %edi was already incremented above; movb leaves the decl flags intact for the jnz

-	jnz		Llp	// NOTE(review): the count is tested only after the body has run once, so r_turb_spancount is assumed to be >= 1

-	movl	%edi,C(r_turb_pdest)

-	popl	%ebx				// restore register variables

-	popl	%edi

-	popl	%esi

-	popl	%ebp				// restore caller's stack frame pointer

-	ret

-#endif	// id386

--- a/d_spr8.s

+++ /dev/null

@@ -1,881 +1,0 @@

-//

-// d_spr8.s

-// x86 assembly-language horizontal 8-bpp transparent span-drawing code.

-//

-#include "asm_i386.h"

-#include "quakeasm.h"

-#include "asm_draw.h"

-#ifdef id386

-//----------------------------------------------------------------------

-// 8-bpp horizontal span drawing code for polygons, with transparency.

-//----------------------------------------------------------------------

-	.text

-// out-of-line, rarely-needed clamping code; each stub loads the clamped value and jumps back to its LClampReentryN label in D_SpriteDrawSpans

-LClampHigh0:

-	movl	C(bbextents),%esi		// s above the extent: clamp to bbextents

-	jmp		LClampReentry0

-LClampHighOrLow0:				// reached via ja after cmpl bbextents,s (unsigned: too big OR negative)

-	jg		LClampHigh0			// same flags re-tested as signed: greater means genuinely too high

-	xorl	%esi,%esi			// otherwise s was negative: clamp to 0

-	jmp		LClampReentry0

-LClampHigh1:

-	movl	C(bbextentt),%edx		// t above the extent: clamp to bbextentt

-	jmp		LClampReentry1

-LClampHighOrLow1:				// reached via ja after cmpl bbextentt,t; the intervening movl does not touch flags

-	jg		LClampHigh1

-	xorl	%edx,%edx			// t was negative: clamp to 0

-	jmp		LClampReentry1

-LClampLow2:					// stubs 2/3 clamp snext/tnext for a full 8-pixel segment

-	movl	$2048,%ebp			// low limit for end-of-segment s is 2048, not 0

-	jmp		LClampReentry2

-LClampHigh2:

-	movl	C(bbextents),%ebp

-	jmp		LClampReentry2

-LClampLow3:

-	movl	$2048,%ecx			// low limit for end-of-segment t

-	jmp		LClampReentry3

-LClampHigh3:

-	movl	C(bbextentt),%ecx

-	jmp		LClampReentry3

-LClampLow4:					// stubs 4/5 clamp snext/tnext for the last (partial) segment

-	movl	$2048,%eax

-	jmp		LClampReentry4

-LClampHigh4:

-	movl	C(bbextents),%eax

-	jmp		LClampReentry4

-LClampLow5:

-	movl	$2048,%ebx

-	jmp		LClampReentry5

-LClampHigh5:

-	movl	C(bbextentt),%ebx

-	jmp		LClampReentry5

-#define pspans	4+16			// stack offset of the span-list argument: 4 (return address) + 16 (four pushed registers)

-	.align 4

-.globl C(D_SpriteDrawSpans)

-C(D_SpriteDrawSpans):			// one stack argument: pointer to the first sspan_t; walks the list until a negative count

-	pushl	%ebp				// preserve caller's stack frame

-	pushl	%edi

-	pushl	%esi				// preserve register variables

-	pushl	%ebx

-//

-// set up scaled-by-8 steps, for 8-long segments; also set up cacheblock

-// and span list pointers, and 1/z step in 0.32 fixed-point

-//

-// FIXME: any overlap from rearranging?

-	flds	C(d_sdivzstepu)

-	fmuls	fp_8				// d_sdivzstepu*8

-	movl	C(cacheblock),%edx

-	flds	C(d_tdivzstepu)

-	fmuls	fp_8				// d_tdivzstepu*8 | d_sdivzstepu*8

-	movl	pspans(%esp),%ebx	// point to the first span descriptor

-	flds	C(d_zistepu)

-	fmuls	fp_8				// d_zistepu*8 | d_tdivzstepu*8 | d_sdivzstepu*8

-	movl	%edx,pbase			// pbase = cacheblock

-	flds	C(d_zistepu)

-	fmuls	fp_64kx64k			// d_zistepu scaled for the integer 1/z step | zi*8 | tdivz*8 | sdivz*8

-	fxch	%st(3)				// sdivz*8 | zi*8 | tdivz*8 | scaled zi step

-	fstps	sdivz8stepu

-	fstps	zi8stepu

-	fstps	tdivz8stepu

-	fistpl	izistep				// FP stack is empty again after this store

-	movl	izistep,%eax

-	rorl	$16,%eax		// put upper 16 bits in low word

-	movl	sspan_t_count(%ebx),%ecx	// %ecx = pixel count of the first span

-	movl	%eax,izistep		// store the half-swapped step; the pixel code adds it with addl/adcl $0 and compares %bp against the z-buffer

-	cmpl	$0,%ecx

-	jle		LNextSpan			// first span has count <= 0: skip it (LNextSpan advances to the following descriptor)

-LSpanLoop:

-//

-// set up the initial s/z, t/z, and 1/z on the FP stack, and generate the

-// initial s and t values

-//

-// FIXME: pipeline FILD?

-	fildl	sspan_t_v(%ebx)

-	fildl	sspan_t_u(%ebx)

-	fld		%st(1)			// dv | du | dv

-	fmuls	C(d_sdivzstepv)	// dv*d_sdivzstepv | du | dv

-	fld		%st(1)			// du | dv*d_sdivzstepv | du | dv

-	fmuls	C(d_sdivzstepu)	// du*d_sdivzstepu | dv*d_sdivzstepv | du | dv

-	fld		%st(2)			// du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv

-	fmuls	C(d_tdivzstepu)	// du*d_tdivzstepu | du*d_sdivzstepu |

-							//  dv*d_sdivzstepv | du | dv

-	fxch	%st(1)			// du*d_sdivzstepu | du*d_tdivzstepu |

-							//  dv*d_sdivzstepv | du | dv

-	faddp	%st(0),%st(2)	// du*d_tdivzstepu |

-							//  du*d_sdivzstepu + dv*d_sdivzstepv | du | dv

-	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |

-							//  du*d_tdivzstepu | du | dv

-	fld		%st(3)			// dv | du*d_sdivzstepu + dv*d_sdivzstepv |

-							//  du*d_tdivzstepu | du | dv

-	fmuls	C(d_tdivzstepv)	// dv*d_tdivzstepv |

-							//  du*d_sdivzstepu + dv*d_sdivzstepv |

-							//  du*d_tdivzstepu | du | dv

-	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |

-							//  dv*d_tdivzstepv | du*d_tdivzstepu | du | dv

-	fadds	C(d_sdivzorigin) // sdivz = d_sdivzorigin + dv*d_sdivzstepv +

-							//  du*d_sdivzstepu; stays in %st(2) at end

-	fxch	%st(4)			// dv | dv*d_tdivzstepv | du*d_tdivzstepu | du |

-							//  s/z

-	fmuls	C(d_zistepv)		// dv*d_zistepv | dv*d_tdivzstepv |

-							//  du*d_tdivzstepu | du | s/z

-	fxch	%st(1)			// dv*d_tdivzstepv |  dv*d_zistepv |

-							//  du*d_tdivzstepu | du | s/z

-	faddp	%st(0),%st(2)	// dv*d_zistepv |

-							//  dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z

-	fxch	%st(2)			// du | dv*d_tdivzstepv + du*d_tdivzstepu |

-							//  dv*d_zistepv | s/z

-	fmuls	C(d_zistepu)		// du*d_zistepu |

-							//  dv*d_tdivzstepv + du*d_tdivzstepu |

-							//  dv*d_zistepv | s/z

-	fxch	%st(1)			// dv*d_tdivzstepv + du*d_tdivzstepu |

-							//  du*d_zistepu | dv*d_zistepv | s/z

-	fadds	C(d_tdivzorigin)	// tdivz = d_tdivzorigin + dv*d_tdivzstepv +

-							//  du*d_tdivzstepu; stays in %st(1) at end

-	fxch	%st(2)			// dv*d_zistepv | du*d_zistepu | t/z | s/z

-	faddp	%st(0),%st(1)	// dv*d_zistepv + du*d_zistepu | t/z | s/z

-	flds	fp_64k			// fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z

-	fxch	%st(1)			// dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z

-	fadds	C(d_ziorigin)		// zi = d_ziorigin + dv*d_zistepv +

-							//  du*d_zistepu; stays in %st(0) at end

-							// 1/z | fp_64k | t/z | s/z

-	fld		%st(0)			// FIXME: get rid of stall on FMUL?

-	fmuls	fp_64kx64k

-	fxch	%st(1)

-//

-// calculate and clamp s & t

-//

-	fdivr	%st(0),%st(2)	// 1/z | z*64k | t/z | s/z

-	fxch	%st(1)

-	fistpl	izi				// 0.32 fixed-point 1/z

-	movl	izi,%ebp

-//

-// set pz to point to the first z-buffer pixel in the span

-//

-	rorl	$16,%ebp		// put upper 16 bits in low word

-	movl	sspan_t_v(%ebx),%eax

-	movl	%ebp,izi

-	movl	sspan_t_u(%ebx),%ebp

-	imull	C(d_zrowbytes)

-	shll	$1,%ebp					// a word per pixel

-	addl	C(d_pzbuffer),%eax

-	addl	%ebp,%eax

-	movl	%eax,pz

-//

-// point %edi to the first pixel in the span

-//

-	movl	C(d_viewbuffer),%ebp

-	movl	sspan_t_v(%ebx),%eax

-	pushl	%ebx		// preserve spans pointer

-	movl	C(tadjust),%edx

-	movl	C(sadjust),%esi

-	movl	C(d_scantable)(,%eax,4),%edi	// v * screenwidth

-	addl	%ebp,%edi

-	movl	sspan_t_u(%ebx),%ebp

-	addl	%ebp,%edi				// pdest = &pdestspan[scans->u];

-//

-// now start the FDIV for the end of the span

-//

-	cmpl	$8,%ecx

-	ja		LSetupNotLast1

-	decl	%ecx

-	jz		LCleanup1		// if only one pixel, no need to start an FDIV

-	movl	%ecx,spancountminus1

-// finish up the s and t calcs

-	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

-	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z

-	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z

-	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z

-	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z

-	fxch	%st(1)			// s | t | 1/z | t/z | s/z

-	fistpl	s				// 1/z | t | t/z | s/z

-	fistpl	t				// 1/z | t/z | s/z

-	fildl	spancountminus1

-	flds	C(d_tdivzstepu)	// _d_tdivzstepu | spancountminus1

-	flds	C(d_zistepu)	// _d_zistepu | _d_tdivzstepu | spancountminus1

-	fmul	%st(2),%st(0)	// _d_zistepu*scm1 | _d_tdivzstepu | scm1

-	fxch	%st(1)			// _d_tdivzstepu | _d_zistepu*scm1 | scm1

-	fmul	%st(2),%st(0)	// _d_tdivzstepu*scm1 | _d_zistepu*scm1 | scm1

-	fxch	%st(2)			// scm1 | _d_zistepu*scm1 | _d_tdivzstepu*scm1

-	fmuls	C(d_sdivzstepu)	// _d_sdivzstepu*scm1 | _d_zistepu*scm1 |

-							//  _d_tdivzstepu*scm1

-	fxch	%st(1)			// _d_zistepu*scm1 | _d_sdivzstepu*scm1 |

-							//  _d_tdivzstepu*scm1

-	faddp	%st(0),%st(3)	// _d_sdivzstepu*scm1 | _d_tdivzstepu*scm1

-	fxch	%st(1)			// _d_tdivzstepu*scm1 | _d_sdivzstepu*scm1

-	faddp	%st(0),%st(3)	// _d_sdivzstepu*scm1

-	faddp	%st(0),%st(3)

-	flds	fp_64k

-	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to

-							//  overlap

-	jmp		LFDIVInFlight1

-LCleanup1:

-// finish up the s and t calcs

-	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

-	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z

-	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z

-	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z

-	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z

-	fxch	%st(1)			// s | t | 1/z | t/z | s/z

-	fistpl	s				// 1/z | t | t/z | s/z

-	fistpl	t				// 1/z | t/z | s/z

-	jmp		LFDIVInFlight1

-	.align	4

-LSetupNotLast1:

-// finish up the s and t calcs

-	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

-	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z

-	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z

-	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z

-	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z

-	fxch	%st(1)			// s | t | 1/z | t/z | s/z

-	fistpl	s				// 1/z | t | t/z | s/z

-	fistpl	t				// 1/z | t/z | s/z

-	fadds	zi8stepu

-	fxch	%st(2)

-	fadds	sdivz8stepu

-	fxch	%st(2)

-	flds	tdivz8stepu

-	faddp	%st(0),%st(2)

-	flds	fp_64k

-	fdiv	%st(1),%st(0)	// z = 1/1/z

-							// this is what we've gone to all this trouble to

-							//  overlap

-LFDIVInFlight1:

-	addl	s,%esi

-	addl	t,%edx

-	movl	C(bbextents),%ebx

-	movl	C(bbextentt),%ebp

-	cmpl	%ebx,%esi

-	ja		LClampHighOrLow0

-LClampReentry0:

-	movl	%esi,s

-	movl	pbase,%ebx

-	shll	$16,%esi

-	cmpl	%ebp,%edx

-	movl	%esi,sfracf

-	ja		LClampHighOrLow1

-LClampReentry1:

-	movl	%edx,t

-	movl	s,%esi					// sfrac = scans->sfrac;

-	shll	$16,%edx

-	movl	t,%eax					// tfrac = scans->tfrac;

-	sarl	$16,%esi

-	movl	%edx,tfracf

-//

-// calculate the texture starting address

-//

-	sarl	$16,%eax

-	addl	%ebx,%esi

-	imull	C(cachewidth),%eax		// (tfrac >> 16) * cachewidth

-	addl	%eax,%esi				// psource = pbase + (sfrac >> 16) +

-									//           ((tfrac >> 16) * cachewidth);

-//

-// determine whether last span or not

-//

-	cmpl	$8,%ecx

-	jna		LLastSegment

-//

-// not the last segment; do full 8-wide segment

-//

-LNotLastSegment:

-//

-// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to

-// get there

-//

-// pick up after the FDIV that was left in flight previously

-	fld		%st(0)			// duplicate it

-	fmul	%st(4),%st(0)	// s = s/z * z

-	fxch	%st(1)

-	fmul	%st(3),%st(0)	// t = t/z * z

-	fxch	%st(1)

-	fistpl	snext

-	fistpl	tnext

-	movl	snext,%eax

-	movl	tnext,%edx

-	subl	$8,%ecx		// count off this segments' pixels

-	movl	C(sadjust),%ebp

-	pushl	%ecx		// remember count of remaining pixels

-	movl	C(tadjust),%ecx

-	addl	%eax,%ebp

-	addl	%edx,%ecx

-	movl	C(bbextents),%eax

-	movl	C(bbextentt),%edx

-	cmpl	$2048,%ebp

-	jl		LClampLow2

-	cmpl	%eax,%ebp

-	ja		LClampHigh2

-LClampReentry2:

-	cmpl	$2048,%ecx

-	jl		LClampLow3

-	cmpl	%edx,%ecx

-	ja		LClampHigh3

-LClampReentry3:

-	movl	%ebp,snext

-	movl	%ecx,tnext

-	subl	s,%ebp

-	subl	t,%ecx

-//

-// set up advancetable

-//

-	movl	%ecx,%eax

-	movl	%ebp,%edx

-	sarl	$19,%edx			// sstep >>= 16;

-	movl	C(cachewidth),%ebx

-	sarl	$19,%eax			// tstep >>= 16;

-	jz		LIsZero

-	imull	%ebx,%eax			// (tstep >> 16) * cachewidth;

-LIsZero:

-	addl	%edx,%eax			// add in sstep

-								// (tstep >> 16) * cachewidth + (sstep >> 16);

-	movl	tfracf,%edx

-	movl	%eax,advancetable+4	// advance base in t

-	addl	%ebx,%eax			// ((tstep >> 16) + 1) * cachewidth +

-								//  (sstep >> 16);

-	shll	$13,%ebp			// left-justify sstep fractional part

-	movl	%ebp,sstep

-	movl	sfracf,%ebx

-	shll	$13,%ecx			// left-justify tstep fractional part

-	movl	%eax,advancetable	// advance extra in t

-	movl	%ecx,tstep

-	movl	pz,%ecx

-	movl	izi,%ebp

-	cmpw	(%ecx),%bp

-	jl		Lp1

-	movb	(%esi),%al			// get first source texel

-	cmpb	$(TRANSPARENT_COLOR),%al

-	jz		Lp1

-	movw	%bp,(%ecx)

-	movb	%al,(%edi)			// store first dest pixel

-Lp1:

-	addl	izistep,%ebp

-	adcl	$0,%ebp

-	addl	tstep,%edx			// advance tfrac fractional part by tstep frac

-	sbbl	%eax,%eax			// turn tstep carry into -1 (0 if none)

-	addl	sstep,%ebx			// advance sfrac fractional part by sstep frac

-	adcl	advancetable+4(,%eax,4),%esi	// point to next source texel

-	cmpw	2(%ecx),%bp

-	jl		Lp2

-	movb	(%esi),%al

-	cmpb	$(TRANSPARENT_COLOR),%al

-	jz		Lp2

-	movw	%bp,2(%ecx)

-	movb	%al,1(%edi)

-Lp2:

-	addl	izistep,%ebp

-	adcl	$0,%ebp

-	addl	tstep,%edx

-	sbbl	%eax,%eax

-	addl	sstep,%ebx

-	adcl	advancetable+4(,%eax,4),%esi

-	cmpw	4(%ecx),%bp

-	jl		Lp3

-	movb	(%esi),%al

-	cmpb	$(TRANSPARENT_COLOR),%al

-	jz		Lp3

-	movw	%bp,4(%ecx)

-	movb	%al,2(%edi)

-Lp3:

-	addl	izistep,%ebp

-	adcl	$0,%ebp

-	addl	tstep,%edx

-	sbbl	%eax,%eax

-	addl	sstep,%ebx

-	adcl	advancetable+4(,%eax,4),%esi

-	cmpw	6(%ecx),%bp

-	jl		Lp4

-	movb	(%esi),%al

-	cmpb	$(TRANSPARENT_COLOR),%al

-	jz		Lp4

-	movw	%bp,6(%ecx)

-	movb	%al,3(%edi)

-Lp4:

-	addl	izistep,%ebp

-	adcl	$0,%ebp

-	addl	tstep,%edx

-	sbbl	%eax,%eax

-	addl	sstep,%ebx

-	adcl	advancetable+4(,%eax,4),%esi

-	cmpw	8(%ecx),%bp

-	jl		Lp5

-	movb	(%esi),%al

-	cmpb	$(TRANSPARENT_COLOR),%al

-	jz		Lp5

-	movw	%bp,8(%ecx)

-	movb	%al,4(%edi)

-Lp5:

-	addl	izistep,%ebp

-	adcl	$0,%ebp

-	addl	tstep,%edx

-	sbbl	%eax,%eax

-	addl	sstep,%ebx

-	adcl	advancetable+4(,%eax,4),%esi

-//

-// start FDIV for end of next segment in flight, so it can overlap

-//

-	popl	%eax

-	cmpl	$8,%eax			// more than one segment after this?

-	ja		LSetupNotLast2	// yes

-	decl	%eax

-	jz		LFDIVInFlight2	// if only one pixel, no need to start an FDIV

-	movl	%eax,spancountminus1

-	fildl	spancountminus1

-	flds	C(d_zistepu)		// _d_zistepu | spancountminus1

-	fmul	%st(1),%st(0)	// _d_zistepu*scm1 | scm1

-	flds	C(d_tdivzstepu)	// _d_tdivzstepu | _d_zistepu*scm1 | scm1

-	fmul	%st(2),%st(0)	// _d_tdivzstepu*scm1 | _d_zistepu*scm1 | scm1

-	fxch	%st(1)			// _d_zistepu*scm1 | _d_tdivzstepu*scm1 | scm1

-	faddp	%st(0),%st(3)	// _d_tdivzstepu*scm1 | scm1

-	fxch	%st(1)			// scm1 | _d_tdivzstepu*scm1

-	fmuls	C(d_sdivzstepu)	// _d_sdivzstepu*scm1 | _d_tdivzstepu*scm1

-	fxch	%st(1)			// _d_tdivzstepu*scm1 | _d_sdivzstepu*scm1

-	faddp	%st(0),%st(3)	// _d_sdivzstepu*scm1

-	flds	fp_64k			// 64k | _d_sdivzstepu*scm1

-	fxch	%st(1)			// _d_sdivzstepu*scm1 | 64k

-	faddp	%st(0),%st(4)	// 64k

-	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to

-							//  overlap

-	jmp		LFDIVInFlight2

-	.align	4

-LSetupNotLast2:

-	fadds	zi8stepu

-	fxch	%st(2)

-	fadds	sdivz8stepu

-	fxch	%st(2)

-	flds	tdivz8stepu

-	faddp	%st(0),%st(2)

-	flds	fp_64k

-	fdiv	%st(1),%st(0)	// z = 1/1/z

-							// this is what we've gone to all this trouble to

-							//  overlap

-LFDIVInFlight2:

-	pushl	%eax

-	cmpw	10(%ecx),%bp

-	jl		Lp6

-	movb	(%esi),%al

-	cmpb	$(TRANSPARENT_COLOR),%al

-	jz		Lp6

-	movw	%bp,10(%ecx)

-	movb	%al,5(%edi)

-Lp6:

-	addl	izistep,%ebp

-	adcl	$0,%ebp

-	addl	tstep,%edx

-	sbbl	%eax,%eax

-	addl	sstep,%ebx

-	adcl	advancetable+4(,%eax,4),%esi

-	cmpw	12(%ecx),%bp

-	jl		Lp7

-	movb	(%esi),%al

-	cmpb	$(TRANSPARENT_COLOR),%al

-	jz		Lp7

-	movw	%bp,12(%ecx)

-	movb	%al,6(%edi)

-Lp7:

-	addl	izistep,%ebp

-	adcl	$0,%ebp

-	addl	tstep,%edx

-	sbbl	%eax,%eax

-	addl	sstep,%ebx

-	adcl	advancetable+4(,%eax,4),%esi

-	cmpw	14(%ecx),%bp

-	jl		Lp8

-	movb	(%esi),%al

-	cmpb	$(TRANSPARENT_COLOR),%al

-	jz		Lp8

-	movw	%bp,14(%ecx)

-	movb	%al,7(%edi)

-Lp8:

-	addl	izistep,%ebp

-	adcl	$0,%ebp

-	addl	tstep,%edx

-	sbbl	%eax,%eax

-	addl	sstep,%ebx

-	adcl	advancetable+4(,%eax,4),%esi

-	addl	$8,%edi

-	addl	$16,%ecx

-	movl	%edx,tfracf

-	movl	snext,%edx

-	movl	%ebx,sfracf

-	movl	tnext,%ebx

-	movl	%edx,s

-	movl	%ebx,t

-	movl	%ecx,pz

-	movl	%ebp,izi

-	popl	%ecx				// retrieve count

-//

-// determine whether last span or not

-//

-	cmpl	$8,%ecx				// are there multiple segments remaining?

-	ja		LNotLastSegment		// yes

-//

-// last segment of scan

-//

-LLastSegment:

-//

-// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to

-// get there. The number of pixels left is variable, and we want to land on the

-// last pixel, not step one past it, so we can't run into arithmetic problems

-//

-	testl	%ecx,%ecx		// %ecx = pixels remaining minus one, i.e. the number of steps still to take

-	jz		LNoSteps		// just draw the last pixel and we're done

-// pick up after the FDIV that was left in flight previously

-	fld		%st(0)			// duplicate it

-	fmul	%st(4),%st(0)	// s = s/z * z

-	fxch	%st(1)

-	fmul	%st(3),%st(0)	// t = t/z * z

-	fxch	%st(1)

-	fistpl	snext

-	fistpl	tnext			// both copies of z are popped; 1/z | t/z | s/z remain

-	movl	C(tadjust),%ebx

-	movl	C(sadjust),%eax

-	addl	snext,%eax		// %eax = adjusted s at the end of the span

-	addl	tnext,%ebx		// %ebx = adjusted t at the end of the span

-	movl	C(bbextents),%ebp

-	movl	C(bbextentt),%edx

-	cmpl	$2048,%eax		// clamp s to [2048, bbextents] via the out-of-line stubs

-	jl		LClampLow4

-	cmpl	%ebp,%eax

-	ja		LClampHigh4

-LClampReentry4:

-	movl	%eax,snext

-	cmpl	$2048,%ebx		// clamp t to [2048, bbextentt]; the clamped t stays in %ebx and is not stored back to tnext

-	jl		LClampLow5

-	cmpl	%edx,%ebx

-	ja		LClampHigh5

-LClampReentry5:

-	cmpl	$1,%ecx			// don't bother

-	je		LOnlyOneStep	// if two pixels in segment, there's only one step,

-							//  of the segment length

-	subl	s,%eax			// %eax = snext - s

-	subl	t,%ebx			// %ebx = tnext - t

-	addl	%eax,%eax		// convert to 15.17 format so multiply by 1.31

-	addl	%ebx,%ebx		//  reciprocal yields 16.48

-	imull	reciprocal_table-8(,%ecx,4) // sstep = (snext - s) / (spancount-1)

-	movl	%edx,%ebp		// one-operand imull leaves the product in %edx:%eax; keep the high dword as sstep

-	movl	%ebx,%eax

-	imull	reciprocal_table-8(,%ecx,4) // tstep = (tnext - t) / (spancount-1)

-LSetEntryvec:					// here %ebp = sstep, %edx = tstep, %ecx = step count

-//

-// set up advancetable

-//

-	movl	spr8entryvec_table(,%ecx,4),%ebx	// entry label for this pixel count (index 1 is Spr8Entry2_8)

-	movl	%edx,%eax			// keep a copy of tstep for its fractional part

-	pushl	%ebx				// entry point into code for RET later

-	movl	%ebp,%ecx

-	sarl	$16,%ecx			// sstep >>= 16;

-	movl	C(cachewidth),%ebx

-	sarl	$16,%edx			// tstep >>= 16;

-	jz		LIsZeroLast			// flags come from the sarl: skip the multiply when the whole part of tstep is 0

-	imull	%ebx,%edx			// (tstep >> 16) * cachewidth;

-LIsZeroLast:

-	addl	%ecx,%edx			// add in sstep

-								// (tstep >> 16) * cachewidth + (sstep >> 16);

-	movl	tfracf,%ecx

-	movl	%edx,advancetable+4	// advance base in t

-	addl	%ebx,%edx			// ((tstep >> 16) + 1) * cachewidth +

-								//  (sstep >> 16);

-	shll	$16,%ebp			// left-justify sstep fractional part

-	movl	sfracf,%ebx

-	shll	$16,%eax			// left-justify tstep fractional part

-	movl	%edx,advancetable	// advance extra in t

-	movl	%eax,tstep

-	movl	%ebp,sstep

-	movl	%ecx,%edx			// %edx = tfracf, as the pixel code expects

-	movl	pz,%ecx				// %ecx = z-buffer pointer

-	movl	izi,%ebp			// %ebp = current 1/z

-	ret							// jump to the number-of-pixels handler

-//----------------------------------------

-LNoSteps:					// exactly one pixel left: no step setup needed

-	movl	pz,%ecx

-	subl	$7,%edi			// adjust for hardwired offset

-	subl	$14,%ecx		// LEndSpan addresses 7(%edi) and 14(%ecx)

-	jmp		LEndSpan

-LOnlyOneStep:				// two pixels left: the single step is the whole delta, no reciprocal multiply

-	subl	s,%eax

-	subl	t,%ebx

-	movl	%eax,%ebp		// sstep = snext - s

-	movl	%ebx,%edx		// tstep = tnext - t

-	jmp		LSetEntryvec

-//----------------------------------------

-.globl	Spr8Entry2_8			// Spr8EntryN_8 is reached by the ret in LSetEntryvec through spr8entryvec_table when N pixels remain

-Spr8Entry2_8:

-	subl	$6,%edi		// adjust for hardwired offsets

-	subl	$12,%ecx		// z-buffer entries are 2 bytes, so the z adjustment is twice the pixel adjustment

-	movb	(%esi),%al		// NOTE(review): looks redundant - LLEntry2_8 reloads %al after its z test, and sbbl overwrites %eax otherwise

-	jmp		LLEntry2_8

-//----------------------------------------

-.globl	Spr8Entry3_8

-Spr8Entry3_8:

-	subl	$5,%edi		// adjust for hardwired offsets

-	subl	$10,%ecx

-	jmp		LLEntry3_8

-//----------------------------------------

-.globl	Spr8Entry4_8

-Spr8Entry4_8:

-	subl	$4,%edi		// adjust for hardwired offsets

-	subl	$8,%ecx

-	jmp		LLEntry4_8

-//----------------------------------------

-.globl	Spr8Entry5_8

-Spr8Entry5_8:

-	subl	$3,%edi		// adjust for hardwired offsets

-	subl	$6,%ecx

-	jmp		LLEntry5_8

-//----------------------------------------

-.globl	Spr8Entry6_8

-Spr8Entry6_8:

-	subl	$2,%edi		// adjust for hardwired offsets

-	subl	$4,%ecx

-	jmp		LLEntry6_8

-//----------------------------------------

-.globl	Spr8Entry7_8

-Spr8Entry7_8:

-	decl	%edi		// adjust for hardwired offsets

-	subl	$2,%ecx

-	jmp		LLEntry7_8

-//----------------------------------------

-.globl	Spr8Entry8_8

-Spr8Entry8_8:

-	cmpw	(%ecx),%bp

-	jl		Lp9

-	movb	(%esi),%al

-	cmpb	$(TRANSPARENT_COLOR),%al

-	jz		Lp9

-	movw	%bp,(%ecx)

-	movb	%al,(%edi)

-Lp9:

-	addl	izistep,%ebp

-	adcl	$0,%ebp

-	addl	tstep,%edx

-	sbbl	%eax,%eax

-	addl	sstep,%ebx

-	adcl	advancetable+4(,%eax,4),%esi

-LLEntry7_8:

-	cmpw	2(%ecx),%bp

-	jl		Lp10

-	movb	(%esi),%al

-	cmpb	$(TRANSPARENT_COLOR),%al

-	jz		Lp10

-	movw	%bp,2(%ecx)

-	movb	%al,1(%edi)

-Lp10:

-	addl	izistep,%ebp

-	adcl	$0,%ebp

-	addl	tstep,%edx

-	sbbl	%eax,%eax

-	addl	sstep,%ebx

-	adcl	advancetable+4(,%eax,4),%esi

-LLEntry6_8:

-	cmpw	4(%ecx),%bp

-	jl		Lp11

-	movb	(%esi),%al

-	cmpb	$(TRANSPARENT_COLOR),%al

-	jz		Lp11

-	movw	%bp,4(%ecx)

-	movb	%al,2(%edi)

-Lp11:

-	addl	izistep,%ebp

-	adcl	$0,%ebp

-	addl	tstep,%edx

-	sbbl	%eax,%eax

-	addl	sstep,%ebx

-	adcl	advancetable+4(,%eax,4),%esi

-LLEntry5_8:

-	cmpw	6(%ecx),%bp

-	jl		Lp12

-	movb	(%esi),%al

-	cmpb	$(TRANSPARENT_COLOR),%al

-	jz		Lp12

-	movw	%bp,6(%ecx)

-	movb	%al,3(%edi)

-Lp12:

-	addl	izistep,%ebp

-	adcl	$0,%ebp

-	addl	tstep,%edx

-	sbbl	%eax,%eax

-	addl	sstep,%ebx

-	adcl	advancetable+4(,%eax,4),%esi

-LLEntry4_8:

-	cmpw	8(%ecx),%bp

-	jl		Lp13

-	movb	(%esi),%al

-	cmpb	$(TRANSPARENT_COLOR),%al

-	jz		Lp13

-	movw	%bp,8(%ecx)

-	movb	%al,4(%edi)

-Lp13:

-	addl	izistep,%ebp

-	adcl	$0,%ebp

-	addl	tstep,%edx

-	sbbl	%eax,%eax

-	addl	sstep,%ebx

-	adcl	advancetable+4(,%eax,4),%esi

-LLEntry3_8:

-	cmpw	10(%ecx),%bp

-	jl		Lp14

-	movb	(%esi),%al

-	cmpb	$(TRANSPARENT_COLOR),%al

-	jz		Lp14

-	movw	%bp,10(%ecx)

-	movb	%al,5(%edi)

-Lp14:

-	addl	izistep,%ebp

-	adcl	$0,%ebp

-	addl	tstep,%edx

-	sbbl	%eax,%eax

-	addl	sstep,%ebx

-	adcl	advancetable+4(,%eax,4),%esi

-LLEntry2_8:

-	cmpw	12(%ecx),%bp

-	jl		Lp15

-	movb	(%esi),%al

-	cmpb	$(TRANSPARENT_COLOR),%al

-	jz		Lp15

-	movw	%bp,12(%ecx)

-	movb	%al,6(%edi)

-Lp15:

-	addl	izistep,%ebp

-	adcl	$0,%ebp

-	addl	tstep,%edx

-	sbbl	%eax,%eax

-	addl	sstep,%ebx

-	adcl	advancetable+4(,%eax,4),%esi

-LEndSpan:						// final pixel of the span, at the hardwired offsets 7(%edi) / 14(%ecx)

-	cmpw	14(%ecx),%bp		// z test against the z-buffer entry (signed 16-bit compare)

-	jl		Lp16

-	movb	(%esi),%al		// load the texel for the last pixel of the span

-	cmpb	$(TRANSPARENT_COLOR),%al

-	jz		Lp16				// transparent texel: leave both the z-buffer and the pixel untouched

-	movw	%bp,14(%ecx)

-	movb	%al,7(%edi)

-Lp16:

-//

-// clear s/z, t/z, 1/z from FP stack

-//

-	fstp %st(0)

-	fstp %st(0)

-	fstp %st(0)

-	popl	%ebx				// restore spans pointer

-LNextSpan:

-	addl	$(sspan_t_size),%ebx // point to next span

-	movl	sspan_t_count(%ebx),%ecx

-	cmpl	$0,%ecx				// any more spans?

-	jg		LSpanLoop			// yes

-	jz		LNextSpan			// yes, but this one's empty

-	popl	%ebx				// negative count ends the list; restore register variables

-	popl	%esi

-	popl	%edi

-	popl	%ebp				// restore the caller's stack frame

-	ret

-#endif	// id386

--- a/d_varsa.s

+++ /dev/null

@@ -1,186 +1,0 @@

-//

-// d_varsa.s

-//

-#include "asm_i386.h"

-#include "quakeasm.h"

-#include "asm_draw.h"

-#include "d_ifacea.h"

-#ifdef	id386

-	.data

-//-------------------------------------------------------

-// global refresh variables

-//-------------------------------------------------------

-// FIXME: put all refresh variables into one contiguous block. Make into one

-// big structure, like cl or sv?

-	.align	4

-.globl	C(d_sdivzstepu)

-.globl	C(d_tdivzstepu)

-.globl	C(d_zistepu)

-.globl	C(d_sdivzstepv)

-.globl	C(d_tdivzstepv)

-.globl	C(d_zistepv)

-.globl	C(d_sdivzorigin)

-.globl	C(d_tdivzorigin)

-.globl	C(d_ziorigin)

-C(d_sdivzstepu):	.single	0

-C(d_tdivzstepu):	.single	0

-C(d_zistepu):		.single	0

-C(d_sdivzstepv):	.single	0

-C(d_tdivzstepv):	.single	0

-C(d_zistepv):		.single	0

-C(d_sdivzorigin):	.single	0

-C(d_tdivzorigin):	.single	0

-C(d_ziorigin):		.single	0

-.globl	C(sadjust)

-.globl	C(tadjust)

-.globl	C(bbextents)

-.globl	C(bbextentt)

-C(sadjust):			.long	0

-C(tadjust):			.long	0

-C(bbextents):		.long	0

-C(bbextentt):		.long	0

-.globl	C(cacheblock)

-.globl	C(d_viewbuffer)

-.globl	C(cachewidth)

-.globl	C(d_pzbuffer)

-.globl	C(d_zrowbytes)

-.globl	C(d_zwidth)

-C(cacheblock):		.long	0

-C(cachewidth):		.long	0

-C(d_viewbuffer):	.long	0

-C(d_pzbuffer):		.long	0

-C(d_zrowbytes):		.long	0

-C(d_zwidth):		.long	0

-//-------------------------------------------------------

-// ASM-only variables

-//-------------------------------------------------------

-.globl	izi

-izi:			.long	0

-.globl	pbase, s, t, sfracf, tfracf, snext, tnext

-.globl	spancountminus1, zi16stepu, sdivz16stepu, tdivz16stepu

-.globl	zi8stepu, sdivz8stepu, tdivz8stepu, pz

-s:				.long	0

-t:				.long	0

-snext:			.long	0

-tnext:			.long	0

-sfracf:			.long	0

-tfracf:			.long	0

-pbase:			.long	0

-zi8stepu:		.long	0

-sdivz8stepu:	.long	0

-tdivz8stepu:	.long	0

-zi16stepu:		.long	0

-sdivz16stepu:	.long	0

-tdivz16stepu:	.long	0

-spancountminus1: .long	0

-pz:				.long	0

-.globl	izistep

-izistep:				.long	0

-//-------------------------------------------------------

-// local variables for d_draw16.s

-//-------------------------------------------------------

-.globl	reciprocal_table_16, entryvec_table_16

-// 1/2, 1/3, 1/4, 1/5, 1/6, 1/7, 1/8, 1/9, 1/10, 1/11, 1/12, 1/13,

-// 1/14, and 1/15 in 1.31 fixed-point form (0x40000000 represents 1/2)

-reciprocal_table_16:	.long	0x40000000, 0x2aaaaaaa, 0x20000000

-						.long	0x19999999, 0x15555555, 0x12492492

-						.long	0x10000000, 0xe38e38e, 0xccccccc, 0xba2e8ba

-						.long	0xaaaaaaa, 0x9d89d89, 0x9249249, 0x8888888

-	.extern Entry2_16

-	.extern Entry3_16

-	.extern Entry4_16

-	.extern Entry5_16

-	.extern Entry6_16

-	.extern Entry7_16

-	.extern Entry8_16

-	.extern Entry9_16

-	.extern Entry10_16

-	.extern Entry11_16

-	.extern Entry12_16

-	.extern Entry13_16

-	.extern Entry14_16

-	.extern Entry15_16

-	.extern Entry16_16

-entryvec_table_16:	.long	0, Entry2_16, Entry3_16, Entry4_16

-					.long	Entry5_16, Entry6_16, Entry7_16, Entry8_16

-					.long	Entry9_16, Entry10_16, Entry11_16, Entry12_16

-					.long	Entry13_16, Entry14_16, Entry15_16, Entry16_16

-//-------------------------------------------------------

-// local variables for d_parta.s

-//-------------------------------------------------------

-.globl	DP_Count, DP_u, DP_v, DP_32768, DP_Color, DP_Pix, DP_EntryTable

-DP_Count:		.long	0

-DP_u:			.long	0

-DP_v:			.long	0

-DP_32768:		.single	32768.0

-DP_Color:		.long	0

-DP_Pix:			.long	0

-	.extern DP_1x1

-	.extern DP_2x2

-	.extern DP_3x3

-	.extern DP_4x4

-DP_EntryTable:	.long	DP_1x1, DP_2x2, DP_3x3, DP_4x4

-//

-// advancetable is 8 bytes; users address it as advancetable+4(,%eax,4) with

-// %eax = 0 or -1, i.e. from the middle of that range, so negative offsets will work

-//

-.globl	advancetable, sstep, tstep, pspantemp, counttemp, jumptemp

-advancetable:	.long	0, 0

-sstep:			.long	0

-tstep:			.long	0

-pspantemp:		.long	0

-counttemp:		.long	0

-jumptemp:		.long	0

-// 1/2, 1/3, 1/4, 1/5, 1/6, and 1/7 in 1.31 fixed-point form (0x40000000 represents 1/2)

-.globl	reciprocal_table, entryvec_table

-reciprocal_table:	.long	0x40000000, 0x2aaaaaaa, 0x20000000

-					.long	0x19999999, 0x15555555, 0x12492492

-	.extern Entry2_8

-	.extern Entry3_8

-	.extern Entry4_8

-	.extern Entry5_8

-	.extern Entry6_8

-	.extern Entry7_8

-	.extern Entry8_8

-entryvec_table:	.long	0, Entry2_8, Entry3_8, Entry4_8

-				.long	Entry5_8, Entry6_8, Entry7_8, Entry8_8

-	.extern Spr8Entry2_8

-	.extern Spr8Entry3_8

-	.extern Spr8Entry4_8

-	.extern Spr8Entry5_8

-	.extern Spr8Entry6_8

-	.extern Spr8Entry7_8

-	.extern Spr8Entry8_8

-.globl spr8entryvec_table

-spr8entryvec_table:	.long	0, Spr8Entry2_8, Spr8Entry3_8, Spr8Entry4_8

-					.long	Spr8Entry5_8, Spr8Entry6_8, Spr8Entry7_8, Spr8Entry8_8

-#endif	// id386

--- a/math.s

+++ /dev/null

@@ -1,399 +1,0 @@

-//

-// math.s

-// x86 assembly-language math routines.

-#define GLQUAKE	1	// don't include unneeded defs

-#include "asm_i386.h"

-#include "quakeasm.h"

-#ifdef	id386

-	.data

-	.align	4

-Ljmptab:	.long	Lcase0, Lcase1, Lcase2, Lcase3	// 8-entry jump table; BoxOnPlaneSide indexes it with p->signbits (jmp Ljmptab(,%eax,4))

-			.long	Lcase4, Lcase5, Lcase6, Lcase7	// there is no entry for an index of 8 or more

-	.text

-// TODO: rounding needed?

-// stack parameter offset

-#define	val	4

-.globl C(Invert24To16)	// returns (2^40 / val) in %eax, or 0xFFFFFFFF when val <= 0x100

-C(Invert24To16):

-	movl	val(%esp),%ecx	// %ecx = val (the only argument)

-	movl	$0x100,%edx		// 0x10000000000 as dividend

-	cmpl	%edx,%ecx

-	jle		LOutOfRange	// signed compare: rejects val <= 0x100; for 0 < val <= 0x100 the quotient would not fit in 32 bits, and val == 0 would divide by zero

-	subl	%eax,%eax	// low dword of the dividend = 0, so %edx:%eax = 0x100:00000000

-	divl	%ecx	// unsigned divide: %eax = quotient (truncated, see the rounding TODO above), %edx = remainder

-	ret

-LOutOfRange:

-	movl	$0xFFFFFFFF,%eax	// sentinel returned for rejected inputs

-	ret

-#define	in	4	// stack offsets of the two pointer arguments (no registers are pushed)

-#define out	8

-	.align 2

-.globl C(TransformVector)	// out[0] = in . vright, out[1] = in . vup, out[2] = in . vpn (three floats each)

-C(TransformVector):

-	movl	in(%esp),%eax	// %eax = in

-	movl	out(%esp),%edx	// %edx = out

-	flds	(%eax)		// in[0]

-	fmuls	C(vright)		// in[0]*vright[0]

-	flds	(%eax)		// in[0] | in[0]*vright[0]

-	fmuls	C(vup)		// in[0]*vup[0] | in[0]*vright[0]

-	flds	(%eax)		// in[0] | in[0]*vup[0] | in[0]*vright[0]

-	fmuls	C(vpn)		// in[0]*vpn[0] | in[0]*vup[0] | in[0]*vright[0]

-	flds	4(%eax)		// in[1] | ...

-	fmuls	C(vright)+4	// in[1]*vright[1] | ...

-	flds	4(%eax)		// in[1] | in[1]*vright[1] | ...

-	fmuls	C(vup)+4		// in[1]*vup[1] | in[1]*vright[1] | ...

-	flds	4(%eax)		// in[1] | in[1]*vup[1] | in[1]*vright[1] | ...

-	fmuls	C(vpn)+4		// in[1]*vpn[1] | in[1]*vup[1] | in[1]*vright[1] | ...

-	fxch	%st(2)		// in[1]*vright[1] | in[1]*vup[1] | in[1]*vpn[1] | ...

-	faddp	%st(0),%st(5)	// in[1]*vup[1] | in[1]*vpn[1] | ...

-	faddp	%st(0),%st(3)	// in[1]*vpn[1] | ...

-	faddp	%st(0),%st(1)	// vpn_accum | vup_accum | vright_accum

-	flds	8(%eax)		// in[2] | ...

-	fmuls	C(vright)+8	// in[2]*vright[2] | ...

-	flds	8(%eax)		// in[2] | in[2]*vright[2] | ...

-	fmuls	C(vup)+8		// in[2]*vup[2] | in[2]*vright[2] | ...

-	flds	8(%eax)		// in[2] | in[2]*vup[2] | in[2]*vright[2] | ...

-	fmuls	C(vpn)+8		// in[2]*vpn[2] | in[2]*vup[2] | in[2]*vright[2] | ...

-	fxch	%st(2)		// in[2]*vright[2] | in[2]*vup[2] | in[2]*vpn[2] | ...

-	faddp	%st(0),%st(5)	// in[2]*vup[2] | in[2]*vpn[2] | ...

-	faddp	%st(0),%st(3)	// in[2]*vpn[2] | ...

-	faddp	%st(0),%st(1)	// vpn_accum | vup_accum | vright_accum

-	fstps	8(%edx)		// out[2] = vpn_accum

-	fstps	4(%edx)		// out[1] = vup_accum

-	fstps	(%edx)		// out[0] = vright_accum; the FPU stack is empty again

-	ret

-//

-// BoxOnPlaneSide (emins, emaxs, p): entry and dispatch.

-//

-// Loads the three stack arguments, range-checks p->signbits and jumps through

-// Ljmptab to one of Lcase0..Lcase7.  On dispatch: %ecx = emins, %ebx = emaxs,

-// %edx = p, %eax = p->signbits (zero-extended), FPU stack =

-// p->normal[0] | p->normal[0].  The result is produced at LSetSides.

-//

-// stack parameter offsets; the extra 4 accounts for the pushl %ebx below

-#define EMINS	4+4

-#define EMAXS	4+8

-#define P		4+12

-	.align 2

-.globl C(BoxOnPlaneSide)

-C(BoxOnPlaneSide):

-	pushl	%ebx				// saved here, popped again at the end of LSetSides

-	movl	P(%esp),%edx

-	movl	EMINS(%esp),%ecx

-	xorl	%eax,%eax			// clear %eax so the byte load below is zero-extended

-	movl	EMAXS(%esp),%ebx

-	movb	pl_signbits(%edx),%al

-	cmpb	$8,%al

-// unsigned test: Ljmptab has only 8 entries.  The previous signed jge let

-// signbits values 0x80..0xFF (negative as signed bytes) through the check and

-// indexed far beyond the table.

-	jae		Lerror

-	flds	pl_normal(%edx)		// p->normal[0]

-	fld		%st(0)				// p->normal[0] | p->normal[0]

-	jmp		Ljmptab(,%eax,4)

-//dist1= p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];

-//dist2= p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];

-Lcase0:	// Ljmptab slot 0 (p->signbits == 0); %ebx = emaxs, %ecx = emins, %edx = p

-	fmuls	(%ebx)				// p->normal[0]*emaxs[0] | p->normal[0]

-	flds	pl_normal+4(%edx)	// p->normal[1] | p->normal[0]*emaxs[0] |

-								//  p->normal[0]

-	fxch	%st(2)				// p->normal[0] | p->normal[0]*emaxs[0] |

-								//  p->normal[1]

-	fmuls	(%ecx)				// p->normal[0]*emins[0] |

-								//  p->normal[0]*emaxs[0] | p->normal[1]

-	fxch	%st(2)				// p->normal[1] | p->normal[0]*emaxs[0] |

-								//  p->normal[0]*emins[0]

-	fld		%st(0)				// p->normal[1] | p->normal[1] |

-								//  p->normal[0]*emaxs[0] |

-								//  p->normal[0]*emins[0]

-	fmuls	4(%ebx)				// p->normal[1]*emaxs[1] | p->normal[1] |

-								//  p->normal[0]*emaxs[0] |

-								//  p->normal[0]*emins[0]

-	flds	pl_normal+8(%edx)	// p->normal[2] | p->normal[1]*emaxs[1] |

-								//  p->normal[1] | p->normal[0]*emaxs[0] |

-								//  p->normal[0]*emins[0]

-	fxch	%st(2)				// p->normal[1] | p->normal[1]*emaxs[1] |

-								//  p->normal[2] | p->normal[0]*emaxs[0] |

-								//  p->normal[0]*emins[0]

-	fmuls	4(%ecx)				// p->normal[1]*emins[1] |

-								//  p->normal[1]*emaxs[1] |

-								//  p->normal[2] | p->normal[0]*emaxs[0] |

-								//  p->normal[0]*emins[0]

-	fxch	%st(2)				// p->normal[2] | p->normal[1]*emaxs[1] |

-								//  p->normal[1]*emins[1] |

-								//  p->normal[0]*emaxs[0] |

-								//  p->normal[0]*emins[0]

-	fld		%st(0)				// p->normal[2] | p->normal[2] |

-								//  p->normal[1]*emaxs[1] |

-								//  p->normal[1]*emins[1] |

-								//  p->normal[0]*emaxs[0] |

-								//  p->normal[0]*emins[0]

-	fmuls	8(%ebx)				// p->normal[2]*emaxs[2] |

-								//  p->normal[2] |

-								//  p->normal[1]*emaxs[1] |

-								//  p->normal[1]*emins[1] |

-								//  p->normal[0]*emaxs[0] |

-								//  p->normal[0]*emins[0]

-	fxch	%st(5)				// p->normal[0]*emins[0] |

-								//  p->normal[2] |

-								//  p->normal[1]*emaxs[1] |

-								//  p->normal[1]*emins[1] |

-								//  p->normal[0]*emaxs[0] |

-								//  p->normal[2]*emaxs[2]

-	faddp	%st(0),%st(3)		//p->normal[2] |

-								// p->normal[1]*emaxs[1] |

-								// p->normal[1]*emins[1]+p->normal[0]*emins[0]|

-								// p->normal[0]*emaxs[0] |

-								// p->normal[2]*emaxs[2]

-	fmuls	8(%ecx)				//p->normal[2]*emins[2] |

-								// p->normal[1]*emaxs[1] |

-								// p->normal[1]*emins[1]+p->normal[0]*emins[0]|

-								// p->normal[0]*emaxs[0] |

-								// p->normal[2]*emaxs[2]

-	fxch	%st(1)				//p->normal[1]*emaxs[1] |

-								// p->normal[2]*emins[2] |

-								// p->normal[1]*emins[1]+p->normal[0]*emins[0]|

-								// p->normal[0]*emaxs[0] |

-								// p->normal[2]*emaxs[2]

-	faddp	%st(0),%st(3)		//p->normal[2]*emins[2] |

-								// p->normal[1]*emins[1]+p->normal[0]*emins[0]|

-								// p->normal[0]*emaxs[0]+p->normal[1]*emaxs[1]|

-								// p->normal[2]*emaxs[2]

-	fxch	%st(3)				//p->normal[2]*emaxs[2] |

-								// p->normal[1]*emins[1]+p->normal[0]*emins[0]|

-								// p->normal[0]*emaxs[0]+p->normal[1]*emaxs[1]|

-								// p->normal[2]*emins[2]

-	faddp	%st(0),%st(2)		//p->normal[1]*emins[1]+p->normal[0]*emins[0]|

-								// dist1 | p->normal[2]*emins[2]

-	jmp		LSetSides	// LSetSides adds the last emins term to finish dist2

-//dist1= p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];

-//dist2= p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];

-Lcase1:	// Ljmptab slot 1 (p->signbits == 1); entered with p->normal[0] | p->normal[0]

-	fmuls	(%ecx)				// emins[0]: term 0 of dist1

-	flds	pl_normal+4(%edx)	// p->normal[1]

-	fxch	%st(2)

-	fmuls	(%ebx)				// emaxs[0]: term 0 of dist2

-	fxch	%st(2)

-	fld		%st(0)

-	fmuls	4(%ebx)				// emaxs[1]: term 1 of dist1

-	flds	pl_normal+8(%edx)	// p->normal[2]

-	fxch	%st(2)

-	fmuls	4(%ecx)				// emins[1]: term 1 of dist2

-	fxch	%st(2)

-	fld		%st(0)

-	fmuls	8(%ebx)				// emaxs[2]: term 2 of dist1

-	fxch	%st(5)

-	faddp	%st(0),%st(3)	// dist2 terms 0+1

-	fmuls	8(%ecx)				// emins[2]: term 2 of dist2

-	fxch	%st(1)

-	faddp	%st(0),%st(3)	// dist1 terms 0+1

-	fxch	%st(3)

-	faddp	%st(0),%st(2)	// dist2 terms 0+1 | dist1 | dist2 term 2

-	jmp		LSetSides	// LSetSides adds the last dist2 term

-//dist1= p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];

-//dist2= p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];

-Lcase2:	// Ljmptab slot 2 (p->signbits == 2); entered with p->normal[0] | p->normal[0]

-	fmuls	(%ebx)				// emaxs[0]: term 0 of dist1

-	flds	pl_normal+4(%edx)	// p->normal[1]

-	fxch	%st(2)

-	fmuls	(%ecx)				// emins[0]: term 0 of dist2

-	fxch	%st(2)

-	fld		%st(0)

-	fmuls	4(%ecx)				// emins[1]: term 1 of dist1

-	flds	pl_normal+8(%edx)	// p->normal[2]

-	fxch	%st(2)

-	fmuls	4(%ebx)				// emaxs[1]: term 1 of dist2

-	fxch	%st(2)

-	fld		%st(0)

-	fmuls	8(%ebx)				// emaxs[2]: term 2 of dist1

-	fxch	%st(5)

-	faddp	%st(0),%st(3)	// dist2 terms 0+1

-	fmuls	8(%ecx)				// emins[2]: term 2 of dist2

-	fxch	%st(1)

-	faddp	%st(0),%st(3)	// dist1 terms 0+1

-	fxch	%st(3)

-	faddp	%st(0),%st(2)	// dist2 terms 0+1 | dist1 | dist2 term 2

-	jmp		LSetSides	// LSetSides adds the last dist2 term

-//dist1= p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];

-//dist2= p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];

-Lcase3:	// Ljmptab slot 3 (p->signbits == 3); entered with p->normal[0] | p->normal[0]

-	fmuls	(%ecx)				// emins[0]: term 0 of dist1

-	flds	pl_normal+4(%edx)	// p->normal[1]

-	fxch	%st(2)

-	fmuls	(%ebx)				// emaxs[0]: term 0 of dist2

-	fxch	%st(2)

-	fld		%st(0)

-	fmuls	4(%ecx)				// emins[1]: term 1 of dist1

-	flds	pl_normal+8(%edx)	// p->normal[2]

-	fxch	%st(2)

-	fmuls	4(%ebx)				// emaxs[1]: term 1 of dist2

-	fxch	%st(2)

-	fld		%st(0)

-	fmuls	8(%ebx)				// emaxs[2]: term 2 of dist1

-	fxch	%st(5)

-	faddp	%st(0),%st(3)	// dist2 terms 0+1

-	fmuls	8(%ecx)				// emins[2]: term 2 of dist2

-	fxch	%st(1)

-	faddp	%st(0),%st(3)	// dist1 terms 0+1

-	fxch	%st(3)

-	faddp	%st(0),%st(2)	// dist2 terms 0+1 | dist1 | dist2 term 2

-	jmp		LSetSides	// LSetSides adds the last dist2 term

-//dist1= p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];

-//dist2= p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];

-Lcase4:	// Ljmptab slot 4 (p->signbits == 4); entered with p->normal[0] | p->normal[0]

-	fmuls	(%ebx)				// emaxs[0]: term 0 of dist1

-	flds	pl_normal+4(%edx)	// p->normal[1]

-	fxch	%st(2)

-	fmuls	(%ecx)				// emins[0]: term 0 of dist2

-	fxch	%st(2)

-	fld		%st(0)

-	fmuls	4(%ebx)				// emaxs[1]: term 1 of dist1

-	flds	pl_normal+8(%edx)	// p->normal[2]

-	fxch	%st(2)

-	fmuls	4(%ecx)				// emins[1]: term 1 of dist2

-	fxch	%st(2)

-	fld		%st(0)

-	fmuls	8(%ecx)				// emins[2]: term 2 of dist1

-	fxch	%st(5)

-	faddp	%st(0),%st(3)	// dist2 terms 0+1

-	fmuls	8(%ebx)				// emaxs[2]: term 2 of dist2

-	fxch	%st(1)

-	faddp	%st(0),%st(3)	// dist1 terms 0+1

-	fxch	%st(3)

-	faddp	%st(0),%st(2)	// dist2 terms 0+1 | dist1 | dist2 term 2

-	jmp		LSetSides	// LSetSides adds the last dist2 term

-//dist1= p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];

-//dist2= p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];

-Lcase5:	// Ljmptab slot 5 (p->signbits == 5); entered with p->normal[0] | p->normal[0]

-	fmuls	(%ecx)				// emins[0]: term 0 of dist1

-	flds	pl_normal+4(%edx)	// p->normal[1]

-	fxch	%st(2)

-	fmuls	(%ebx)				// emaxs[0]: term 0 of dist2

-	fxch	%st(2)

-	fld		%st(0)

-	fmuls	4(%ebx)				// emaxs[1]: term 1 of dist1

-	flds	pl_normal+8(%edx)	// p->normal[2]

-	fxch	%st(2)

-	fmuls	4(%ecx)				// emins[1]: term 1 of dist2

-	fxch	%st(2)

-	fld		%st(0)

-	fmuls	8(%ecx)				// emins[2]: term 2 of dist1

-	fxch	%st(5)

-	faddp	%st(0),%st(3)	// dist2 terms 0+1

-	fmuls	8(%ebx)				// emaxs[2]: term 2 of dist2

-	fxch	%st(1)

-	faddp	%st(0),%st(3)	// dist1 terms 0+1

-	fxch	%st(3)

-	faddp	%st(0),%st(2)	// dist2 terms 0+1 | dist1 | dist2 term 2

-	jmp		LSetSides	// LSetSides adds the last dist2 term

-//dist1= p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];

-//dist2= p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];

-Lcase6:	// Ljmptab slot 6 (p->signbits == 6); entered with p->normal[0] | p->normal[0]

-	fmuls	(%ebx)				// emaxs[0]: term 0 of dist1

-	flds	pl_normal+4(%edx)	// p->normal[1]

-	fxch	%st(2)

-	fmuls	(%ecx)				// emins[0]: term 0 of dist2

-	fxch	%st(2)

-	fld		%st(0)

-	fmuls	4(%ecx)				// emins[1]: term 1 of dist1

-	flds	pl_normal+8(%edx)	// p->normal[2]

-	fxch	%st(2)

-	fmuls	4(%ebx)				// emaxs[1]: term 1 of dist2

-	fxch	%st(2)

-	fld		%st(0)

-	fmuls	8(%ecx)				// emins[2]: term 2 of dist1

-	fxch	%st(5)

-	faddp	%st(0),%st(3)	// dist2 terms 0+1

-	fmuls	8(%ebx)				// emaxs[2]: term 2 of dist2

-	fxch	%st(1)

-	faddp	%st(0),%st(3)	// dist1 terms 0+1

-	fxch	%st(3)

-	faddp	%st(0),%st(2)	// dist2 terms 0+1 | dist1 | dist2 term 2

-	jmp		LSetSides	// LSetSides adds the last dist2 term

-//dist1= p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];

-//dist2= p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];

-Lcase7:	// Ljmptab slot 7 (p->signbits == 7); entered with p->normal[0] | p->normal[0]

-	fmuls	(%ecx)				// emins[0]: term 0 of dist1

-	flds	pl_normal+4(%edx)	// p->normal[1]

-	fxch	%st(2)

-	fmuls	(%ebx)				// emaxs[0]: term 0 of dist2

-	fxch	%st(2)

-	fld		%st(0)

-	fmuls	4(%ecx)				// emins[1]: term 1 of dist1

-	flds	pl_normal+8(%edx)	// p->normal[2]

-	fxch	%st(2)

-	fmuls	4(%ebx)				// emaxs[1]: term 1 of dist2

-	fxch	%st(2)

-	fld		%st(0)

-	fmuls	8(%ecx)				// emins[2]: term 2 of dist1

-	fxch	%st(5)

-	faddp	%st(0),%st(3)	// dist2 terms 0+1

-	fmuls	8(%ebx)				// emaxs[2]: term 2 of dist2

-	fxch	%st(1)

-	faddp	%st(0),%st(3)	// dist1 terms 0+1

-	fxch	%st(3)

-	faddp	%st(0),%st(2)	// dist2 terms 0+1 | dist1 | dist2 term 2; the last case falls through to the next label without a jmp

-LSetSides:	// common tail of Lcase0..Lcase7; entered with: dist2 terms 0+1 | dist1 | dist2 term 2, %edx = p

-//	sides = 0;

-//	if (dist1 >= p->dist)

-//		sides = 1;

-//	if (dist2 < p->dist)

-//		sides |= 2;

-	faddp	%st(0),%st(2)		// dist1 | dist2

-	fcomps	pl_dist(%edx)	// compare dist1 with p->dist and pop it, leaving dist2

-	xorl	%ecx,%ecx	// sides = 0

-	fnstsw	%ax	// FPU status word of the dist1 compare; %ah bit 0 is condition flag C0 (set when st(0) < operand)

-	fcomps	pl_dist(%edx)	// compare dist2 with p->dist and pop it; the FPU stack is now empty

-	andb	$1,%ah	// 1 if dist1 < p->dist

-	xorb	$1,%ah	// invert: 1 if dist1 >= p->dist

-	addb	%ah,%cl

-	fnstsw	%ax	// status word of the dist2 compare

-	andb	$1,%ah	// 1 if dist2 < p->dist

-	addb	%ah,%ah	// shift that flag up to the value 2

-	addb	%ah,%cl

-//	return sides;

-	popl	%ebx	// restore the register pushed at function entry

-	movl	%ecx,%eax	// return status

-	ret

-Lerror:	// reached from the entry code when p->signbits fails the range check

-	call	C(BOPS_Error)	// NOTE(review): no ret follows and %ebx is still pushed; presumably BOPS_Error never returns - confirm

-#endif	// id386

--- a/mkfile

+++ b/mkfile

@@ -80,31 +80,10 @@

 	snd_mem.o\

 	snd_mix.o\

 	snd_linux.o\

-	d_draw.o\

-	d_draw16.o\

-	d_parta.o\

-	d_polysa.o\

-	d_scana.o\

-	d_spr8.o\

-	d_varsa.o\

-	math.o\

-	r_aliasa.o\

-	r_drawa.o\

-	r_edgea.o\

-	r_varsa.o\

-	surf16.o\

-	surf8.o\

-	worlda.o\

-	r_aclipa.o\

-	snd_mixa.o\

-	#sys_dosa.o\

 HFILES=\

 	adivtab.h\

 	anorms.h\

-	asm_draw.h\

-	asm_i386.h\

-	block16.h\

 	bspfile.h\

 	cdaudio.h\

 	client.h\

@@ -113,7 +92,6 @@

 	console.h\

 	crc.h\

 	cvar.h\

-	d_ifacea.h\

 	d_iface.h\

 	d_local.h\

 	draw.h\

@@ -131,7 +109,6 @@

 	progdefs.h\

 	progs.h\

 	protocol.h\

-	quakeasm.h\

 	quakedef.h\

 	render.h\

 	r_local.h\

@@ -149,8 +126,3 @@

 	zone.h\

 <$PLAN9/src/mkone

-AS=gcc

-%.$O:	%.s

-	$AS $AFLAGS -o $target -c $stem.s

--- a/quakeasm.h

+++ /dev/null

@@ -1,248 +1,0 @@

-//

-// quakeasm.h: general asm header file

-//

-//#define GLQUAKE	1

-#ifdef __i386__

-#define id386

-#endif

-// !!! must be kept the same as in d_iface.h !!!

-#define TRANSPARENT_COLOR	255

-#ifndef GLQUAKE

-	.extern C(d_zistepu)

-	.extern C(d_pzbuffer)

-	.extern C(d_zistepv)

-	.extern C(d_zrowbytes)

-	.extern C(d_ziorigin)

-	.extern C(r_turb_s)

-	.extern C(r_turb_t)

-	.extern C(r_turb_pdest)

-	.extern C(r_turb_spancount)

-	.extern C(r_turb_turb)

-	.extern C(r_turb_pbase)

-	.extern C(r_turb_sstep)

-	.extern C(r_turb_tstep)

-	.extern	C(r_bmodelactive)

-	.extern	C(d_sdivzstepu)

-	.extern	C(d_tdivzstepu)

-	.extern	C(d_sdivzstepv)

-	.extern	C(d_tdivzstepv)

-	.extern	C(d_sdivzorigin)

-	.extern	C(d_tdivzorigin)

-	.extern	C(sadjust)

-	.extern	C(tadjust)

-	.extern	C(bbextents)

-	.extern	C(bbextentt)

-	.extern	C(cacheblock)

-	.extern	C(d_viewbuffer)

-	.extern	C(cachewidth)

-	.extern	C(d_pzbuffer)

-	.extern	C(d_zrowbytes)

-	.extern	C(d_zwidth)

-	.extern C(d_scantable)

-	.extern C(r_lightptr)

-	.extern C(r_numvblocks)

-	.extern C(prowdestbase)

-	.extern C(pbasesource)

-	.extern C(r_lightwidth)

-	.extern C(lightright)

-	.extern C(lightrightstep)

-	.extern C(lightdeltastep)

-	.extern C(lightdelta)

-	.extern C(lightright)

-	.extern C(lightdelta)

-	.extern C(sourcetstep)

-	.extern C(surfrowbytes)

-	.extern C(lightrightstep)

-	.extern C(lightdeltastep)

-	.extern C(r_sourcemax)

-	.extern C(r_stepback)

-	.extern C(colormap)

-	.extern C(blocksize)

-	.extern C(sourcesstep)

-	.extern C(lightleft)

-	.extern C(blockdivshift)

-	.extern C(blockdivmask)

-	.extern C(lightleftstep)

-	.extern C(r_origin)

-	.extern C(r_ppn)

-	.extern C(r_pup)

-	.extern C(r_pright)

-	.extern C(ycenter)

-	.extern C(xcenter)

-	.extern C(d_vrectbottom_particle)

-	.extern C(d_vrectright_particle)

-	.extern C(d_vrecty)

-	.extern C(d_vrectx)

-	.extern C(d_pix_shift)

-	.extern C(d_pix_min)

-	.extern C(d_pix_max)

-	.extern C(d_y_aspect_shift)

-	.extern C(screenwidth)

-	.extern C(r_leftclipped)

-	.extern C(r_leftenter)

-	.extern C(r_rightclipped)

-	.extern C(r_rightenter)

-	.extern C(modelorg)

-	.extern C(xscale)

-	.extern C(r_refdef)

-	.extern C(yscale)

-	.extern C(r_leftexit)

-	.extern C(r_rightexit)

-	.extern C(r_lastvertvalid)

-	.extern C(cacheoffset)

-	.extern C(newedges)

-	.extern C(removeedges)

-	.extern C(r_pedge)

-	.extern C(r_framecount)

-	.extern C(r_u1)

-	.extern C(r_emitted)

-	.extern C(edge_p)

-	.extern C(surface_p)

-	.extern C(surfaces)

-	.extern C(r_lzi1)

-	.extern C(r_v1)

-	.extern C(r_ceilv1)

-	.extern C(r_nearzi)

-	.extern C(r_nearzionly)

-	.extern C(edge_aftertail)

-	.extern C(edge_tail)

-	.extern C(current_iv)

-	.extern C(edge_head_u_shift20)

-	.extern C(span_p)

-	.extern C(edge_head)

-	.extern C(fv)

-	.extern C(edge_tail_u_shift20)

-	.extern C(r_apverts)

-	.extern C(r_anumverts)

-	.extern C(aliastransform)

-	.extern C(r_avertexnormals)

-	.extern C(r_plightvec)

-	.extern C(r_ambientlight)

-	.extern C(r_shadelight)

-	.extern C(aliasxcenter)

-	.extern C(aliasycenter)

-	.extern C(a_sstepxfrac)

-	.extern C(r_affinetridesc)

-	.extern C(acolormap)

-	.extern C(d_pcolormap)

-	.extern C(r_affinetridesc)

-	.extern C(d_sfrac)

-	.extern C(d_ptex)

-	.extern C(d_pedgespanpackage)

-	.extern C(d_tfrac)

-	.extern C(d_light)

-	.extern C(d_zi)

-	.extern C(d_pdest)

-	.extern C(d_pz)

-	.extern C(d_aspancount)

-	.extern C(erroradjustup)

-	.extern C(errorterm)

-	.extern C(d_xdenom)

-	.extern C(r_p0)

-	.extern C(r_p1)

-	.extern C(r_p2)

-	.extern C(a_tstepxfrac)

-	.extern C(r_sstepx)

-	.extern C(r_tstepx)

-	.extern C(a_ststepxwhole)

-	.extern C(zspantable)

-	.extern C(skintable)

-	.extern C(r_zistepx)

-	.extern C(erroradjustdown)

-	.extern C(d_countextrastep)

-	.extern C(ubasestep)

-	.extern C(a_ststepxwhole)

-	.extern C(a_tstepxfrac)

-	.extern C(r_lstepx)

-	.extern C(a_spans)

-	.extern C(erroradjustdown)

-	.extern C(d_pdestextrastep)

-	.extern C(d_pzextrastep)

-	.extern C(d_sfracextrastep)

-	.extern C(d_ptexextrastep)

-	.extern C(d_countextrastep)

-	.extern C(d_tfracextrastep)

-	.extern C(d_lightextrastep)

-	.extern C(d_ziextrastep)

-	.extern C(d_pdestbasestep)

-	.extern C(d_pzbasestep)

-	.extern C(d_sfracbasestep)

-	.extern C(d_ptexbasestep)

-	.extern C(ubasestep)

-	.extern C(d_tfracbasestep)

-	.extern C(d_lightbasestep)

-	.extern C(d_zibasestep)

-	.extern C(zspantable)

-	.extern C(r_lstepy)

-	.extern C(r_sstepy)

-	.extern C(r_tstepy)

-	.extern C(r_zistepy)

-	.extern C(D_PolysetSetEdgeTable)

-	.extern C(D_RasterizeAliasPolySmooth)

-	.extern float_point5

-	.extern Float2ToThe31nd

-	.extern izistep

-	.extern izi

-	.extern FloatMinus2ToThe31nd

-	.extern float_1

-	.extern float_particle_z_clip

-	.extern float_minus_1

-	.extern float_0

-	.extern fp_16

-	.extern fp_64k

-	.extern fp_1m

-	.extern fp_1m_minus_1

-	.extern fp_8

-	.extern entryvec_table

-	.extern advancetable

-	.extern sstep

-	.extern tstep

-	.extern pspantemp

-	.extern counttemp

-	.extern jumptemp

-	.extern reciprocal_table

-	.extern DP_Count

-	.extern DP_u

-	.extern DP_v

-	.extern DP_32768

-	.extern DP_Color

-	.extern DP_Pix

-	.extern DP_EntryTable

-	.extern	pbase

-	.extern s

-	.extern t

-	.extern sfracf

-	.extern tfracf

-	.extern snext

-	.extern tnext

-	.extern	spancountminus1

-	.extern zi16stepu

-	.extern sdivz16stepu

-	.extern tdivz16stepu

-	.extern	zi8stepu

-	.extern sdivz8stepu

-	.extern tdivz8stepu

-	.extern reciprocal_table_16

-	.extern entryvec_table_16

-	.extern ceil_cw

-	.extern single_cw

-	.extern fp_64kx64k

-	.extern pz

-	.extern spr8entryvec_table

-#endif

-	.extern C(snd_scaletable)

-	.extern C(paintbuffer)

-	.extern C(snd_linear_count)

-	.extern C(snd_p)

-	.extern C(snd_vol)

-	.extern C(snd_out)

-	.extern C(vright)

-	.extern C(vup)

-	.extern C(vpn)

-	.extern C(BOPS_Error)

--- a/r_aclipa.s

+++ /dev/null

@@ -1,197 +1,0 @@

-//

-// r_aclipa.s

-// x86 assembly-language Alias model clipping code.

-//

-#include "asm_i386.h"

-#include "quakeasm.h"

-#include "asm_draw.h"

-#include "d_ifacea.h"

-#ifdef id386

-	.data

-Ltemp0:	.long	0	// integer scratch: clip edge minus start coordinate (written in LDoForward)

-Ltemp1:	.long	0	// integer scratch: end minus start coordinate (written in LDoForward)

-	.text

-#define pfv0		8+4	// stack offsets of the arguments after the two pushes each entry point performs

-#define pfv1		8+8

-#define out			8+12

-.globl C(R_Alias_clip_bottom)	// interpolates six ints of fv_v between *pfv0 and *pfv1 at the aliasvrectbottom edge and stores them through out

-C(R_Alias_clip_bottom):

-	pushl	%esi

-	pushl	%edi

-	movl	pfv0(%esp),%esi

-	movl	pfv1(%esp),%edi

-	movl	C(r_refdef)+rd_aliasvrectbottom,%eax	// %eax = clip coordinate

-LDoForwardOrBackward:	// shared with R_Alias_clip_top; orders the two vertices by fv_v+4

-	movl	fv_v+4(%esi),%edx

-	movl	fv_v+4(%edi),%ecx

-	cmpl	%ecx,%edx

-	jl		LDoForward	// pfv0 has the smaller fv_v+4: keep the argument order

-	movl	fv_v+4(%esi),%ecx	// otherwise swap both the coordinates and the vertex pointers

-	movl	fv_v+4(%edi),%edx

-	movl	pfv0(%esp),%edi

-	movl	pfv1(%esp),%esi

-LDoForward:	// in: %edx = start coordinate, %ecx = end coordinate, %eax = clip coordinate, %esi/%edi = start/end vertex

-	subl	%edx,%ecx	// end - start

-	subl	%edx,%eax	// clip - start

-	movl	%ecx,Ltemp1

-	movl	%eax,Ltemp0

-	fildl	Ltemp1

-	fildl	Ltemp0

-	movl	out(%esp),%edx

-	movl	$2,%eax	// loop count: two passes of three ints each

-	fdivp	%st(0),%st(1)					// scale

-LDo3Forward:

-	fildl	fv_v+0(%esi)	// fv0v0 | scale

-	fildl	fv_v+0(%edi)	// fv1v0 | fv0v0 | scale

-	fildl	fv_v+4(%esi)	// fv0v1 | fv1v0 | fv0v0 | scale

-	fildl	fv_v+4(%edi)	// fv1v1 | fv0v1 | fv1v0 | fv0v0 | scale

-	fildl	fv_v+8(%esi)	// fv0v2 | fv1v1 | fv0v1 | fv1v0 | fv0v0 | scale

-	fildl	fv_v+8(%edi)	// fv1v2 | fv0v2 | fv1v1 | fv0v1 | fv1v0 | fv0v0 |

-							//  scale

-	fxch	%st(5)			// fv0v0 | fv0v2 | fv1v1 | fv0v1 | fv1v0 | fv1v2 |

-							//  scale

-	fsubr	%st(0),%st(4)	// fv0v0 | fv0v2 | fv1v1 | fv0v1 | fv1v0-fv0v0 |

-							//  fv1v2 | scale

-	fxch	%st(3)			// fv0v1 | fv0v2 | fv1v1 | fv0v0 | fv1v0-fv0v0 |

-							//  fv1v2 | scale

-	fsubr	%st(0),%st(2)	// fv0v1 | fv0v2 | fv1v1-fv0v1 | fv0v0 |

-							//  fv1v0-fv0v0 | fv1v2 | scale

-	fxch	%st(1)			// fv0v2 | fv0v1 | fv1v1-fv0v1 | fv0v0 |

-							//  fv1v0-fv0v0 | fv1v2 | scale

-	fsubr	%st(0),%st(5)	// fv0v2 | fv0v1 | fv1v1-fv0v1 | fv0v0 |

-							//  fv1v0-fv0v0 | fv1v2-fv0v2 | scale

-	fxch	%st(6)			// scale | fv0v1 | fv1v1-fv0v1 | fv0v0 |

-							//  fv1v0-fv0v0 | fv1v2-fv0v2 | fv0v2

-	fmul	%st(0),%st(4)	// scale | fv0v1 | fv1v1-fv0v1 | fv0v0 |

-							//  (fv1v0-fv0v0)*scale | fv1v2-fv0v2 | fv0v2

-	addl	$12,%edi	// advance all three pointers by three ints; the stores below compensate with -12

-	fmul	%st(0),%st(2)	// scale | fv0v1 | (fv1v1-fv0v1)*scale | fv0v0 |

-							//  (fv1v0-fv0v0)*scale | fv1v2-fv0v2 | fv0v2

-	addl	$12,%esi

-	addl	$12,%edx

-	fmul	%st(0),%st(5)	// scale | fv0v1 | (fv1v1-fv0v1)*scale | fv0v0 |

-							//  (fv1v0-fv0v0)*scale | (fv1v2-fv0v2)*scale |

-							//  fv0v2

-	fxch	%st(3)			// fv0v0 | fv0v1 | (fv1v1-fv0v1)*scale | scale |

-							//  (fv1v0-fv0v0)*scale | (fv1v2-fv0v2)*scale |

-							//  fv0v2

-	faddp	%st(0),%st(4)	// fv0v1 | (fv1v1-fv0v1)*scale | scale |

-							//  fv0v0+(fv1v0-fv0v0)*scale |

-							//  (fv1v2-fv0v2)*scale | fv0v2

-	faddp	%st(0),%st(1)	// fv0v1+(fv1v1-fv0v1)*scale | scale |

-							//  fv0v0+(fv1v0-fv0v0)*scale |

-							//  (fv1v2-fv0v2)*scale | fv0v2

-	fxch	%st(4)			// fv0v2 | scale | fv0v0+(fv1v0-fv0v0)*scale |

-							//  (fv1v2-fv0v2)*scale | fv0v1+(fv1v1-fv0v1)*scale

-	faddp	%st(0),%st(3)	// scale | fv0v0+(fv1v0-fv0v0)*scale |

-							//  fv0v2+(fv1v2-fv0v2)*scale |

-							//  fv0v1+(fv1v1-fv0v1)*scale

-	fxch	%st(1)			// fv0v0+(fv1v0-fv0v0)*scale | scale |

-							//  fv0v2+(fv1v2-fv0v2)*scale |

-							//  fv0v1+(fv1v1-fv0v1)*scale

-	fadds	float_point5	// add the external constant float_point5 to each result before the integer store

-	fxch	%st(3)			// fv0v1+(fv1v1-fv0v1)*scale | scale |

-							//  fv0v2+(fv1v2-fv0v2)*scale |

-							//  fv0v0+(fv1v0-fv0v0)*scale

-	fadds	float_point5

-	fxch	%st(2)			// fv0v2+(fv1v2-fv0v2)*scale | scale |

-							//  fv0v1+(fv1v1-fv0v1)*scale |

-							//  fv0v0+(fv1v0-fv0v0)*scale

-	fadds	float_point5

-	fxch	%st(3)			// fv0v0+(fv1v0-fv0v0)*scale | scale |

-							//  fv0v1+(fv1v1-fv0v1)*scale |

-							//  fv0v2+(fv1v2-fv0v2)*scale

-	fistpl	fv_v+0-12(%edx)	// scale | fv0v1+(fv1v1-fv0v1)*scale |

-							//  fv0v2+(fv1v2-fv0v2)*scale

-	fxch	%st(1)			// fv0v1+(fv1v1-fv0v1)*scale | scale |

-							//  fv0v2+(fv1v2-fv0v2)*scale

-	fistpl	fv_v+4-12(%edx)	// scale | fv0v2+(fv1v2-fv0v2)*scale

-	fxch	%st(1)			// fv0v2+(fv1v2-fv0v2)*sc | scale

-	fistpl	fv_v+8-12(%edx)	// scale

-	decl	%eax

-	jnz		LDo3Forward	// second pass handles the next three ints

-	fstp	%st(0)	// pop scale; the FPU stack is empty again

-	popl	%edi

-	popl	%esi

-	ret

-.globl C(R_Alias_clip_top)	// same as R_Alias_clip_bottom but clips at r_refdef's rd_aliasvrect+4 value

-C(R_Alias_clip_top):

-	pushl	%esi	// same two pushes as R_Alias_clip_bottom, so the pfv0/pfv1/out offsets stay valid

-	pushl	%edi

-	movl	pfv0(%esp),%esi

-	movl	pfv1(%esp),%edi

-	movl	C(r_refdef)+rd_aliasvrect+4,%eax	// %eax = clip coordinate

-	jmp		LDoForwardOrBackward	// the shared code pops %edi/%esi and returns to our caller

-.globl C(R_Alias_clip_right)	// clips at r_refdef's rd_aliasvrectright value, interpolating along fv_v+0

-C(R_Alias_clip_right):

-	pushl	%esi	// same two pushes as R_Alias_clip_bottom, so the pfv0/pfv1/out offsets stay valid

-	pushl	%edi

-	movl	pfv0(%esp),%esi

-	movl	pfv1(%esp),%edi

-	movl	C(r_refdef)+rd_aliasvrectright,%eax	// %eax = clip coordinate

-LRightLeftEntry:	// shared with R_Alias_clip_left

-	movl	fv_v+4(%esi),%edx

-	movl	fv_v+4(%edi),%ecx

-	cmpl	%ecx,%edx	// vertex order is still decided by fv_v+4 ...

-	movl	fv_v+0(%esi),%edx	// ... but the coordinates used for the scale are fv_v+0 (movl leaves the flags intact)

-	movl	fv_v+0(%edi),%ecx

-	jl		LDoForward2

-	movl	fv_v+0(%esi),%ecx	// swap coordinates and vertex pointers

-	movl	fv_v+0(%edi),%edx

-	movl	pfv0(%esp),%edi

-	movl	pfv1(%esp),%esi

-LDoForward2:

-	jmp		LDoForward	// continue in the shared interpolation code

-.globl C(R_Alias_clip_left)	// same as R_Alias_clip_right but clips at r_refdef's rd_aliasvrect+0 value

-C(R_Alias_clip_left):

-	pushl	%esi	// same two pushes as R_Alias_clip_bottom, so the pfv0/pfv1/out offsets stay valid

-	pushl	%edi

-	movl	pfv0(%esp),%esi

-	movl	pfv1(%esp),%edi

-	movl	C(r_refdef)+rd_aliasvrect+0,%eax	// %eax = clip coordinate

-	jmp		LRightLeftEntry

-#endif	// id386

--- a/r_aliasa.s

+++ /dev/null

@@ -1,218 +1,0 @@

-//

-// r_aliasa.s

-// x86 assembly-language Alias model transform and project code.

-//

-#include "asm_i386.h"

-#include "quakeasm.h"

-#include "asm_draw.h"

-#include "d_ifacea.h"

-#ifdef id386

-	.data

-Lfloat_1:	.single	1.0

-Ltemp:		.long	0	// scratch: holds lightcos as a float, then the integer shade term

-Lcoords:	.long	0, 0, 0	// only the low byte of each long is written below, so each fildl reads a value in 0..255

-	.text

-#define fv			12+4	// stack offsets of the two arguments after the three pushes below

-#define pstverts	12+8

-.globl C(R_AliasTransformAndProjectFinalVerts)

-C(R_AliasTransformAndProjectFinalVerts):

-	pushl	%ebp				// preserve caller's stack frame

-	pushl	%edi

-	pushl	%esi				// preserve register variables

-//	int			i, temp;

-//	float		lightcos, *plightnormal, zi;

-//	trivertx_t	*pverts;

-//	pverts = r_apverts;

-	movl	C(r_apverts),%esi

-//	for (i=0 ; i<r_anumverts ; i++, fv++, pverts++, pstverts++)

-//	{

-	movl	pstverts(%esp),%ebp

-	movl	fv(%esp),%edi

-	movl	C(r_anumverts),%ecx

-	subl	%edx,%edx	// clear %edx; the loop only ever writes %dl, so %edx stays a zero-extended byte

-Lloop:

-//	// transform and project

-//		zi = 1.0 / (DotProduct(pverts->v, aliastransform[2]) +

-//				aliastransform[2][3]);

-	movb	(%esi),%dl

-	movb	%dl,Lcoords

-	fildl	Lcoords				// v[0]

-	movb	1(%esi),%dl

-	movb	%dl,Lcoords+4

-	fildl	Lcoords+4			// v[1] | v[0]

-	movb	2(%esi),%dl

-	movb	%dl,Lcoords+8

-	fildl	Lcoords+8			// v[2] | v[1] | v[0]

-	fld		%st(2)				// v[0] | v[2] | v[1] | v[0]

-	fmuls	C(aliastransform)+32 // accum | v[2] | v[1] | v[0]

-	fld		%st(2)				// v[1] | accum | v[2] | v[1] | v[0]

-	fmuls	C(aliastransform)+36 // accum2 | accum | v[2] | v[1] | v[0]

-	fxch	%st(1)				// accum | accum2 | v[2] | v[1] | v[0]

-	fadds	C(aliastransform)+44 // accum | accum2 | v[2] | v[1] | v[0]

-	fld		%st(2)				// v[2] | accum | accum2 | v[2] | v[1] | v[0]

-	fmuls	C(aliastransform)+40 // accum3 | accum | accum2 | v[2] | v[1] |

-								 //  v[0]

-	fxch	%st(1)				// accum | accum3 | accum2 | v[2] | v[1] | v[0]

-	faddp	%st(0),%st(2)		// accum3 | accum | v[2] | v[1] | v[0]

-	movb	tv_lightnormalindex(%esi),%dl	// %edx = normal index (used by the leal below)

-	movl	stv_s(%ebp),%eax

-	movl	%eax,fv_v+8(%edi)

-	faddp	%st(0),%st(1)		// z | v[2] | v[1] | v[0]

-	movl	stv_t(%ebp),%eax

-	movl	%eax,fv_v+12(%edi)

-//	// lighting

-//		plightnormal = r_avertexnormals[pverts->lightnormalindex];

-	fdivrs	Lfloat_1			// zi | v[2] | v[1] | v[0]

-//		fv->v[2] = pstverts->s;

-//		fv->v[3] = pstverts->t;

-//		fv->flags = pstverts->onseam;

-	movl	stv_onseam(%ebp),%eax

-	movl	%eax,fv_flags(%edi)

-	movl	fv_size(%edi),%eax	// NOTE(review): %eax is overwritten by the leal below, so this and the next two loads only touch memory - presumably a cache pre-touch of the next elements; confirm

-	movl	stv_size(%ebp),%eax

-	movl	4(%esi),%eax

-	leal	(%edx,%edx,2),%eax	// index*3

-	fxch	%st(3)				// v[0] | v[2] | v[1] | zi

-//		lightcos = DotProduct (plightnormal, r_plightvec);

-	flds	C(r_avertexnormals)(,%eax,4)	// index*3 scaled by 4: each normal is three consecutive floats

-	fmuls	C(r_plightvec)

-	flds	C(r_avertexnormals)+4(,%eax,4)

-	fmuls	C(r_plightvec)+4

-	flds	C(r_avertexnormals)+8(,%eax,4)

-	fmuls	C(r_plightvec)+8

-	fxch	%st(1)

-	faddp	%st(0),%st(2)

-	fld		%st(2)				 // v[0] | laccum | laccum2 | v[0] | v[2] |

-								 //  v[1] | zi

-	fmuls	C(aliastransform)+0  // xaccum | laccum | laccum2 | v[0] | v[2] |

-								 //  v[1] | zi

-	fxch	%st(2)				 // laccum2 | laccum | xaccum | v[0] | v[2] |

-								 //  v[1] | zi

-	faddp	%st(0),%st(1)		 // laccum | xaccum | v[0] | v[2] | v[1] | zi

-//		temp = r_ambientlight;

-//		if (lightcos < 0)

-//		{

-	fsts	Ltemp	// store lightcos as a float without popping it

-	movl	C(r_ambientlight),%eax

-	movb	Ltemp+3,%dl	// top byte of the stored float, which holds its sign bit

-	testb	$0x80,%dl

-	jz		Lsavelight	// no need to clamp if only ambient lit, because

-						//  r_ambientlight is preclamped

-//			temp += (int)(r_shadelight * lightcos);

-	fmuls	C(r_shadelight)

-// FIXME: fast float->int conversion?

-	fistpl	Ltemp	// pops the shade term: xaccum | v[0] | v[2] | v[1] | zi

-	addl	Ltemp,%eax

-//		// clamp; because we limited the minimum ambient and shading light, we

-//		// don't have to clamp low light, just bright

-//			if (temp < 0)

-//				temp = 0;

-	jns		Lp1

-	subl	%eax,%eax

-//		}

-Lp1:

-//		fv->v[4] = temp;

-//

-//	// x, y, and z are scaled down by 1/2**31 in the transform, so 1/z is

-//	// scaled up by 1/2**31, and the scaling cancels out for x and y in the

-//	// projection

-//		fv->v[0] = ((DotProduct(pverts->v, aliastransform[0]) +

-//				aliastransform[0][3]) * zi) + aliasxcenter;

-//		fv->v[1] = ((DotProduct(pverts->v, aliastransform[1]) +

-//				aliastransform[1][3]) * zi) + aliasycenter;

-//		fv->v[5] = zi;

-	fxch	%st(1)				 // v[0] | xaccum | v[2] | v[1] | zi

-	fmuls	C(aliastransform)+16 // yaccum | xaccum | v[2] | v[1] | zi

-	fxch	%st(3)				 // v[1] | xaccum | v[2] | yaccum | zi

-	fld		%st(0)				 // v[1] | v[1] | xaccum | v[2] | yaccum | zi

-	fmuls	C(aliastransform)+4	 // xaccum2 | v[1] | xaccum | v[2] | yaccum |zi

-	fxch	%st(1)				 // v[1] | xaccum2 | xaccum | v[2] | yaccum |zi

-	movl	%eax,fv_v+16(%edi)	// fv->v[4] = temp

-	fmuls	C(aliastransform)+20 // yaccum2 | xaccum2 | xaccum | v[2] | yaccum|

-								 //  zi

-	fxch	%st(2)				 // xaccum | xaccum2 | yaccum2 | v[2] | yaccum|

-								 //  zi

-	fadds	C(aliastransform)+12 // xaccum | xaccum2 | yaccum2 | v[2] | yaccum|

-								 //  zi

-	fxch	%st(4)				 // yaccum | xaccum2 | yaccum2 | v[2] | xaccum|

-								 //  zi

-	fadds	C(aliastransform)+28 // yaccum | xaccum2 | yaccum2 | v[2] | xaccum|

-								 //  zi

-	fxch	%st(3)				 // v[2] | xaccum2 | yaccum2 | yaccum | xaccum|

-								 //  zi

-	fld		%st(0)				 // v[2] | v[2] | xaccum2 | yaccum2 | yaccum |

-								 //  xaccum | zi

-	fmuls	C(aliastransform)+8	 // xaccum3 | v[2] | xaccum2 | yaccum2 |yaccum|

-								 //  xaccum | zi

-	fxch	%st(1)				 // v[2] | xaccum3 | xaccum2 | yaccum2 |yaccum|

-								 //  xaccum | zi

-	fmuls	C(aliastransform)+24 // yaccum3 | xaccum3 | xaccum2 | yaccum2 |

-								 // yaccum | xaccum | zi

-	fxch	%st(5)				 // xaccum | xaccum3 | xaccum2 | yaccum2 |

-								 // yaccum | yaccum3 | zi

-	faddp	%st(0),%st(2)		 // xaccum3 | xaccum | yaccum2 | yaccum |

-								 //  yaccum3 | zi

-	fxch	%st(3)				 // yaccum | xaccum | yaccum2 | xaccum3 |

-								 //  yaccum3 | zi

-	faddp	%st(0),%st(2)		 // xaccum | yaccum | xaccum3 | yaccum3 | zi

-	addl	$(tv_size),%esi	// pverts++

-	faddp	%st(0),%st(2)		 // yaccum | x | yaccum3 | zi

-	faddp	%st(0),%st(2)		 // x | y | zi

-	addl	$(stv_size),%ebp	// pstverts++

-	fmul	%st(2),%st(0)		 // x/z | y | zi

-	fxch	%st(1)				 // y | x/z | zi

-	fmul	%st(2),%st(0)		 // y/z | x/z | zi

-	fxch	%st(1)				 // x/z | y/z | zi

-	fadds	C(aliasxcenter)		 // u | y/z | zi

-	fxch	%st(1)				 // y/z | u | zi

-	fadds	C(aliasycenter)		 // v | u | zi

-	fxch	%st(2)				 // zi | u | v

-// FIXME: fast float->int conversion?

-	fistpl	fv_v+20(%edi)		 // u | v

-	fistpl	fv_v+0(%edi)		 // v

-	fistpl	fv_v+4(%edi)	// FPU stack is empty again

-//	}

-	addl	$(fv_size),%edi	// fv++

-	decl	%ecx

-	jnz		Lloop	// NOTE(review): the count is tested only after the body, so r_anumverts == 0 is presumably never passed in - confirm

-	popl	%esi				// restore register variables

-	popl	%edi

-	popl	%ebp				// restore the caller's stack frame

-	ret

-Lsavelight:	// lightcos sign bit clear: keep temp = r_ambientlight

-	fstp	%st(0)	// discard lightcos so the stack matches the other path into Lp1

-	jmp		Lp1

-#endif	// id386

--- a/r_drawa.s

+++ /dev/null

@@ -1,819 +1,0 @@

-//

-// r_drawa.s

-// x86 assembly-language edge clipping and emission code

-//

-#include "asm_i386.h"

-#include "quakeasm.h"

-#include "asm_draw.h"

-#include "d_ifacea.h"

-#ifdef	id386

-// !!! if these are changed, they must be changed in r_draw.c too !!!

-#define FULLY_CLIPPED_CACHED	0x80000000

-#define FRAMECOUNT_MASK			0x7FFFFFFF

-	.data

-Ld0:			.single		0.0

-Ld1:			.single		0.0

-Lstack:			.long		0

-Lfp_near_clip:	.single		NEAR_CLIP

-Lceilv0:		.long		0

-Lv:				.long		0

-Lu0:			.long		0

-Lv0:			.long		0

-Lzi0:			.long		0

-	.text

-//----------------------------------------------------------------------

-// edge clipping code

-//----------------------------------------------------------------------

-#define pv0		4+12

-#define pv1		8+12

-#define clip	12+12

-	.align 4

-.globl C(R_ClipEdge)

-C(R_ClipEdge):

-	pushl	%esi				// preserve register variables

-	pushl	%edi

-	pushl	%ebx

-	movl	%esp,Lstack			// saved so Ldone can discard any clipvert temporaries

-//	float		d0, d1, f;

-//	mvertex_t	clipvert;

-	movl	clip(%esp),%ebx		// %ebx = clip (current clip plane, may be NULL)

-	movl	pv0(%esp),%esi		// %esi = pv0

-	movl	pv1(%esp),%edx		// %edx = pv1

-//	if (clip)

-//	{

-	testl	%ebx,%ebx

-	jz		Lemit

-//		do

-//		{

-Lcliploop:

-//			d0 = DotProduct (pv0->position, clip->normal) - clip->dist;

-//			d1 = DotProduct (pv1->position, clip->normal) - clip->dist;

-	flds	mv_position+0(%esi)

-	fmuls	cp_normal+0(%ebx)

-	flds	mv_position+4(%esi)

-	fmuls	cp_normal+4(%ebx)

-	flds	mv_position+8(%esi)

-	fmuls	cp_normal+8(%ebx)

-	fxch	%st(1)

-	faddp	%st(0),%st(2)		// d0mul2 | d0add0

-	flds	mv_position+0(%edx)

-	fmuls	cp_normal+0(%ebx)

-	flds	mv_position+4(%edx)

-	fmuls	cp_normal+4(%ebx)

-	flds	mv_position+8(%edx)

-	fmuls	cp_normal+8(%ebx)

-	fxch	%st(1)

-	faddp	%st(0),%st(2)		// d1mul2 | d1add0 | d0mul2 | d0add0

-	fxch	%st(3)				// d0add0 | d1add0 | d0mul2 | d1mul2

-	faddp	%st(0),%st(2)		// d1add0 | dot0 | d1mul2

-	faddp	%st(0),%st(2)		// dot0 | dot1

-	fsubs	cp_dist(%ebx)		// d0 | dot1

-	fxch	%st(1)				// dot1 | d0

-	fsubs	cp_dist(%ebx)		// d1 | d0

-	fxch	%st(1)				// d0 | d1

-	fstps	Ld0					// d1

-	fstps	Ld1					// (FP stack empty)

-//			if (d0 >= 0 && d1 >= 0), tested on the raw float bit patterns

-//			{

-	movl	Ld0,%eax

-	movl	Ld1,%ecx

-	orl		%eax,%ecx			// sign bit set if either float has its sign bit set

-	js		Lp2					// at least one point is on the clipped side

-// both points are unclipped

-Lcontinue:

-//

-//				R_ClipEdge (&clipvert, pv1, clip->next);

-//				return;

-//			}

-//		} while ((clip = clip->next) != NULL);

-	movl	cp_next(%ebx),%ebx

-	testl	%ebx,%ebx

-	jnz		Lcliploop

-//	}

-//// add the edge

-//	R_EmitEdge (pv0, pv1);

-Lemit:

-//

-// set integer rounding to ceil mode, set to single precision

-//

-// FIXME: do away with by manually extracting integers from floats?

-// FIXME: set less often

-	fldcw	ceil_cw

-//	edge_t	*edge, *pcheck;

-//	int		u_check;

-//	float	u, u_step;

-//	vec3_t	local, transformed;

-//	float	*world;

-//	int		v, v2, ceilv0;

-//	float	scale, lzi0, u0, v0;

-//	int		side;

-//	if (r_lastvertvalid)

-//	{

-	cmpl	$0,C(r_lastvertvalid)

-	jz		LCalcFirst

-//		u0 = r_u1;

-//		v0 = r_v1;

-//		lzi0 = r_lzi1;

-//		ceilv0 = r_ceilv1;

-	movl	C(r_lzi1),%eax

-	movl	C(r_u1),%ecx

-	movl	%eax,Lzi0

-	movl	%ecx,Lu0

-	movl	C(r_v1),%ecx

-	movl	C(r_ceilv1),%eax

-	movl	%ecx,Lv0

-	movl	%eax,Lceilv0

-	jmp		LCalcSecond

-//	}

-LCalcFirst:

-//	else

-//	{

-//		world = &pv0->position[0];

-	call	LTransformAndProject	// v0 | lzi0 | u0

-	fsts	Lv0

-	fxch	%st(2)					// u0 | lzi0 | v0

-	fstps	Lu0						// lzi0 | v0

-	fstps	Lzi0					// v0

-//		ceilv0 = (int)(v0 - 2000) + 2000; // ceil(v0);

-	fistpl	Lceilv0					// (FP stack empty); rounding set by ceil_cw above

-//	}

-LCalcSecond:

-//	world = &pv1->position[0];

-	movl	%edx,%esi				// LTransformAndProject takes its vertex in %esi

-	call	LTransformAndProject	// v1 | lzi1 | u1

-	flds	Lu0						// u0 | v1 | lzi1 | u1

-	fxch	%st(3)					// u1 | v1 | lzi1 | u0

-	flds	Lzi0					// lzi0 | u1 | v1 | lzi1 | u0

-	fxch	%st(3)					// lzi1 | u1 | v1 | lzi0 | u0

-	flds	Lv0						// v0 | lzi1 | u1 | v1 | lzi0 | u0

-	fxch	%st(3)					// v1 | lzi1 | u1 | v0 | lzi0 | u0

-//	r_ceilv1 = (int)(r_v1 - 2000) + 2000; // ceil(r_v1);

-	fistl	C(r_ceilv1)

-	fldcw	single_cw				// put back normal floating-point state

-	fsts	C(r_v1)

-	fxch	%st(4)					// lzi0 | lzi1 | u1 | v0 | v1 | u0

-//	if (r_lzi1 > lzi0)

-//		lzi0 = r_lzi1;

-	fcom	%st(1)

-	fnstsw	%ax

-	testb	$1,%ah					// C0 set => lzi0 < lzi1

-	jz		LP0

-	fstp	%st(0)					// lzi1 | u1 | v0 | v1 | u0

-	fld		%st(0)					// lzi1 | lzi1 | u1 | v0 | v1 | u0

-LP0:

-	fxch	%st(1)					// lzi1 | lzi0 | u1 | v0 | v1 | u0

-	fstps	C(r_lzi1)				// lzi0 | u1 | v0 | v1 | u0

-	fxch	%st(1)					// u1 | lzi0 | v0 | v1 | u0

-	fsts	C(r_u1)

-	fxch	%st(1)					// lzi0 | u1 | v0 | v1 | u0

-//	if (lzi0 > r_nearzi)	// for mipmap finding

-//		r_nearzi = lzi0;

-	fcoms	C(r_nearzi)

-	fnstsw	%ax

-	testb	$0x45,%ah				// C3|C2|C0 all clear => lzi0 > r_nearzi

-	jnz		LP1

-	fsts	C(r_nearzi)

-LP1:

-// // for right edges, all we want is the effect on 1/z

-//	if (r_nearzionly)

-//		return;

-	movl	C(r_nearzionly),%eax

-	testl	%eax,%eax

-	jz		LP2

-// Nothing is emitted: update cacheoffset, then discard the five values still
-// on the FP stack.
-LPop5AndDone:

-	movl	C(cacheoffset),%eax

-	movl	C(r_framecount),%edx

-	cmpl	$0x7FFFFFFF,%eax		// 0x7FFFFFFF is the "partially clipped" marker stored at Lp1/Lp3

-	jz		LDoPop					// partially clipped edges are not cached

-	andl	$(FRAMECOUNT_MASK),%edx

-	orl		$(FULLY_CLIPPED_CACHED),%edx

-	movl	%edx,C(cacheoffset)

-LDoPop:

-	fstp	%st(0)			// u1 | v0 | v1 | u0

-	fstp	%st(0)			// v0 | v1 | u0

-	fstp	%st(0)			// v1 | u0

-	fstp	%st(0)			// u0

-	fstp	%st(0)			// (FP stack empty)

-	jmp		Ldone

-LP2:

-// // create the edge

-//	if (ceilv0 == r_ceilv1)

-//		return;		// horizontal edge

-	movl	Lceilv0,%ebx			// %ebx = ceilv0

-	movl	C(edge_p),%edi			// %edi = edge being built

-	movl	C(r_ceilv1),%ecx		// %ecx = r_ceilv1

-	movl	%edi,%edx

-	movl	C(r_pedge),%esi			// NOTE(review): %esi is overwritten before it is read on both side paths; this load looks unused

-	addl	$(et_size),%edx			// %edx = edge_p + 1

-	cmpl	%ecx,%ebx				// ZF used just below; CF is consumed later by "jc LSide0"

-	jz		LPop5AndDone

-	movl	C(r_pedge),%eax

-	movl	%eax,et_owner(%edi)

-//	side = ceilv0 > r_ceilv1;

-//

-//	edge->nearzi = lzi0;

-	fstps	et_nearzi(%edi)		// u1 | v0 | v1 | u0

-//	if (side == 1)

-//	{

-	jc		LSide0				// CF is still from "cmpl %ecx,%ebx": set when ceilv0 < r_ceilv1 (unsigned compare)

-LSide1:

-//	// leading edge (go from p2 to p1)

-//		u_step = ((u0 - r_u1) / (v0 - r_v1));

-	fsubrp	%st(0),%st(3)		// v0 | v1 | u0-u1

-	fsub	%st(1),%st(0)		// v0-v1 | v1 | u0-u1

-	fdivrp	%st(0),%st(2)		// v1 | ustep

-//	r_emitted = 1;

-	movl	$1,C(r_emitted)

-//	edge = edge_p++;

-	movl	%edx,C(edge_p)

-// pretouch next edge

-	movl	(%edx),%eax

-//		v2 = ceilv0 - 1;

-//		v = r_ceilv1;

-	movl	%ecx,%eax

-	leal	-1(%ebx),%ecx		// %ecx = v2 = ceilv0 - 1

-	movl	%eax,%ebx			// %ebx = v = r_ceilv1

-//		edge->surfs[0] = 0;

-//		edge->surfs[1] = surface_p - surfaces;

-	movl	C(surface_p),%eax

-	movl	C(surfaces),%esi

-	subl	%edx,%edx			// %edx = 0

-	subl	%esi,%eax			// byte offset of surface_p from surfaces

-	shrl	$(SURF_T_SHIFT),%eax	// byte offset -> surface index

-	movl	%edx,et_surfs(%edi)

-	movl	%eax,et_surfs+2(%edi)	// 32-bit store at +2 also covers the first two bytes of et_nextremove, which LSetRemove rewrites

-	subl	%esi,%esi			// %esi = 0: no u_check bias for a leading edge (added at LP5)

-//		u = r_u1 + ((float)v - r_v1) * u_step;

-	movl	%ebx,Lv

-	fildl	Lv					// v | v1 | ustep

-	fsubp	%st(0),%st(1)		// v-v1 | ustep

-	fmul	%st(1),%st(0)		// (v-v1)*ustep | ustep

-	fadds	C(r_u1)				// u | ustep

-	jmp		LSideDone

-//	}

-LSide0:

-//	else

-//	{

-//	// trailing edge (go from p1 to p2)

-//		u_step = ((r_u1 - u0) / (r_v1 - v0));

-	fsub	%st(3),%st(0)		// u1-u0 | v0 | v1 | u0

-	fxch	%st(2)				// v1 | v0 | u1-u0 | u0

-	fsub	%st(1),%st(0)		// v1-v0 | v0 | u1-u0 | u0

-	fdivrp	%st(0),%st(2)		// v0 | ustep | u0

-//	r_emitted = 1;

-	movl	$1,C(r_emitted)

-//	edge = edge_p++;

-	movl	%edx,C(edge_p)

-// pretouch next edge

-	movl	(%edx),%eax

-//		v = ceilv0;

-//		v2 = r_ceilv1 - 1;

-	decl	%ecx				// %ecx = v2 = r_ceilv1 - 1; %ebx already holds v = ceilv0

-//		edge->surfs[0] = surface_p - surfaces;

-//		edge->surfs[1] = 0;

-	movl	C(surface_p),%eax

-	movl	C(surfaces),%esi

-	subl	%edx,%edx			// %edx = 0

-	subl	%esi,%eax			// byte offset of surface_p from surfaces

-	shrl	$(SURF_T_SHIFT),%eax	// byte offset -> surface index

-	movl	%edx,et_surfs+2(%edi)	// zero store at +2 comes first; the store below then sets surfs[0]

-	movl	%eax,et_surfs(%edi)

-	movl	$1,%esi				// %esi = 1: u_check bias so trailers sort after leaders (added at LP5)

-//		u = u0 + ((float)v - v0) * u_step;

-	movl	%ebx,Lv

-	fildl	Lv					// v | v0 | ustep | u0

-	fsubp	%st(0),%st(1)		// v-v0 | ustep | u0

-	fmul	%st(1),%st(0)		// (v-v0)*ustep | ustep | u0

-	faddp	%st(0),%st(2)		// ustep | u

-	fxch	%st(1)				// u | ustep

-//	}

-LSideDone:

-//	edge->u_step = u_step*0x100000;

-//	edge->u = u*0x100000 + 0xFFFFF;

-	fmuls	fp_1m				// u*0x100000 | ustep

-	fxch	%st(1)				// ustep | u*0x100000

-	fmuls	fp_1m				// ustep*0x100000 | u*0x100000

-	fxch	%st(1)				// u*0x100000 | ustep*0x100000

-	fadds	fp_1m_minus_1		// u*0x100000 + 0xFFFFF | ustep*0x100000

-	fxch	%st(1)				// ustep*0x100000 | u*0x100000 + 0xFFFFF

-	fistpl	et_u_step(%edi)		// u*0x100000 + 0xFFFFF

-	fistpl	et_u(%edi)			// (FP stack empty)

-// // we need to do this to avoid stepping off the edges if a very nearly

-// // horizontal edge is less than epsilon above a scan, and numeric error

-// // causes it to incorrectly extend to the scan, and the extension of the

-// // line goes off the edge of the screen

-// // FIXME: is this actually needed?

-//	if (edge->u < r_refdef.vrect_x_adj_shift20)

-//		edge->u = r_refdef.vrect_x_adj_shift20;

-//	if (edge->u > r_refdef.vrectright_adj_shift20)

-//		edge->u = r_refdef.vrectright_adj_shift20;

-	movl	et_u(%edi),%eax

-	movl	C(r_refdef)+rd_vrect_x_adj_shift20,%edx

-	cmpl	%edx,%eax

-	jl		LP4					// below the left limit: clamp to %edx

-	movl	C(r_refdef)+rd_vrectright_adj_shift20,%edx

-	cmpl	%edx,%eax

-	jng		LP5					// within range: keep u

-LP4:

-	movl	%edx,et_u(%edi)		// store whichever limit %edx holds

-	movl	%edx,%eax

-LP5:

-// // sort the edge in normally

-//	u_check = edge->u;

-//

-//	if (edge->surfs[0])

-//		u_check++;	// sort trailers after leaders

-	addl	%esi,%eax			// %esi is 0 (LSide1) or 1 (LSide0)

-//	if (!newedges[v] || newedges[v]->u >= u_check)

-//	{

-	movl	C(newedges)(,%ebx,4),%esi

-	testl	%esi,%esi

-	jz		LDoFirst

-	cmpl	%eax,et_u(%esi)

-	jl		LNotFirst

-LDoFirst:

-//		edge->next = newedges[v];

-//		newedges[v] = edge;

-	movl	%esi,et_next(%edi)

-	movl	%edi,C(newedges)(,%ebx,4)

-	jmp		LSetRemove

-//	}

-LNotFirst:

-//	else

-//	{

-//		pcheck = newedges[v];

-//

-//		while (pcheck->next && pcheck->next->u < u_check)

-//			pcheck = pcheck->next;

-LFindInsertLoop:

-	movl	%esi,%edx			// %edx = pcheck

-	movl	et_next(%esi),%esi	// %esi = pcheck->next

-	testl	%esi,%esi

-	jz		LInsertFound

-	cmpl	%eax,et_u(%esi)

-	jl		LFindInsertLoop

-LInsertFound:

-//		edge->next = pcheck->next;

-//		pcheck->next = edge;

-	movl	%esi,et_next(%edi)

-	movl	%edi,et_next(%edx)

-//	}

-LSetRemove:

-//	edge->nextremove = removeedges[v2];

-//	removeedges[v2] = edge;

-	movl	C(removeedges)(,%ecx,4),%eax

-	movl	%edi,C(removeedges)(,%ecx,4)

-	movl	%eax,et_nextremove(%edi)

-Ldone:

-	movl	Lstack,%esp			// clear temporary variables from stack

-	popl	%ebx				// restore register variables

-	popl	%edi

-	popl	%esi

-	ret

-// at least one point is clipped

-Lp2:

-	testl	%eax,%eax			// %eax still holds the bits of d0

-	jns		Lp1					// d0 sign bit clear: point 0 unclipped, so only point 1 is clipped

-//			else

-//			{

-//			// point 0 is clipped

-//				if (d1 < 0)

-//				{

-	movl	Ld1,%eax

-	testl	%eax,%eax

-	jns		Lp3

-//				// both points are clipped

-//				// we do cache fully clipped edges

-//					if (!r_leftclipped)

-	movl	C(r_leftclipped),%eax

-	movl	C(r_pedge),%ecx		// NOTE(review): %ecx is not read again on this path; this load looks unused

-	testl	%eax,%eax

-	jnz		Ldone

-//						cacheoffset = FULLY_CLIPPED_CACHED | (r_framecount & FRAMECOUNT_MASK);

-	movl	C(r_framecount),%eax

-	andl	$(FRAMECOUNT_MASK),%eax

-	orl		$(FULLY_CLIPPED_CACHED),%eax

-	movl	%eax,C(cacheoffset)

-//					return;

-	jmp		Ldone

-//				}

-Lp1:

-//			// point 0 is unclipped

-//				if (d1 >= 0)

-//				{

-//				// both points are unclipped

-//					continue;

-//			// only point 1 is clipped

-//				f = d0 / (d0 - d1);

-	flds	Ld0

-	flds	Ld1					// d1 | d0

-	fsubr	%st(1),%st(0)		// d0-d1 | d0

-//			// we don't cache partially clipped edges

-	movl	$0x7FFFFFFF,C(cacheoffset)

-	fdivrp	%st(0),%st(1)		// f

-	subl	$(mv_size),%esp			// allocate space for clipvert

-//				clipvert.position[0] = pv0->position[0] +

-//						f * (pv1->position[0] - pv0->position[0]);

-//				clipvert.position[1] = pv0->position[1] +

-//						f * (pv1->position[1] - pv0->position[1]);

-//				clipvert.position[2] = pv0->position[2] +

-//						f * (pv1->position[2] - pv0->position[2]);

-	flds	mv_position+8(%edx)

-	fsubs	mv_position+8(%esi)

-	flds	mv_position+4(%edx)

-	fsubs	mv_position+4(%esi)

-	flds	mv_position+0(%edx)

-	fsubs	mv_position+0(%esi)		// 0 | 1 | 2

-// replace pv1 with the clip point

-	movl	%esp,%edx

-	movl	cp_leftedge(%ebx),%eax	// %al = byte at cp_leftedge, %ah = the byte after it (tested as rightedge below)

-	testb	%al,%al					// ZF is consumed by "jz Ltestright"; the x87 ops in between leave EFLAGS alone

-	fmul	%st(3),%st(0)

-	fxch	%st(1)					// 1 | 0 | 2

-	fmul	%st(3),%st(0)

-	fxch	%st(2)					// 2 | 0 | 1

-	fmulp	%st(0),%st(3)			// 0 | 1 | 2

-	fadds	mv_position+0(%esi)

-	fxch	%st(1)					// 1 | 0 | 2

-	fadds	mv_position+4(%esi)

-	fxch	%st(2)					// 2 | 0 | 1

-	fadds	mv_position+8(%esi)

-	fxch	%st(1)					// 0 | 2 | 1

-	fstps	mv_position+0(%esp)		// 2 | 1

-	fstps	mv_position+8(%esp)		// 1

-	fstps	mv_position+4(%esp)

-//				if (clip->leftedge)

-//				{

-	jz		Ltestright

-//					r_leftclipped = true;

-//					r_leftexit = clipvert;

-	movl	$1,C(r_leftclipped)

-	movl	mv_position+0(%esp),%eax

-	movl	%eax,C(r_leftexit)+mv_position+0

-	movl	mv_position+4(%esp),%eax

-	movl	%eax,C(r_leftexit)+mv_position+4

-	movl	mv_position+8(%esp),%eax

-	movl	%eax,C(r_leftexit)+mv_position+8

-	jmp		Lcontinue

-//				}

-Ltestright:

-//				else if (clip->rightedge)

-//				{

-	testb	%ah,%ah

-	jz		Lcontinue

-//					r_rightclipped = true;

-//					r_rightexit = clipvert;

-	movl	$1,C(r_rightclipped)

-	movl	mv_position+0(%esp),%eax

-	movl	%eax,C(r_rightexit)+mv_position+0

-	movl	mv_position+4(%esp),%eax

-	movl	%eax,C(r_rightexit)+mv_position+4

-	movl	mv_position+8(%esp),%eax

-	movl	%eax,C(r_rightexit)+mv_position+8

-//				}

-//

-//				R_ClipEdge (pv0, &clipvert, clip->next);

-//				return;

-//			}

-	jmp		Lcontinue			// the C recursion is done by looping with %edx = &clipvert

-//			}

-Lp3:

-//			// only point 0 is clipped

-//				r_lastvertvalid = false;

-	movl	$0,C(r_lastvertvalid)

-//				f = d0 / (d0 - d1);

-	flds	Ld0

-	flds	Ld1					// d1 | d0

-	fsubr	%st(1),%st(0)		// d0-d1 | d0

-//			// we don't cache partially clipped edges

-	movl	$0x7FFFFFFF,C(cacheoffset)

-	fdivrp	%st(0),%st(1)		// f

-	subl	$(mv_size),%esp			// allocate space for clipvert

-//				clipvert.position[0] = pv0->position[0] +

-//						f * (pv1->position[0] - pv0->position[0]);

-//				clipvert.position[1] = pv0->position[1] +

-//						f * (pv1->position[1] - pv0->position[1]);

-//				clipvert.position[2] = pv0->position[2] +

-//						f * (pv1->position[2] - pv0->position[2]);

-	flds	mv_position+8(%edx)

-	fsubs	mv_position+8(%esi)

-	flds	mv_position+4(%edx)

-	fsubs	mv_position+4(%esi)

-	flds	mv_position+0(%edx)

-	fsubs	mv_position+0(%esi)		// 0 | 1 | 2

-	movl	cp_leftedge(%ebx),%eax	// %al = byte at cp_leftedge, %ah = the byte after it (tested as rightedge below)

-	testb	%al,%al					// ZF is consumed by "jz Ltestright2"; nothing in between changes EFLAGS

-	fmul	%st(3),%st(0)

-	fxch	%st(1)					// 1 | 0 | 2

-	fmul	%st(3),%st(0)

-	fxch	%st(2)					// 2 | 0 | 1

-	fmulp	%st(0),%st(3)			// 0 | 1 | 2

-	fadds	mv_position+0(%esi)

-	fxch	%st(1)					// 1 | 0 | 2

-	fadds	mv_position+4(%esi)

-	fxch	%st(2)					// 2 | 0 | 1

-	fadds	mv_position+8(%esi)

-	fxch	%st(1)					// 0 | 2 | 1

-	fstps	mv_position+0(%esp)		// 2 | 1

-	fstps	mv_position+8(%esp)		// 1

-	fstps	mv_position+4(%esp)

-// replace pv0 with the clip point

-	movl	%esp,%esi

-//				if (clip->leftedge)

-//				{

-	jz		Ltestright2

-//					r_leftclipped = true;

-//					r_leftenter = clipvert;

-	movl	$1,C(r_leftclipped)

-	movl	mv_position+0(%esp),%eax

-	movl	%eax,C(r_leftenter)+mv_position+0

-	movl	mv_position+4(%esp),%eax

-	movl	%eax,C(r_leftenter)+mv_position+4

-	movl	mv_position+8(%esp),%eax

-	movl	%eax,C(r_leftenter)+mv_position+8

-	jmp		Lcontinue

-//				}

-Ltestright2:

-//				else if (clip->rightedge)

-//				{

-	testb	%ah,%ah

-	jz		Lcontinue

-//					r_rightclipped = true;

-//					r_rightenter = clipvert;

-	movl	$1,C(r_rightclipped)

-	movl	mv_position+0(%esp),%eax

-	movl	%eax,C(r_rightenter)+mv_position+0

-	movl	mv_position+4(%esp),%eax

-	movl	%eax,C(r_rightenter)+mv_position+4

-	movl	mv_position+8(%esp),%eax

-	movl	%eax,C(r_rightenter)+mv_position+8

-//				}

-	jmp		Lcontinue

-// In: %esi = vertex to transform and project (read through mv_position)

-// Out: v | lzi | u on the FP stack. %edx preserved; %eax is clobbered by fnstsw

-LTransformAndProject:

-//	// transform and project

-//		VectorSubtract (world, modelorg, local);

-	flds	mv_position+0(%esi)

-	fsubs	C(modelorg)+0

-	flds	mv_position+4(%esi)

-	fsubs	C(modelorg)+4

-	flds	mv_position+8(%esi)

-	fsubs	C(modelorg)+8

-	fxch	%st(2)				// local[0] | local[1] | local[2]

-//		TransformVector (local, transformed);

-//

-//		if (transformed[2] < NEAR_CLIP)

-//			transformed[2] = NEAR_CLIP;

-//

-//		lzi0 = 1.0 / transformed[2];

-	fld		%st(0)				// local[0] | local[0] | local[1] | local[2]

-	fmuls	C(vpn)+0			// zm0 | local[0] | local[1] | local[2]

-	fld		%st(1)				// local[0] | zm0 | local[0] | local[1] |

-								//  local[2]

-	fmuls	C(vright)+0			// xm0 | zm0 | local[0] | local[1] | local[2]

-	fxch	%st(2)				// local[0] | zm0 | xm0 | local[1] | local[2]

-	fmuls	C(vup)+0			// ym0 |  zm0 | xm0 | local[1] | local[2]

-	fld		%st(3)				// local[1] | ym0 |  zm0 | xm0 | local[1] |

-								//  local[2]

-	fmuls	C(vpn)+4			// zm1 | ym0 | zm0 | xm0 | local[1] |

-								//  local[2]

-	fld		%st(4)				// local[1] | zm1 | ym0 | zm0 | xm0 |

-								//  local[1] | local[2]

-	fmuls	C(vright)+4			// xm1 | zm1 | ym0 |  zm0 | xm0 |

-								//  local[1] | local[2]

-	fxch	%st(5)				// local[1] | zm1 | ym0 | zm0 | xm0 |

-								//  xm1 | local[2]

-	fmuls	C(vup)+4			// ym1 | zm1 | ym0 | zm0 | xm0 |

-								//  xm1 | local[2]

-	fxch	%st(1)				// zm1 | ym1 | ym0 | zm0 | xm0 |

-								//  xm1 | local[2]

-	faddp	%st(0),%st(3)		// ym1 | ym0 | zm2 | xm0 | xm1 | local[2]

-	fxch	%st(3)				// xm0 | ym0 | zm2 | ym1 | xm1 | local[2]

-	faddp	%st(0),%st(4)		// ym0 | zm2 | ym1 | xm2 | local[2]

-	faddp	%st(0),%st(2)		// zm2 | ym2 | xm2 | local[2]

-	fld		%st(3)				// local[2] | zm2 | ym2 | xm2 | local[2]

-	fmuls	C(vpn)+8			// zm3 | zm2 | ym2 | xm2 | local[2]

-	fld		%st(4)				// local[2] | zm3 | zm2 | ym2 | xm2 | local[2]

-	fmuls	C(vright)+8			// xm3 | zm3 | zm2 | ym2 | xm2 | local[2]

-	fxch	%st(5)				// local[2] | zm3 | zm2 | ym2 | xm2 | xm3

-	fmuls	C(vup)+8			// ym3 | zm3 | zm2 | ym2 | xm2 | xm3

-	fxch	%st(1)				// zm3 | ym3 | zm2 | ym2 | xm2 | xm3

-	faddp	%st(0),%st(2)		// ym3 | zm4 | ym2 | xm2 | xm3

-	fxch	%st(4)				// xm3 | zm4 | ym2 | xm2 | ym3

-	faddp	%st(0),%st(3)		// zm4 | ym2 | xm4 | ym3

-	fxch	%st(1)				// ym2 | zm4 | xm4 | ym3

-	faddp	%st(0),%st(3)		// zm4 | xm4 | ym4

-	fcoms	Lfp_near_clip

-	fnstsw	%ax

-	testb	$1,%ah				// C0 set => z < NEAR_CLIP

-	jz		LNoClip

-	fstp	%st(0)				// replace z with NEAR_CLIP

-	flds	Lfp_near_clip

-LNoClip:

-	fdivrs	float_1				// lzi0 | x | y

-	fxch	%st(1)				// x | lzi0 | y

-//	// FIXME: build x/yscale into transform?

-//		scale = xscale * lzi0;

-//		u0 = (xcenter + scale*transformed[0]);

-	flds	C(xscale)			// xscale | x | lzi0 | y

-	fmul	%st(2),%st(0)		// scale | x | lzi0 | y

-	fmulp	%st(0),%st(1)		// scale*x | lzi0 | y

-	fadds	C(xcenter)			// u0 | lzi0 | y

-//		if (u0 < r_refdef.fvrectx_adj)

-//			u0 = r_refdef.fvrectx_adj;

-//		if (u0 > r_refdef.fvrectright_adj)

-//			u0 = r_refdef.fvrectright_adj;

-// FIXME: use integer compares of floats?

-	fcoms	C(r_refdef)+rd_fvrectx_adj

-	fnstsw	%ax

-	testb	$1,%ah				// C0 set => u0 < left limit

-	jz		LClampP0

-	fstp	%st(0)

-	flds	C(r_refdef)+rd_fvrectx_adj

-LClampP0:

-	fcoms	C(r_refdef)+rd_fvrectright_adj

-	fnstsw	%ax

-	testb	$0x45,%ah			// C3|C2|C0 all clear => u0 > right limit

-	jnz		LClampP1

-	fstp	%st(0)

-	flds	C(r_refdef)+rd_fvrectright_adj

-LClampP1:

-	fld		%st(1)				// lzi0 | u0 | lzi0 | y

-//		scale = yscale * lzi0;

-//		v0 = (ycenter - scale*transformed[1]);

-	fmuls	C(yscale)			// scale | u0 | lzi0 | y

-	fmulp	%st(0),%st(3)		// u0 | lzi0 | scale*y

-	fxch	%st(2)				// scale*y | lzi0 | u0

-	fsubrs	C(ycenter)			// v0 | lzi0 | u0

-//		if (v0 < r_refdef.fvrecty_adj)

-//			v0 = r_refdef.fvrecty_adj;

-//		if (v0 > r_refdef.fvrectbottom_adj)

-//			v0 = r_refdef.fvrectbottom_adj;

-// FIXME: use integer compares of floats?

-	fcoms	C(r_refdef)+rd_fvrecty_adj

-	fnstsw	%ax

-	testb	$1,%ah				// C0 set => v0 < top limit

-	jz		LClampP2

-	fstp	%st(0)

-	flds	C(r_refdef)+rd_fvrecty_adj

-LClampP2:

-	fcoms	C(r_refdef)+rd_fvrectbottom_adj

-	fnstsw	%ax

-	testb	$0x45,%ah			// C3|C2|C0 all clear => v0 > bottom limit

-	jnz		LClampP3

-	fstp	%st(0)

-	flds	C(r_refdef)+rd_fvrectbottom_adj

-LClampP3:

-	ret							// returns v0 | lzi0 | u0 on the FP stack

-#endif	// id386

--- a/r_edgea.s

+++ /dev/null

@@ -1,731 +1,0 @@

-//

-// r_edgea.s

-// x86 assembly-language edge-processing code.

-//

-#include "asm_i386.h"

-#include "quakeasm.h"

-#include "asm_draw.h"

-#ifdef	id386

-	.data

-Ltemp:					.long	0

-float_1_div_0100000h:	.long	0x35800000	// 1.0/(float)0x100000

-float_point_999:		.single	0.999

-float_1_point_001:		.single	1.001

-	.text

-//--------------------------------------------------------------------

-#define edgestoadd	4+8		// first argument, read after two pushes (odd offsets because loads are interleaved

-#define edgelist	8+12	// with pushes); second argument, read after three pushes

-.globl C(R_EdgeCodeStart)

-C(R_EdgeCodeStart):

-.globl C(R_InsertNewEdges)

-C(R_InsertNewEdges):

-	pushl	%edi

-	pushl	%esi				// preserve register variables

-	movl	edgestoadd(%esp),%edx	// %edx = next edge to insert

-	pushl	%ebx

-	movl	edgelist(%esp),%ecx		// %ecx = current search position in the list

-LDoNextEdge:

-	movl	et_u(%edx),%eax		// %eax = u of the edge being inserted

-	movl	%edx,%edi			// %edi = edge being inserted

-LContinueSearch:				// search unrolled four times; candidate alternates between %ecx and %esi

-	movl	et_u(%ecx),%ebx

-	movl	et_next(%ecx),%esi

-	cmpl	%ebx,%eax

-	jle		LAddedge			// new u <= %ecx->u: insert before %ecx

-	movl	et_u(%esi),%ebx

-	movl	et_next(%esi),%ecx

-	cmpl	%ebx,%eax

-	jle		LAddedge2			// new u <= %esi->u: insert before %esi

-	movl	et_u(%ecx),%ebx

-	movl	et_next(%ecx),%esi

-	cmpl	%ebx,%eax

-	jle		LAddedge

-	movl	et_u(%esi),%ebx

-	movl	et_next(%esi),%ecx

-	cmpl	%ebx,%eax

-	jg		LContinueSearch

-LAddedge2:						// link %edi in just before %esi

-	movl	et_next(%edx),%edx	// advance to the next edge to add before relinking

-	movl	et_prev(%esi),%ebx

-	movl	%esi,et_next(%edi)

-	movl	%ebx,et_prev(%edi)

-	movl	%edi,et_next(%ebx)

-	movl	%edi,et_prev(%esi)

-	movl	%esi,%ecx			// the next search resumes from %esi

-	cmpl	$0,%edx

-	jnz		LDoNextEdge

-	jmp		LDone

-	.align 4

-LAddedge:						// link %edi in just before %ecx

-	movl	et_next(%edx),%edx	// advance to the next edge to add before relinking

-	movl	et_prev(%ecx),%ebx

-	movl	%ecx,et_next(%edi)

-	movl	%ebx,et_prev(%edi)

-	movl	%edi,et_next(%ebx)

-	movl	%edi,et_prev(%ecx)

-	cmpl	$0,%edx

-	jnz		LDoNextEdge

-LDone:

-	popl	%ebx				// restore register variables

-	popl	%esi

-	popl	%edi

-	ret

-//--------------------------------------------------------------------

-#define predge	4+4		// the only argument, read after one push

-.globl C(R_RemoveEdges)

-C(R_RemoveEdges):

-	pushl	%ebx

-	movl	predge(%esp),%eax	// %eax = first edge to unlink

-Lre_loop:						// unrolled twice: edge in %eax, then edge in %ebx

-	movl	et_next(%eax),%ecx

-	movl	et_nextremove(%eax),%ebx

-	movl	et_prev(%eax),%edx

-	testl	%ebx,%ebx			// ZF consumed by the jz below; the movl in between leaves flags alone

-	movl	%edx,et_prev(%ecx)	// edge->next->prev = edge->prev

-	jz		Lre_done

-	movl	%ecx,et_next(%edx)	// edge->prev->next = edge->next

-	movl	et_next(%ebx),%ecx

-	movl	et_prev(%ebx),%edx

-	movl	et_nextremove(%ebx),%eax

-	movl	%edx,et_prev(%ecx)

-	testl	%eax,%eax			// ZF consumed by the jnz below

-	movl	%ecx,et_next(%edx)

-	jnz		Lre_loop

-	popl	%ebx

-	ret

-Lre_done:

-	movl	%ecx,et_next(%edx)	// finish unlinking the last edge

-	popl	%ebx

-	ret

-//--------------------------------------------------------------------

-#define pedgelist	4+4		// the only argument, read after one push (odd offset because of interleaving

-							// with pushes)

-.globl C(R_StepActiveU)

-C(R_StepActiveU):

-	pushl	%edi

-	movl	pedgelist(%esp),%edx	// %edx = first edge to step

-	pushl	%esi				// preserve register variables

-	pushl	%ebx

-	movl	et_prev(%edx),%esi

-LNewEdge:

-	movl	et_u(%esi),%edi		// %edi = u of the edge before %edx

-LNextEdge:						// unrolled four times; current edge alternates between %edx (u in %eax) and %esi (u in %edi)

-	movl	et_u(%edx),%eax

-	movl	et_u_step(%edx),%ebx

-	addl	%ebx,%eax			// u += u_step

-	movl	et_next(%edx),%esi

-	movl	%eax,et_u(%edx)

-	cmpl	%edi,%eax

-	jl		LPushBack			// new u is below the previous edge's u: re-sort

-	movl	et_u(%esi),%edi

-	movl	et_u_step(%esi),%ebx

-	addl	%ebx,%edi

-	movl	et_next(%esi),%edx

-	movl	%edi,et_u(%esi)

-	cmpl	%eax,%edi

-	jl		LPushBack2

-	movl	et_u(%edx),%eax

-	movl	et_u_step(%edx),%ebx

-	addl	%ebx,%eax

-	movl	et_next(%edx),%esi

-	movl	%eax,et_u(%edx)

-	cmpl	%edi,%eax

-	jl		LPushBack

-	movl	et_u(%esi),%edi

-	movl	et_u_step(%esi),%ebx

-	addl	%ebx,%edi

-	movl	et_next(%esi),%edx

-	movl	%edi,et_u(%esi)

-	cmpl	%eax,%edi

-	jnl		LNextEdge

-LPushBack2:						// swap roles so %edx = edge to move, %eax = its u, %esi = its next

-	movl	%edx,%ebx

-	movl	%edi,%eax

-	movl	%esi,%edx

-	movl	%ebx,%esi

-LPushBack:

-// push it back to keep it sorted

-	movl	et_prev(%edx),%ecx

-	movl	et_next(%edx),%ebx

-// done if the -1 in edge_aftertail triggered this

-	cmpl	$(C(edge_aftertail)),%edx

-	jz		LUDone

-// pull the edge out of the edge list

-	movl	et_prev(%ecx),%edi	// %edi = first candidate: the edge two positions back

-	movl	%ecx,et_prev(%esi)

-	movl	%ebx,et_next(%ecx)

-// find out where the edge goes in the edge list

-LPushBackLoop:					// walk backwards two edges per pass until one has u <= %eax

-	movl	et_prev(%edi),%ecx

-	movl	et_u(%edi),%ebx

-	cmpl	%ebx,%eax

-	jnl		LPushBackFound

-	movl	et_prev(%ecx),%edi

-	movl	et_u(%ecx),%ebx

-	cmpl	%ebx,%eax

-	jl		LPushBackLoop

-	movl	%ecx,%edi

-// put the edge back into the edge list

-LPushBackFound:					// link %edx in right after %edi

-	movl	et_next(%edi),%ebx

-	movl	%edi,et_prev(%edx)

-	movl	%ebx,et_next(%edx)

-	movl	%edx,et_next(%edi)

-	movl	%edx,et_prev(%ebx)

-	movl	%esi,%edx			// resume stepping at the edge that followed the moved one

-	movl	et_prev(%esi),%esi

-	cmpl	$(C(edge_tail)),%edx

-	jnz		LNewEdge

-LUDone:

-	popl	%ebx				// restore register variables

-	popl	%esi

-	popl	%edi

-	ret

-//--------------------------------------------------------------------

-#define surf	4		// note this is loaded before any pushes

-	.align 4

-TrailingEdge:					// in: %esi = surface, %ebx = edge, %ebp = next free espan_t; clobbers %eax, %ecx, %edx, %esi

-	movl	st_spanstate(%esi),%eax	// check for edge inversion

-	decl	%eax

-	jnz		LInverted			// spanstate - 1 != 0: store the decremented state and return

-	movl	%eax,st_spanstate(%esi)

-	movl	st_insubmodel(%esi),%ecx

-	movl	0x12345678,%edx		// surfaces[1].st_next; NOTE(review): 0x12345678 looks like a placeholder patched via LPatch0 - confirm against the patching code

-LPatch0:

-	movl	C(r_bmodelactive),%eax

-	subl	%ecx,%eax			// r_bmodelactive -= surf->insubmodel

-	cmpl	%esi,%edx			// ZF consumed by the jnz below; the movl in between leaves flags alone

-	movl	%eax,C(r_bmodelactive)

-	jnz		LNoEmit				// surface isn't on top, just remove

-// emit a span (current top going away)

-	movl	et_u(%ebx),%eax

-	shrl	$20,%eax				// iu = integral pixel u

-	movl	st_last_u(%esi),%edx

-	movl	st_next(%esi),%ecx

-	cmpl	%edx,%eax

-	jle		LNoEmit2				// iu <= surf->last_u, so nothing to emit

-	movl	%eax,st_last_u(%ecx)	// surf->next->last_u = iu;

-	subl	%edx,%eax

-	movl	%edx,espan_t_u(%ebp)		// span->u = surf->last_u;

-	movl	%eax,espan_t_count(%ebp)	// span->count = iu - span->u;

-	movl	C(current_iv),%eax

-	movl	%eax,espan_t_v(%ebp)		// span->v = current_iv;

-	movl	st_spans(%esi),%eax

-	movl	%eax,espan_t_pnext(%ebp)	// span->pnext = surf->spans;

-	movl	%ebp,st_spans(%esi)			// surf->spans = span;

-	addl	$(espan_t_size),%ebp		// advance to the next free span

-	movl	st_next(%esi),%edx		// remove the surface from the surface

-	movl	st_prev(%esi),%esi		// stack

-	movl	%edx,st_next(%esi)

-	movl	%esi,st_prev(%edx)

-	ret

-LNoEmit2:

-	movl	%eax,st_last_u(%ecx)	// surf->next->last_u = iu;

-	movl	st_next(%esi),%edx		// remove the surface from the surface

-	movl	st_prev(%esi),%esi		// stack

-	movl	%edx,st_next(%esi)

-	movl	%esi,st_prev(%edx)

-	ret

-LNoEmit:

-	movl	st_next(%esi),%edx		// remove the surface from the surface

-	movl	st_prev(%esi),%esi		// stack

-	movl	%edx,st_next(%esi)

-	movl	%esi,st_prev(%edx)

-	ret

-LInverted:

-	movl	%eax,st_spanstate(%esi)	// keep the decremented span state; nothing is emitted or unlinked

-	ret

-//--------------------------------------------------------------------

-// trailing edge only: run TrailingEdge and continue at Lgs_nextedge

-Lgs_trailing:

-	pushl	$Lgs_nextedge		// pushed as the return address, so TrailingEdge's ret lands on Lgs_nextedge

-	jmp		TrailingEdge

-.globl C(R_GenerateSpans)

-C(R_GenerateSpans):

-	pushl	%ebp				// preserve caller's stack frame

-	pushl	%edi

-	pushl	%esi				// preserve register variables

-	pushl	%ebx

-// clear active surfaces to just the background surface

-	movl	C(surfaces),%eax

-	movl	C(edge_head_u_shift20),%edx

-	addl	$(st_size),%eax

-// %ebp = span_p throughout

-	movl	C(span_p),%ebp

-	movl	$0,C(r_bmodelactive)

-	movl	%eax,st_next(%eax)

-	movl	%eax,st_prev(%eax)

-	movl	%edx,st_last_u(%eax)

-	movl	C(edge_head)+et_next,%ebx		// edge=edge_head.next

-// generate spans

-	cmpl	$(C(edge_tail)),%ebx		// done if empty list

-	jz		Lgs_lastspan

-Lgs_edgeloop:

-	movl	et_surfs(%ebx),%edi

-	movl	C(surfaces),%eax

-	movl	%edi,%esi

-	andl	$0xFFFF0000,%edi

-	andl	$0xFFFF,%esi

-	jz		Lgs_leading		// not a trailing edge

-// it has a left surface, so a surface is going away for this span

-	shll	$(SURF_T_SHIFT),%esi

-	addl	%eax,%esi

-	testl	%edi,%edi

-	jz		Lgs_trailing

-// both leading and trailing

-	call	TrailingEdge

-	movl	C(surfaces),%eax

-// ---------------------------------------------------------------

-// handle a leading edge

-// ---------------------------------------------------------------

-Lgs_leading:

-	shrl	$16-SURF_T_SHIFT,%edi

-	movl	C(surfaces),%eax

-	addl	%eax,%edi

-	movl	0x12345678,%esi		// surf2 = surfaces[1].next;

-LPatch2:

-	movl	st_spanstate(%edi),%edx

-	movl	st_insubmodel(%edi),%eax

-	testl	%eax,%eax

-	jnz		Lbmodel_leading

-// handle a leading non-bmodel edge

-// don't start a span if this is an inverted span, with the end edge preceding

-// the start edge (that is, we've already seen the end edge)

-	testl	%edx,%edx

-	jnz		Lxl_done

-// if (surf->key < surf2->key)

-//		goto newtop;

-	incl	%edx

-	movl	st_key(%edi),%eax

-	movl	%edx,st_spanstate(%edi)

-	movl	st_key(%esi),%ecx

-	cmpl	%ecx,%eax

-	jl		Lnewtop

-// main sorting loop to search through surface stack until insertion point

-// found. Always terminates because background surface is sentinel

-// do

-// {

-// 		surf2 = surf2->next;

-// } while (surf->key >= surf2->key);

-Lsortloopnb:

-	movl	st_next(%esi),%esi

-	movl	st_key(%esi),%ecx

-	cmpl	%ecx,%eax

-	jge		Lsortloopnb

-	jmp		LInsertAndExit

-// handle a leading bmodel edge

-	.align	4

-Lbmodel_leading:

-// don't start a span if this is an inverted span, with the end edge preceding

-// the start edge (that is, we've already seen the end edge)

-	testl	%edx,%edx

-	jnz		Lxl_done

-	movl	C(r_bmodelactive),%ecx

-	incl	%edx

-	incl	%ecx

-	movl	%edx,st_spanstate(%edi)

-	movl	%ecx,C(r_bmodelactive)

-// if (surf->key < surf2->key)

-//		goto newtop;

-	movl	st_key(%edi),%eax

-	movl	st_key(%esi),%ecx

-	cmpl	%ecx,%eax

-	jl		Lnewtop

-// if ((surf->key == surf2->key) && surf->insubmodel)

-// {

-	jz		Lzcheck_for_newtop

-// main sorting loop to search through surface stack until insertion point

-// found. Always terminates because background surface is sentinel

-// do

-// {

-// 		surf2 = surf2->next;

-// } while (surf->key > surf2->key);

-Lsortloop:

-	movl	st_next(%esi),%esi

-	movl	st_key(%esi),%ecx

-	cmpl	%ecx,%eax

-	jg		Lsortloop

-	jne		LInsertAndExit

-// Do 1/z sorting to see if we've arrived in the right position

-	movl	et_u(%ebx),%eax

-	subl	$0xFFFFF,%eax

-	movl	%eax,Ltemp

-	fildl	Ltemp

-	fmuls	float_1_div_0100000h // fu = (float)(edge->u - 0xFFFFF) *

-								//      (1.0 / 0x100000);

-	fld		%st(0)				// fu | fu

-	fmuls	st_d_zistepu(%edi)	// fu*surf->d_zistepu | fu

-	flds	C(fv)					// fv | fu*surf->d_zistepu | fu

-	fmuls	st_d_zistepv(%edi)	// fv*surf->d_zistepv | fu*surf->d_zistepu | fu

-	fxch	%st(1)				// fu*surf->d_zistepu | fv*surf->d_zistepv | fu

-	fadds	st_d_ziorigin(%edi)	// fu*surf->d_zistepu + surf->d_ziorigin |

-								//  fv*surf->d_zistepv | fu

-	flds	st_d_zistepu(%esi)	// surf2->d_zistepu |

-								//  fu*surf->d_zistepu + surf->d_ziorigin |

-								//  fv*surf->d_zistepv | fu

-	fmul	%st(3),%st(0)		// fu*surf2->d_zistepu |

-								//  fu*surf->d_zistepu + surf->d_ziorigin |

-								//  fv*surf->d_zistepv | fu

-	fxch	%st(1)				// fu*surf->d_zistepu + surf->d_ziorigin |

-								//  fu*surf2->d_zistepu |

-								//  fv*surf->d_zistepv | fu

-	faddp	%st(0),%st(2)		// fu*surf2->d_zistepu | newzi | fu

-	flds	C(fv)					// fv | fu*surf2->d_zistepu | newzi | fu

-	fmuls	st_d_zistepv(%esi)	// fv*surf2->d_zistepv |

-								//  fu*surf2->d_zistepu | newzi | fu

-	fld		%st(2)				// newzi | fv*surf2->d_zistepv |

-								//  fu*surf2->d_zistepu | newzi | fu

-	fmuls	float_point_999		// newzibottom | fv*surf2->d_zistepv |

-								//  fu*surf2->d_zistepu | newzi | fu

-	fxch	%st(2)				// fu*surf2->d_zistepu | fv*surf2->d_zistepv |

-								//  newzibottom | newzi | fu

-	fadds	st_d_ziorigin(%esi)	// fu*surf2->d_zistepu + surf2->d_ziorigin |

-								//  fv*surf2->d_zistepv | newzibottom | newzi |

-								//  fu

-	faddp	%st(0),%st(1)		// testzi | newzibottom | newzi | fu

-	fxch	%st(1)				// newzibottom | testzi | newzi | fu

-// if (newzibottom >= testzi)

-//     goto Lgotposition;

-	fcomp	%st(1)				// testzi | newzi | fu

-	fxch	%st(1)				// newzi | testzi | fu

-	fmuls	float_1_point_001	// newzitop | testzi | fu

-	fxch	%st(1)				// testzi | newzitop | fu

-	fnstsw	%ax

-	testb	$0x01,%ah

-	jz		Lgotposition_fpop3

-// if (newzitop >= testzi)

-// {

-	fcomp	%st(1)				// newzitop | fu

-	fnstsw	%ax

-	testb	$0x45,%ah

-	jz		Lsortloop_fpop2

-// if (surf->d_zistepu >= surf2->d_zistepu)

-//     goto newtop;

-	flds	st_d_zistepu(%edi)	// surf->d_zistepu | newzitop| fu

-	fcomps	st_d_zistepu(%esi)	// newzitop | fu

-	fnstsw	%ax

-	testb	$0x01,%ah

-	jz		Lgotposition_fpop2

-	fstp	%st(0)				// clear the FPstack

-	fstp	%st(0)

-	movl	st_key(%edi),%eax

-	jmp		Lsortloop

-Lgotposition_fpop3:

-	fstp	%st(0)

-Lgotposition_fpop2:

-	fstp	%st(0)

-	fstp	%st(0)

-	jmp		LInsertAndExit

-// emit a span (obscures current top)

-Lnewtop_fpop3:

-	fstp	%st(0)

-Lnewtop_fpop2:

-	fstp	%st(0)

-	fstp	%st(0)

-	movl	st_key(%edi),%eax		// reload the sorting key

-Lnewtop:

-	movl	et_u(%ebx),%eax

-	movl	st_last_u(%esi),%edx

-	shrl	$20,%eax				// iu = integral pixel u

-	movl	%eax,st_last_u(%edi)	// surf->last_u = iu;

-	cmpl	%edx,%eax

-	jle		LInsertAndExit			// iu <= surf->last_u, so nothing to emit

-	subl	%edx,%eax

-	movl	%edx,espan_t_u(%ebp)		// span->u = surf->last_u;

-	movl	%eax,espan_t_count(%ebp)	// span->count = iu - span->u;

-	movl	C(current_iv),%eax

-	movl	%eax,espan_t_v(%ebp)		// span->v = current_iv;

-	movl	st_spans(%esi),%eax

-	movl	%eax,espan_t_pnext(%ebp)	// span->pnext = surf->spans;

-	movl	%ebp,st_spans(%esi)			// surf->spans = span;

-	addl	$(espan_t_size),%ebp

-LInsertAndExit:

-// insert before surf2

-	movl	%esi,st_next(%edi)		// surf->next = surf2;

-	movl	st_prev(%esi),%eax

-	movl	%eax,st_prev(%edi)		// surf->prev = surf2->prev;

-	movl	%edi,st_prev(%esi)		// surf2->prev = surf;

-	movl	%edi,st_next(%eax)		// surf2->prev->next = surf;

-// ---------------------------------------------------------------

-// leading edge done

-// ---------------------------------------------------------------

-// ---------------------------------------------------------------

-// see if there are any more edges

-// ---------------------------------------------------------------

-Lgs_nextedge:

-	movl	et_next(%ebx),%ebx

-	cmpl	$(C(edge_tail)),%ebx

-	jnz		Lgs_edgeloop

-// clean up at the right edge

-Lgs_lastspan:

-// now that we've reached the right edge of the screen, we're done with any

-// unfinished surfaces, so emit a span for whatever's on top

-	movl	0x12345678,%esi		// surfaces[1].st_next

-LPatch3:

-	movl	C(edge_tail_u_shift20),%eax

-	xorl	%ecx,%ecx

-	movl	st_last_u(%esi),%edx

-	subl	%edx,%eax

-	jle		Lgs_resetspanstate

-	movl	%edx,espan_t_u(%ebp)

-	movl	%eax,espan_t_count(%ebp)

-	movl	C(current_iv),%eax

-	movl	%eax,espan_t_v(%ebp)

-	movl	st_spans(%esi),%eax

-	movl	%eax,espan_t_pnext(%ebp)

-	movl	%ebp,st_spans(%esi)

-	addl	$(espan_t_size),%ebp

-// reset spanstate for all surfaces in the surface stack

-Lgs_resetspanstate:

-	movl	%ecx,st_spanstate(%esi)

-	movl	st_next(%esi),%esi

-	cmpl	$0x12345678,%esi		// &surfaces[1]

-LPatch4:

-	jnz		Lgs_resetspanstate

-// store the final span_p

-	movl	%ebp,C(span_p)

-	popl	%ebx				// restore register variables

-	popl	%esi

-	popl	%edi

-	popl	%ebp				// restore the caller's stack frame

-	ret

-// ---------------------------------------------------------------

-// 1/z sorting for bmodels in the same leaf

-// ---------------------------------------------------------------

-	.align	4

-Lxl_done:

-	incl	%edx

-	movl	%edx,st_spanstate(%edi)

-	jmp		Lgs_nextedge

-	.align	4

-Lzcheck_for_newtop:

-	movl	et_u(%ebx),%eax

-	subl	$0xFFFFF,%eax

-	movl	%eax,Ltemp

-	fildl	Ltemp

-	fmuls	float_1_div_0100000h // fu = (float)(edge->u - 0xFFFFF) *

-								//      (1.0 / 0x100000);

-	fld		%st(0)				// fu | fu

-	fmuls	st_d_zistepu(%edi)	// fu*surf->d_zistepu | fu

-	flds	C(fv)				// fv | fu*surf->d_zistepu | fu

-	fmuls	st_d_zistepv(%edi)	// fv*surf->d_zistepv | fu*surf->d_zistepu | fu

-	fxch	%st(1)				// fu*surf->d_zistepu | fv*surf->d_zistepv | fu

-	fadds	st_d_ziorigin(%edi)	// fu*surf->d_zistepu + surf->d_ziorigin |

-								//  fv*surf->d_zistepv | fu

-	flds	st_d_zistepu(%esi)	// surf2->d_zistepu |

-								//  fu*surf->d_zistepu + surf->d_ziorigin |

-								//  fv*surf->d_zistepv | fu

-	fmul	%st(3),%st(0)		// fu*surf2->d_zistepu |

-								//  fu*surf->d_zistepu + surf->d_ziorigin |

-								//  fv*surf->d_zistepv | fu

-	fxch	%st(1)				// fu*surf->d_zistepu + surf->d_ziorigin |

-								//  fu*surf2->d_zistepu |

-								//  fv*surf->d_zistepv | fu

-	faddp	%st(0),%st(2)		// fu*surf2->d_zistepu | newzi | fu

-	flds	C(fv)				// fv | fu*surf2->d_zistepu | newzi | fu

-	fmuls	st_d_zistepv(%esi)	// fv*surf2->d_zistepv |

-								//  fu*surf2->d_zistepu | newzi | fu

-	fld		%st(2)				// newzi | fv*surf2->d_zistepv |

-								//  fu*surf2->d_zistepu | newzi | fu

-	fmuls	float_point_999		// newzibottom | fv*surf2->d_zistepv |

-								//  fu*surf2->d_zistepu | newzi | fu

-	fxch	%st(2)				// fu*surf2->d_zistepu | fv*surf2->d_zistepv |

-								//  newzibottom | newzi | fu

-	fadds	st_d_ziorigin(%esi)	// fu*surf2->d_zistepu + surf2->d_ziorigin |

-								//  fv*surf2->d_zistepv | newzibottom | newzi |

-								//  fu

-	faddp	%st(0),%st(1)		// testzi | newzibottom | newzi | fu

-	fxch	%st(1)				// newzibottom | testzi | newzi | fu

-// if (newzibottom >= testzi)

-//     goto newtop;

-	fcomp	%st(1)				// testzi | newzi | fu

-	fxch	%st(1)				// newzi | testzi | fu

-	fmuls	float_1_point_001	// newzitop | testzi | fu

-	fxch	%st(1)				// testzi | newzitop | fu

-	fnstsw	%ax

-	testb	$0x01,%ah

-	jz		Lnewtop_fpop3

-// if (newzitop >= testzi)

-// {

-	fcomp	%st(1)				// newzitop | fu

-	fnstsw	%ax

-	testb	$0x45,%ah

-	jz		Lsortloop_fpop2

-// if (surf->d_zistepu >= surf2->d_zistepu)

-//     goto newtop;

-	flds	st_d_zistepu(%edi)	// surf->d_zistepu | newzitop | fu

-	fcomps	st_d_zistepu(%esi)	// newzitop | fu

-	fnstsw	%ax

-	testb	$0x01,%ah

-	jz		Lnewtop_fpop2

-Lsortloop_fpop2:

-	fstp	%st(0)				// clear the FP stack

-	fstp	%st(0)

-	movl	st_key(%edi),%eax

-	jmp		Lsortloop

-.globl C(R_EdgeCodeEnd)

-C(R_EdgeCodeEnd):

-//----------------------------------------------------------------------

-// Surface array address code patching routine

-//----------------------------------------------------------------------

-	.align 4

-.globl C(R_SurfacePatch)

-C(R_SurfacePatch):

-	movl	C(surfaces),%eax

-	addl	$(st_size),%eax

-	movl	%eax,LPatch4-4

-	addl	$(st_next),%eax

-	movl	%eax,LPatch0-4

-	movl	%eax,LPatch2-4

-	movl	%eax,LPatch3-4

-	ret

-#endif	// id386

--- a/r_varsa.s

+++ /dev/null

@@ -1,45 +1,0 @@

-//

-// r_varsa.s

-//

-#include "asm_i386.h"

-#include "quakeasm.h"

-#include "asm_draw.h"

-#include "d_ifacea.h"

-#ifdef id386

-	.data

-//-------------------------------------------------------

-// ASM-only variables

-//-------------------------------------------------------

-.globl	float_1, float_particle_z_clip, float_point5

-.globl	float_minus_1, float_0

-float_0:		.single	0.0

-float_1:		.single	1.0

-float_minus_1:	.single	-1.0

-float_particle_z_clip:	.single	PARTICLE_Z_CLIP

-float_point5:	.single	0.5

-.globl	fp_16, fp_64k, fp_1m, fp_64kx64k

-.globl	fp_1m_minus_1

-.globl	fp_8

-fp_1m:			.single	1048576.0

-fp_1m_minus_1:	.single	1048575.0

-fp_64k:			.single	65536.0

-fp_8:			.single	8.0

-fp_16:			.single	16.0

-fp_64kx64k:		.long	0x4f000000	// (float)0x8000*0x10000

-.globl	FloatZero, Float2ToThe31nd, FloatMinus2ToThe31nd

-FloatZero:				.long	0

-Float2ToThe31nd:		.long	0x4f000000

-FloatMinus2ToThe31nd:	.long	0xcf000000

-.globl	C(r_bmodelactive)

-C(r_bmodelactive):	.long	0

-#endif	// id386

--- a/snd_mixa.s

+++ /dev/null

@@ -1,199 +1,0 @@

-//

-// snd_mixa.s

-// x86 assembly-language sound code

-//

-#include "asm_i386.h"

-#include "quakeasm.h"

-#ifdef	id386

-	.text

-//----------------------------------------------------------------------

-// 8-bit sound-mixing code

-//----------------------------------------------------------------------

-#define ch		4+16

-#define sc		8+16

-#define count	12+16

-.globl C(SND_PaintChannelFrom8)

-C(SND_PaintChannelFrom8):

-	pushl	%esi				// preserve register variables

-	pushl	%edi

-	pushl	%ebx

-	pushl	%ebp

-//	int 	data;

-//	short	*lscale, *rscale;

-//	unsigned char *sfx;

-//	int		i;

-	movl	ch(%esp),%ebx

-	movl	sc(%esp),%esi

-//	if (ch->leftvol > 255)

-//		ch->leftvol = 255;

-//	if (ch->rightvol > 255)

-//		ch->rightvol = 255;

-	movl	ch_leftvol(%ebx),%eax

-	movl	ch_rightvol(%ebx),%edx

-	cmpl	$255,%eax

-	jna		LLeftSet

-	movl	$255,%eax

-LLeftSet:

-	cmpl	$255,%edx

-	jna		LRightSet

-	movl	$255,%edx

-LRightSet:

-//	lscale = snd_scaletable[ch->leftvol >> 3];

-//	rscale = snd_scaletable[ch->rightvol >> 3];

-//	sfx = (signed char *)sc->data + ch->pos;

-//	ch->pos += count;

-	andl	$0xF8,%eax

-	addl	$(sfxc_data),%esi

-	andl	$0xF8,%edx

-	movl	ch_pos(%ebx),%edi

-	movl	count(%esp),%ecx

-	addl	%edi,%esi

-	shll	$7,%eax

-	addl	%ecx,%edi

-	shll	$7,%edx

-	movl	%edi,ch_pos(%ebx)

-	addl	$(C(snd_scaletable)),%eax

-	addl	$(C(snd_scaletable)),%edx

-	subl	%ebx,%ebx

-	movb	-1(%esi,%ecx,1),%bl

-	testl	$1,%ecx

-	jz		LMix8Loop

-	movl	(%eax,%ebx,4),%edi

-	movl	(%edx,%ebx,4),%ebp

-	addl	C(paintbuffer)+psp_left-psp_size(,%ecx,psp_size),%edi

-	addl	C(paintbuffer)+psp_right-psp_size(,%ecx,psp_size),%ebp

-	movl	%edi,C(paintbuffer)+psp_left-psp_size(,%ecx,psp_size)

-	movl	%ebp,C(paintbuffer)+psp_right-psp_size(,%ecx,psp_size)

-	movb	-2(%esi,%ecx,1),%bl

-	decl	%ecx

-	jz		LDone

-//	for (i=0 ; i<count ; i++)

-//	{

-LMix8Loop:

-//		data = sfx[i];

-//		paintbuffer[i].left += lscale[data];

-//		paintbuffer[i].right += rscale[data];

-	movl	(%eax,%ebx,4),%edi

-	movl	(%edx,%ebx,4),%ebp

-	addl	C(paintbuffer)+psp_left-psp_size(,%ecx,psp_size),%edi

-	addl	C(paintbuffer)+psp_right-psp_size(,%ecx,psp_size),%ebp

-	movb	-2(%esi,%ecx,1),%bl

-	movl	%edi,C(paintbuffer)+psp_left-psp_size(,%ecx,psp_size)

-	movl	%ebp,C(paintbuffer)+psp_right-psp_size(,%ecx,psp_size)

-	movl	(%eax,%ebx,4),%edi

-	movl	(%edx,%ebx,4),%ebp

-	movb	-3(%esi,%ecx,1),%bl

-	addl	C(paintbuffer)+psp_left-psp_size*2(,%ecx,psp_size),%edi

-	addl	C(paintbuffer)+psp_right-psp_size*2(,%ecx,psp_size),%ebp

-	movl	%edi,C(paintbuffer)+psp_left-psp_size*2(,%ecx,psp_size)

-	movl	%ebp,C(paintbuffer)+psp_right-psp_size*2(,%ecx,psp_size)

-//	}

-	subl	$2,%ecx

-	jnz		LMix8Loop

-LDone:

-	popl	%ebp

-	popl	%ebx

-	popl	%edi

-	popl	%esi

-	ret

-//----------------------------------------------------------------------

-// Transfer of stereo buffer to 16-bit DMA buffer code

-//----------------------------------------------------------------------

-.globl C(Snd_WriteLinearBlastStereo16)

-C(Snd_WriteLinearBlastStereo16):

-	pushl	%esi				// preserve register variables

-	pushl	%edi

-	pushl	%ebx

-//	int		i;

-//	int		val;

-	movl	C(snd_linear_count),%ecx

-	movl	C(snd_p),%ebx

-	movl	C(snd_vol),%esi

-	movl	C(snd_out),%edi

-//	for (i=0 ; i<snd_linear_count ; i+=2)

-//	{

-LWLBLoopTop:

-//		val = (snd_p[i]*snd_vol)>>8;

-//		if (val > 0x7fff)

-//			snd_out[i] = 0x7fff;

-//		else if (val < (short)0x8000)

-//			snd_out[i] = (short)0x8000;

-//		else

-//			snd_out[i] = val;

-	movl	-8(%ebx,%ecx,4),%eax

-	imull	%esi,%eax

-	sarl	$8,%eax

-	cmpl	$0x7FFF,%eax

-	jg		LClampHigh

-	cmpl	$0xFFFF8000,%eax

-	jnl		LClampDone

-	movl	$0xFFFF8000,%eax

-	jmp		LClampDone

-LClampHigh:

-	movl	$0x7FFF,%eax

-LClampDone:

-//		val = (snd_p[i+1]*snd_vol)>>8;

-//		if (val > 0x7fff)

-//			snd_out[i+1] = 0x7fff;

-//		else if (val < (short)0x8000)

-//			snd_out[i+1] = (short)0x8000;

-//		else

-//			snd_out[i+1] = val;

-	movl	-4(%ebx,%ecx,4),%edx

-	imull	%esi,%edx

-	sarl	$8,%edx

-	cmpl	$0x7FFF,%edx

-	jg		LClampHigh2

-	cmpl	$0xFFFF8000,%edx

-	jnl		LClampDone2

-	movl	$0xFFFF8000,%edx

-	jmp		LClampDone2

-LClampHigh2:

-	movl	$0x7FFF,%edx

-LClampDone2:

-	shll	$16,%edx

-	andl	$0xFFFF,%eax

-	orl		%eax,%edx

-	movl	%edx,-4(%edi,%ecx,2)

-//	}

-	subl	$2,%ecx

-	jnz		LWLBLoopTop

-//	snd_p += snd_linear_count;

-	popl	%ebx

-	popl	%edi

-	popl	%esi

-	ret

-#endif	// id386

--- a/surf16.s

+++ /dev/null

@@ -1,153 +1,0 @@

-//

-// surf16.s

-// x86 assembly-language 16 bpp surface block drawing code.

-//

-#include "asm_i386.h"

-#include "quakeasm.h"

-#include "asm_draw.h"

-#ifdef id386

-//----------------------------------------------------------------------

-// Surface block drawer

-//----------------------------------------------------------------------

-	.data

-k:			.long	0

-loopentry:	.long	0

-	.align	4

-blockjumptable16:

-	.long	LEnter2_16

-	.long	LEnter4_16

-	.long	0, LEnter8_16

-	.long	0, 0, 0, LEnter16_16

-	.text

-	.align 4

-.globl C(R_Surf16Start)

-C(R_Surf16Start):

-	.align 4

-.globl C(R_DrawSurfaceBlock16)

-C(R_DrawSurfaceBlock16):

-	pushl	%ebp				// preserve caller's stack frame

-	pushl	%edi

-	pushl	%esi				// preserve register variables

-	pushl	%ebx

-	movl	C(blocksize),%eax

-	movl	C(prowdestbase),%edi

-	movl	C(pbasesource),%esi

-	movl	C(sourcesstep),%ebx

-	movl	blockjumptable16-4(,%eax,2),%ecx

-	movl	%eax,k

-	movl	%ecx,loopentry

-	movl	C(lightleft),%edx

-	movl	C(lightright),%ebp

-Lblockloop16:

-	subl	%edx,%ebp

-	movb	C(blockdivshift),%cl

-	sarl	%cl,%ebp

-	jns		Lp1_16

-	testl	C(blockdivmask),%ebp

-	jz		Lp1_16

-	incl	%ebp

-Lp1_16:

-	subl	%eax,%eax

-	subl	%ecx,%ecx	// high words must be 0 in loop for addressing

-	jmp		*loopentry

-	.align	4

-#include "block16.h"

-	movl	C(pbasesource),%esi

-	movl	C(lightleft),%edx

-	movl	C(lightright),%ebp

-	movl	C(sourcetstep),%eax

-	movl	C(lightrightstep),%ecx

-	movl	C(prowdestbase),%edi

-	addl	%eax,%esi

-	addl	%ecx,%ebp

-	movl	C(lightleftstep),%eax

-	movl	C(surfrowbytes),%ecx

-	addl	%eax,%edx

-	addl	%ecx,%edi

-	movl	%esi,C(pbasesource)

-	movl	%ebp,C(lightright)

-	movl	k,%eax

-	movl	%edx,C(lightleft)

-	decl	%eax

-	movl	%edi,C(prowdestbase)

-	movl	%eax,k

-	jnz		Lblockloop16

-	popl	%ebx				// restore register variables

-	popl	%esi

-	popl	%edi

-	popl	%ebp				// restore the caller's stack frame

-	ret

-.globl C(R_Surf16End)

-C(R_Surf16End):

-//----------------------------------------------------------------------

-// Code patching routines

-//----------------------------------------------------------------------

-	.data

-	.align 4

-LPatchTable16:

-	.long	LBPatch0-4

-	.long	LBPatch1-4

-	.long	LBPatch2-4

-	.long	LBPatch3-4

-	.long	LBPatch4-4

-	.long	LBPatch5-4

-	.long	LBPatch6-4

-	.long	LBPatch7-4

-	.long	LBPatch8-4

-	.long	LBPatch9-4

-	.long	LBPatch10-4

-	.long	LBPatch11-4

-	.long	LBPatch12-4

-	.long	LBPatch13-4

-	.long	LBPatch14-4

-	.long	LBPatch15-4

-	.text

-	.align 4

-.globl C(R_Surf16Patch)

-C(R_Surf16Patch):

-	pushl	%ebx

-	movl	C(colormap),%eax

-	movl	$LPatchTable16,%ebx

-	movl	$16,%ecx

-LPatchLoop16:

-	movl	(%ebx),%edx

-	addl	$4,%ebx

-	movl	%eax,(%edx)

-	decl	%ecx

-	jnz		LPatchLoop16

-	popl	%ebx

-	ret

-#endif	// id386

--- a/surf8.s

+++ /dev/null

@@ -1,764 +1,0 @@

-//

-// surf8.s

-// x86 assembly-language 8 bpp surface block drawing code.

-//

-#include "asm_i386.h"

-#include "quakeasm.h"

-#include "asm_draw.h"

-#ifdef	id386

-	.data

-sb_v:		.long	0

-	.text

-	.align 4

-.globl C(R_Surf8Start)

-C(R_Surf8Start):

-//----------------------------------------------------------------------

-// Surface block drawer for mip level 0

-//----------------------------------------------------------------------

-	.align 4

-.globl C(R_DrawSurfaceBlock8_mip0)

-C(R_DrawSurfaceBlock8_mip0):

-	pushl	%ebp				// preserve caller's stack frame

-	pushl	%edi

-	pushl	%esi				// preserve register variables

-	pushl	%ebx

-//		for (v=0 ; v<numvblocks ; v++)

-//		{

-	movl	C(r_lightptr),%ebx

-	movl	C(r_numvblocks),%eax

-	movl	%eax,sb_v

-	movl	C(prowdestbase),%edi

-	movl	C(pbasesource),%esi

-Lv_loop_mip0:

-//			lightleft = lightptr[0];

-//			lightright = lightptr[1];

-//			lightdelta = (lightleft - lightright) & 0xFFFFF;

-	movl	(%ebx),%eax			// lightleft

-	movl	4(%ebx),%edx		// lightright

-	movl	%eax,%ebp

-	movl	C(r_lightwidth),%ecx

-	movl	%edx,C(lightright)

-	subl	%edx,%ebp

-	andl	$0xFFFFF,%ebp

-	leal	(%ebx,%ecx,4),%ebx

-//			lightptr += lightwidth;

-	movl	%ebx,C(r_lightptr)

-//			lightleftstep = (lightptr[0] - lightleft) >> blockdivshift;

-//			lightrightstep = (lightptr[1] - lightright) >> blockdivshift;

-//			lightdeltastep = ((lightleftstep - lightrightstep) & 0xFFFFF) |

-//					0xF0000000;

-	movl	4(%ebx),%ecx	// lightptr[1]

-	movl	(%ebx),%ebx		// lightptr[0]

-	subl	%eax,%ebx

-	subl	%edx,%ecx

-	sarl	$4,%ecx

-	orl		$0xF0000000,%ebp

-	sarl	$4,%ebx

-	movl	%ecx,C(lightrightstep)

-	subl	%ecx,%ebx

-	andl	$0xFFFFF,%ebx

-	orl		$0xF0000000,%ebx

-	subl	%ecx,%ecx	// high word must be 0 in loop for addressing

-	movl	%ebx,C(lightdeltastep)

-	subl	%ebx,%ebx	// high word must be 0 in loop for addressing

-Lblockloop8_mip0:

-	movl	%ebp,C(lightdelta)

-	movb	14(%esi),%cl

-	sarl	$4,%ebp

-	movb	%dh,%bh

-	movb	15(%esi),%bl

-	addl	%ebp,%edx

-	movb	%dh,%ch

-	addl	%ebp,%edx

-	movb	0x12345678(%ebx),%ah

-LBPatch0:

-	movb	13(%esi),%bl

-	movb	0x12345678(%ecx),%al

-LBPatch1:

-	movb	12(%esi),%cl

-	movb	%dh,%bh

-	addl	%ebp,%edx

-	rorl	$16,%eax

-	movb	%dh,%ch

-	addl	%ebp,%edx

-	movb	0x12345678(%ebx),%ah

-LBPatch2:

-	movb	11(%esi),%bl

-	movb	0x12345678(%ecx),%al

-LBPatch3:

-	movb	10(%esi),%cl

-	movl	%eax,12(%edi)

-	movb	%dh,%bh

-	addl	%ebp,%edx

-	movb	%dh,%ch

-	addl	%ebp,%edx

-	movb	0x12345678(%ebx),%ah

-LBPatch4:

-	movb	9(%esi),%bl

-	movb	0x12345678(%ecx),%al

-LBPatch5:

-	movb	8(%esi),%cl

-	movb	%dh,%bh

-	addl	%ebp,%edx

-	rorl	$16,%eax

-	movb	%dh,%ch

-	addl	%ebp,%edx

-	movb	0x12345678(%ebx),%ah

-LBPatch6:

-	movb	7(%esi),%bl

-	movb	0x12345678(%ecx),%al

-LBPatch7:

-	movb	6(%esi),%cl

-	movl	%eax,8(%edi)

-	movb	%dh,%bh

-	addl	%ebp,%edx

-	movb	%dh,%ch

-	addl	%ebp,%edx

-	movb	0x12345678(%ebx),%ah

-LBPatch8:

-	movb	5(%esi),%bl

-	movb	0x12345678(%ecx),%al

-LBPatch9:

-	movb	4(%esi),%cl

-	movb	%dh,%bh

-	addl	%ebp,%edx

-	rorl	$16,%eax

-	movb	%dh,%ch

-	addl	%ebp,%edx

-	movb	0x12345678(%ebx),%ah

-LBPatch10:

-	movb	3(%esi),%bl

-	movb	0x12345678(%ecx),%al

-LBPatch11:

-	movb	2(%esi),%cl

-	movl	%eax,4(%edi)

-	movb	%dh,%bh

-	addl	%ebp,%edx

-	movb	%dh,%ch

-	addl	%ebp,%edx

-	movb	0x12345678(%ebx),%ah

-LBPatch12:

-	movb	1(%esi),%bl

-	movb	0x12345678(%ecx),%al

-LBPatch13:

-	movb	(%esi),%cl

-	movb	%dh,%bh

-	addl	%ebp,%edx

-	rorl	$16,%eax

-	movb	%dh,%ch

-	movb	0x12345678(%ebx),%ah

-LBPatch14:

-	movl	C(lightright),%edx

-	movb	0x12345678(%ecx),%al

-LBPatch15:

-	movl	C(lightdelta),%ebp

-	movl	%eax,(%edi)

-	addl	C(sourcetstep),%esi

-	addl	C(surfrowbytes),%edi

-	addl	C(lightrightstep),%edx

-	addl	C(lightdeltastep),%ebp

-	movl	%edx,C(lightright)

-	jc		Lblockloop8_mip0

-//			if (pbasesource >= r_sourcemax)

-//				pbasesource -= stepback;

-	cmpl	C(r_sourcemax),%esi

-	jb		LSkip_mip0

-	subl	C(r_stepback),%esi

-LSkip_mip0:

-	movl	C(r_lightptr),%ebx

-	decl	sb_v

-	jnz		Lv_loop_mip0

-	popl	%ebx				// restore register variables

-	popl	%esi

-	popl	%edi

-	popl	%ebp				// restore the caller's stack frame

-	ret

-//----------------------------------------------------------------------

-// Surface block drawer for mip level 1

-//----------------------------------------------------------------------

-	.align 4

-.globl C(R_DrawSurfaceBlock8_mip1)

-C(R_DrawSurfaceBlock8_mip1):

-	pushl	%ebp				// preserve caller's stack frame

-	pushl	%edi

-	pushl	%esi				// preserve register variables

-	pushl	%ebx

-//		for (v=0 ; v<numvblocks ; v++)

-//		{

-	movl	C(r_lightptr),%ebx

-	movl	C(r_numvblocks),%eax

-	movl	%eax,sb_v

-	movl	C(prowdestbase),%edi

-	movl	C(pbasesource),%esi

-Lv_loop_mip1:

-//			lightleft = lightptr[0];

-//			lightright = lightptr[1];

-//			lightdelta = (lightleft - lightright) & 0xFFFFF;

-	movl	(%ebx),%eax			// lightleft

-	movl	4(%ebx),%edx		// lightright

-	movl	%eax,%ebp

-	movl	C(r_lightwidth),%ecx

-	movl	%edx,C(lightright)

-	subl	%edx,%ebp

-	andl	$0xFFFFF,%ebp

-	leal	(%ebx,%ecx,4),%ebx

-//			lightptr += lightwidth;

-	movl	%ebx,C(r_lightptr)

-//			lightleftstep = (lightptr[0] - lightleft) >> blockdivshift;

-//			lightrightstep = (lightptr[1] - lightright) >> blockdivshift;

-//			lightdeltastep = ((lightleftstep - lightrightstep) & 0xFFFFF) |

-//					0xF0000000;

-	movl	4(%ebx),%ecx	// lightptr[1]

-	movl	(%ebx),%ebx		// lightptr[0]

-	subl	%eax,%ebx

-	subl	%edx,%ecx

-	sarl	$3,%ecx

-	orl		$0x70000000,%ebp

-	sarl	$3,%ebx

-	movl	%ecx,C(lightrightstep)

-	subl	%ecx,%ebx

-	andl	$0xFFFFF,%ebx

-	orl		$0xF0000000,%ebx

-	subl	%ecx,%ecx	// high word must be 0 in loop for addressing

-	movl	%ebx,C(lightdeltastep)

-	subl	%ebx,%ebx	// high word must be 0 in loop for addressing

-Lblockloop8_mip1:

-	movl	%ebp,C(lightdelta)

-	movb	6(%esi),%cl

-	sarl	$3,%ebp

-	movb	%dh,%bh

-	movb	7(%esi),%bl

-	addl	%ebp,%edx

-	movb	%dh,%ch

-	addl	%ebp,%edx

-	movb	0x12345678(%ebx),%ah

-LBPatch22:

-	movb	5(%esi),%bl

-	movb	0x12345678(%ecx),%al

-LBPatch23:

-	movb	4(%esi),%cl

-	movb	%dh,%bh

-	addl	%ebp,%edx

-	rorl	$16,%eax

-	movb	%dh,%ch

-	addl	%ebp,%edx

-	movb	0x12345678(%ebx),%ah

-LBPatch24:

-	movb	3(%esi),%bl

-	movb	0x12345678(%ecx),%al

-LBPatch25:

-	movb	2(%esi),%cl

-	movl	%eax,4(%edi)

-	movb	%dh,%bh

-	addl	%ebp,%edx

-	movb	%dh,%ch

-	addl	%ebp,%edx

-	movb	0x12345678(%ebx),%ah

-LBPatch26:

-	movb	1(%esi),%bl

-	movb	0x12345678(%ecx),%al

-LBPatch27:

-	movb	(%esi),%cl

-	movb	%dh,%bh

-	addl	%ebp,%edx

-	rorl	$16,%eax

-	movb	%dh,%ch

-	movb	0x12345678(%ebx),%ah

-LBPatch28:

-	movl	C(lightright),%edx

-	movb	0x12345678(%ecx),%al

-LBPatch29:

-	movl	C(lightdelta),%ebp

-	movl	%eax,(%edi)

-	movl	C(sourcetstep),%eax

-	addl	%eax,%esi

-	movl	C(surfrowbytes),%eax

-	addl	%eax,%edi

-	movl	C(lightrightstep),%eax

-	addl	%eax,%edx

-	movl	C(lightdeltastep),%eax

-	addl	%eax,%ebp

-	movl	%edx,C(lightright)

-	jc		Lblockloop8_mip1

-//			if (pbasesource >= r_sourcemax)

-//				pbasesource -= stepback;

-	cmpl	C(r_sourcemax),%esi

-	jb		LSkip_mip1

-	subl	C(r_stepback),%esi

-LSkip_mip1:

-	movl	C(r_lightptr),%ebx

-	decl	sb_v

-	jnz		Lv_loop_mip1

-	popl	%ebx				// restore register variables

-	popl	%esi

-	popl	%edi

-	popl	%ebp				// restore the caller's stack frame

-	ret

-//----------------------------------------------------------------------

-// Surface block drawer for mip level 2

-//----------------------------------------------------------------------

-	.align 4

-.globl C(R_DrawSurfaceBlock8_mip2)

-C(R_DrawSurfaceBlock8_mip2):

-	pushl	%ebp				// preserve caller's stack frame

-	pushl	%edi

-	pushl	%esi				// preserve register variables

-	pushl	%ebx

-//		for (v=0 ; v<numvblocks ; v++)

-//		{

-	movl	C(r_lightptr),%ebx

-	movl	C(r_numvblocks),%eax

-	movl	%eax,sb_v

-	movl	C(prowdestbase),%edi

-	movl	C(pbasesource),%esi

-Lv_loop_mip2:

-//			lightleft = lightptr[0];

-//			lightright = lightptr[1];

-//			lightdelta = (lightleft - lightright) & 0xFFFFF;

-	movl	(%ebx),%eax			// lightleft

-	movl	4(%ebx),%edx		// lightright

-	movl	%eax,%ebp

-	movl	C(r_lightwidth),%ecx

-	movl	%edx,C(lightright)

-	subl	%edx,%ebp

-	andl	$0xFFFFF,%ebp

-	leal	(%ebx,%ecx,4),%ebx

-//			lightptr += lightwidth;

-	movl	%ebx,C(r_lightptr)

-//			lightleftstep = (lightptr[0] - lightleft) >> blockdivshift;

-//			lightrightstep = (lightptr[1] - lightright) >> blockdivshift;

-//			lightdeltastep = ((lightleftstep - lightrightstep) & 0xFFFFF) |

-//					0xF0000000;

-	movl	4(%ebx),%ecx	// lightptr[1]

-	movl	(%ebx),%ebx		// lightptr[0]

-	subl	%eax,%ebx

-	subl	%edx,%ecx

-	sarl	$2,%ecx

-	orl		$0x30000000,%ebp

-	sarl	$2,%ebx

-	movl	%ecx,C(lightrightstep)

-	subl	%ecx,%ebx

-	andl	$0xFFFFF,%ebx

-	orl		$0xF0000000,%ebx

-	subl	%ecx,%ecx	// high word must be 0 in loop for addressing

-	movl	%ebx,C(lightdeltastep)

-	subl	%ebx,%ebx	// high word must be 0 in loop for addressing

-Lblockloop8_mip2:

-	movl	%ebp,C(lightdelta)

-	movb	2(%esi),%cl

-	sarl	$2,%ebp

-	movb	%dh,%bh

-	movb	3(%esi),%bl

-	addl	%ebp,%edx

-	movb	%dh,%ch

-	addl	%ebp,%edx

-	movb	0x12345678(%ebx),%ah

-LBPatch18:

-	movb	1(%esi),%bl

-	movb	0x12345678(%ecx),%al

-LBPatch19:

-	movb	(%esi),%cl

-	movb	%dh,%bh

-	addl	%ebp,%edx

-	rorl	$16,%eax

-	movb	%dh,%ch

-	movb	0x12345678(%ebx),%ah

-LBPatch20:

-	movl	C(lightright),%edx

-	movb	0x12345678(%ecx),%al

-LBPatch21:

-	movl	C(lightdelta),%ebp

-	movl	%eax,(%edi)

-	movl	C(sourcetstep),%eax

-	addl	%eax,%esi

-	movl	C(surfrowbytes),%eax

-	addl	%eax,%edi

-	movl	C(lightrightstep),%eax

-	addl	%eax,%edx

-	movl	C(lightdeltastep),%eax

-	addl	%eax,%ebp

-	movl	%edx,C(lightright)

-	jc		Lblockloop8_mip2

-//			if (pbasesource >= r_sourcemax)

-//				pbasesource -= stepback;

-	cmpl	C(r_sourcemax),%esi

-	jb		LSkip_mip2

-	subl	C(r_stepback),%esi

-LSkip_mip2:

-	movl	C(r_lightptr),%ebx

-	decl	sb_v

-	jnz		Lv_loop_mip2

-	popl	%ebx				// restore register variables

-	popl	%esi

-	popl	%edi

-	popl	%ebp				// restore the caller's stack frame

-	ret

-//----------------------------------------------------------------------

-// Surface block drawer for mip level 3

-//----------------------------------------------------------------------

-	.align 4

-.globl C(R_DrawSurfaceBlock8_mip3)

-C(R_DrawSurfaceBlock8_mip3):

-	pushl	%ebp				// preserve caller's stack frame

-	pushl	%edi

-	pushl	%esi				// preserve register variables

-	pushl	%ebx

-//		for (v=0 ; v<numvblocks ; v++)

-//		{

-	movl	C(r_lightptr),%ebx

-	movl	C(r_numvblocks),%eax

-	movl	%eax,sb_v

-	movl	C(prowdestbase),%edi

-	movl	C(pbasesource),%esi

-Lv_loop_mip3:

-//			lightleft = lightptr[0];

-//			lightright = lightptr[1];

-//			lightdelta = (lightleft - lightright) & 0xFFFFF;

-	movl	(%ebx),%eax			// lightleft

-	movl	4(%ebx),%edx		// lightright

-	movl	%eax,%ebp

-	movl	C(r_lightwidth),%ecx

-	movl	%edx,C(lightright)

-	subl	%edx,%ebp

-	andl	$0xFFFFF,%ebp

-	leal	(%ebx,%ecx,4),%ebx

-	movl	%ebp,C(lightdelta)

-//			lightptr += lightwidth;

-	movl	%ebx,C(r_lightptr)

-//			lightleftstep = (lightptr[0] - lightleft) >> blockdivshift;

-//			lightrightstep = (lightptr[1] - lightright) >> blockdivshift;

-//			lightdeltastep = ((lightleftstep - lightrightstep) & 0xFFFFF) |

-//					0xF0000000;

-	movl	4(%ebx),%ecx	// lightptr[1]

-	movl	(%ebx),%ebx		// lightptr[0]

-	subl	%eax,%ebx

-	subl	%edx,%ecx

-	sarl	$1,%ecx

-	sarl	$1,%ebx

-	movl	%ecx,C(lightrightstep)

-	subl	%ecx,%ebx

-	andl	$0xFFFFF,%ebx

-	sarl	$1,%ebp

-	orl		$0xF0000000,%ebx

-	movl	%ebx,C(lightdeltastep)

-	subl	%ebx,%ebx	// high word must be 0 in loop for addressing

-	movb	1(%esi),%bl

-	subl	%ecx,%ecx	// high word must be 0 in loop for addressing

-	movb	%dh,%bh

-	movb	(%esi),%cl

-	addl	%ebp,%edx

-	movb	%dh,%ch

-	movb	0x12345678(%ebx),%al

-LBPatch16:

-	movl	C(lightright),%edx

-	movb	%al,1(%edi)

-	movb	0x12345678(%ecx),%al

-LBPatch17:

-	movb	%al,(%edi)

-	movl	C(sourcetstep),%eax

-	addl	%eax,%esi

-	movl	C(surfrowbytes),%eax

-	addl	%eax,%edi

-	movl	C(lightdeltastep),%eax

-	movl	C(lightdelta),%ebp

-	movb	(%esi),%cl

-	addl	%eax,%ebp

-	movl	C(lightrightstep),%eax

-	sarl	$1,%ebp

-	addl	%eax,%edx

-	movb	%dh,%bh

-	movb	1(%esi),%bl

-	addl	%ebp,%edx

-	movb	%dh,%ch

-	movb	0x12345678(%ebx),%al

-LBPatch30:

-	movl	C(sourcetstep),%edx

-	movb	%al,1(%edi)

-	movb	0x12345678(%ecx),%al

-LBPatch31:

-	movb	%al,(%edi)

-	movl	C(surfrowbytes),%ebp

-	addl	%edx,%esi

-	addl	%ebp,%edi

-//			if (pbasesource >= r_sourcemax)

-//				pbasesource -= stepback;

-	cmpl	C(r_sourcemax),%esi

-	jb		LSkip_mip3

-	subl	C(r_stepback),%esi

-LSkip_mip3:

-	movl	C(r_lightptr),%ebx

-	decl	sb_v

-	jnz		Lv_loop_mip3

-	popl	%ebx				// restore register variables

-	popl	%esi

-	popl	%edi

-	popl	%ebp				// restore the caller's stack frame

-	ret

-.globl C(R_Surf8End)

-C(R_Surf8End):

-//----------------------------------------------------------------------

-// Code patching routines

-//----------------------------------------------------------------------

-	.data

-	.align 4

-LPatchTable8:

-	.long	LBPatch0-4

-	.long	LBPatch1-4

-	.long	LBPatch2-4

-	.long	LBPatch3-4

-	.long	LBPatch4-4

-	.long	LBPatch5-4

-	.long	LBPatch6-4

-	.long	LBPatch7-4

-	.long	LBPatch8-4

-	.long	LBPatch9-4

-	.long	LBPatch10-4

-	.long	LBPatch11-4

-	.long	LBPatch12-4

-	.long	LBPatch13-4

-	.long	LBPatch14-4

-	.long	LBPatch15-4

-	.long	LBPatch16-4

-	.long	LBPatch17-4

-	.long	LBPatch18-4

-	.long	LBPatch19-4

-	.long	LBPatch20-4

-	.long	LBPatch21-4

-	.long	LBPatch22-4

-	.long	LBPatch23-4

-	.long	LBPatch24-4

-	.long	LBPatch25-4

-	.long	LBPatch26-4

-	.long	LBPatch27-4

-	.long	LBPatch28-4

-	.long	LBPatch29-4

-	.long	LBPatch30-4

-	.long	LBPatch31-4

-	.text

-	.align 4

-.globl C(R_Surf8Patch)

-C(R_Surf8Patch):

-	pushl	%ebx

-	movl	C(colormap),%eax

-	movl	$LPatchTable8,%ebx

-	movl	$32,%ecx

-LPatchLoop8:

-	movl	(%ebx),%edx

-	addl	$4,%ebx

-	movl	%eax,(%edx)

-	decl	%ecx

-	jnz		LPatchLoop8

-	popl	%ebx

-	ret

-#endif	// id386

--- a/sys_dosa.s

+++ /dev/null

@@ -1,95 +1,0 @@

-//

-// sys_dosa.s

-// x86 assembly-language DOS-dependent routines.

-#include "asm_i386.h"

-#include "quakeasm.h"

-	.data

-	.align	4

-fpenv:

-	.long	0, 0, 0, 0, 0, 0, 0, 0

-	.text

-.globl C(MaskExceptions)

-C(MaskExceptions):

-	fnstenv	fpenv

-	orl		$0x3F,fpenv

-	fldenv	fpenv

-	ret

-/*

-.globl C(unmaskexceptions)

-C(unmaskexceptions):

-	fnstenv	fpenv

-	andl		$0xFFFFFFE0,fpenv

-	fldenv	fpenv

-	ret

-*/

-	.data

-	.align	4

-.globl	ceil_cw, single_cw, full_cw, cw, pushed_cw

-ceil_cw:	.long	0

-single_cw:	.long	0

-full_cw:	.long	0

-cw:			.long	0

-pushed_cw:	.long	0

-	.text

-.globl C(Sys_LowFPPrecision)

-C(Sys_LowFPPrecision):

-	fldcw	single_cw

-	ret

-.globl C(Sys_HighFPPrecision)

-C(Sys_HighFPPrecision):

-	fldcw	full_cw

-	ret

-.globl C(Sys_PushFPCW_SetHigh)

-C(Sys_PushFPCW_SetHigh):

-	fnstcw	pushed_cw

-	fldcw	full_cw

-	ret

-.globl C(Sys_PopFPCW)

-C(Sys_PopFPCW):

-	fldcw	pushed_cw

-	ret

-.globl C(Sys_SetFPCW)

-C(Sys_SetFPCW):

-	fnstcw	cw

-	movl	cw,%eax

-#ifdef	id386

-	andb	$0xF0,%ah

-	orb		$0x03,%ah	// round mode, 64-bit precision

-#endif

-	movl	%eax,full_cw

-#ifdef	id386

-	andb	$0xF0,%ah

-	orb		$0x0C,%ah	// chop mode, single precision

-#endif

-	movl	%eax,single_cw

-#ifdef	id386

-	andb	$0xF0,%ah

-	orb		$0x08,%ah	// ceil mode, single precision

-#endif

-	movl	%eax,ceil_cw

-	ret

--- /dev/null

+++ b/u/asm_draw.h

@@ -1,0 +1,132 @@

+//

+// asm_draw.h

+//

+// Include file for asm drawing routines.

+//

+//

+// !!! note that this file must match the corresponding C structures at all

+// times !!!

+//

+// !!! if this is changed, it must be changed in r_local.h too !!!

+#define	NEAR_CLIP	0.01

+// !!! if this is changed, it must be changed in r_local.h too !!!

+#define	CYCLE	128

+// espan_t structure

+// !!! if this is changed, it must be changed in r_shared.h too !!!

+#define espan_t_u    	0

+#define espan_t_v	    4

+#define espan_t_count   8

+#define espan_t_pnext	12

+#define espan_t_size    16

+// sspan_t structure

+// !!! if this is changed, it must be changed in d_local.h too !!!

+#define sspan_t_u    	0

+#define sspan_t_v	    4

+#define sspan_t_count   8

+#define sspan_t_size    12

+// spanpackage_t structure

+// !!! if this is changed, it must be changed in d_polyset.c too !!!

+#define spanpackage_t_pdest				0

+#define spanpackage_t_pz				4

+#define spanpackage_t_count				8

+#define spanpackage_t_ptex				12

+#define spanpackage_t_sfrac				16

+#define spanpackage_t_tfrac				20

+#define spanpackage_t_light				24

+#define spanpackage_t_zi				28

+#define spanpackage_t_size				32

+// edge_t structure

+// !!! if this is changed, it must be changed in r_shared.h too !!!

+#define et_u			0

+#define et_u_step		4

+#define et_prev			8

+#define et_next			12

+#define et_surfs		16

+#define et_nextremove	20

+#define et_nearzi		24

+#define et_owner		28

+#define et_size			32

+// surf_t structure

+// !!! if this is changed, it must be changed in r_shared.h too !!!

+#define SURF_T_SHIFT	6

+#define st_next			0

+#define st_prev			4

+#define st_spans		8

+#define st_key			12

+#define st_last_u		16

+#define st_spanstate	20

+#define st_flags		24

+#define st_data			28

+#define st_entity		32

+#define st_nearzi		36

+#define st_insubmodel	40

+#define st_d_ziorigin	44

+#define st_d_zistepu	48

+#define st_d_zistepv	52

+#define st_pad			56

+#define st_size			64

+// clipplane_t structure

+// !!! if this is changed, it must be changed in r_local.h too !!!

+#define cp_normal		0

+#define cp_dist			12

+#define cp_next			16

+#define cp_leftedge		20

+#define cp_rightedge	21

+#define cp_reserved		22

+#define cp_size			24

+// medge_t structure

+// !!! if this is changed, it must be changed in model.h too !!!

+#define me_v				0

+#define me_cachededgeoffset	4

+#define me_size				8

+// mvertex_t structure

+// !!! if this is changed, it must be changed in model.h too !!!

+#define mv_position		0

+#define mv_size			12

+// refdef_t structure

+// !!! if this is changed, it must be changed in render.h too !!!

+#define rd_vrect					0

+#define rd_aliasvrect				20

+#define rd_vrectright				40

+#define rd_vrectbottom				44

+#define rd_aliasvrectright			48

+#define rd_aliasvrectbottom			52

+#define rd_vrectrightedge			56

+#define rd_fvrectx					60

+#define rd_fvrecty					64

+#define rd_fvrectx_adj				68

+#define rd_fvrecty_adj				72

+#define rd_vrect_x_adj_shift20		76

+#define rd_vrectright_adj_shift20	80

+#define rd_fvrectright_adj			84

+#define rd_fvrectbottom_adj			88

+#define rd_fvrectright				92

+#define rd_fvrectbottom				96

+#define rd_horizontalFieldOfView	100

+#define rd_xOrigin					104

+#define rd_yOrigin					108

+#define rd_vieworg					112

+#define rd_viewangles				124

+#define rd_ambientlight				136

+#define rd_size						140

+// mtriangle_t structure

+// !!! if this is changed, it must be changed in model.h too !!!

+#define mtri_facesfront		0

+#define mtri_vertindex		4

+#define mtri_size			16	// !!! if this changes, array indexing in !!!

+								// !!! d_polysa.s must be changed to match !!!

+#define mtri_shift			4

--- /dev/null

+++ b/u/asm_i386.h

@@ -1,0 +1,78 @@

+#ifndef __ASM_I386__

+#define __ASM_I386__

+#ifdef ELF

+#define C(label) label		// ELF: C symbol names are used unchanged

+#endif

+#ifndef ELF

+#define C(label) _##label	// otherwise: prepend one underscore to the C name

+#endif

+//

+// !!! note that this file must match the corresponding C structures at all

+// times !!!

+//

+// plane_t structure

+// !!! if this is changed, it must be changed in model.h too !!!

+// !!! if the size of this is changed, the array lookup in SV_HullPointContents

+//     must be changed too !!!

+#define pl_normal	0	// occupies bytes 0-11

+#define pl_dist		12	// occupies bytes 12-15

+#define pl_type		16	// single byte

+#define pl_signbits	17	// single byte

+#define pl_pad		18	// bytes 18-19, fills out to pl_size

+#define pl_size		20	// offset just past the last field

+// hull_t structure

+// !!! if this is changed, it must be changed in model.h too !!!

+#define	hu_clipnodes		0

+#define	hu_planes			4

+#define	hu_firstclipnode	8

+#define	hu_lastclipnode		12

+#define	hu_clip_mins		16	// occupies bytes 16-27

+#define	hu_clip_maxs		28	// occupies bytes 28-39

+#define hu_size  			40	// offset just past the last field

+// dnode_t structure

+// !!! if this is changed, it must be changed in bspfile.h too !!!

+#define	nd_planenum		0

+#define	nd_children		4

+#define	nd_mins			8	// occupies bytes 8-19

+#define	nd_maxs			20	// occupies bytes 20-31

+#define	nd_firstface	32

+#define	nd_numfaces		36

+#define nd_size			40	// offset just past the last field

+// sfxcache_t structure (no sfxc_size is defined for this one)

+// !!! if this is changed, it must be changed in sound.h too !!!

+#define sfxc_length		0

+#define sfxc_loopstart	4

+#define sfxc_speed		8

+#define sfxc_width		12

+#define sfxc_stereo		16

+#define sfxc_data		20

+// channel_t structure

+// !!! if this is changed, it must be changed in sound.h too !!!

+#define ch_sfx			0

+#define ch_leftvol		4

+#define ch_rightvol		8

+#define ch_end			12

+#define ch_pos			16

+#define ch_looping		20

+#define ch_entnum		24

+#define ch_entchannel	28

+#define ch_origin		32	// occupies bytes 32-43

+#define ch_dist_mult	44

+#define ch_master_vol	48

+#define ch_size			52	// offset just past the last field

+// portable_samplepair_t structure

+// !!! if this is changed, it must be changed in sound.h too !!!

+#define psp_left		0

+#define psp_right		4

+#define psp_size		8	// offset just past the last field

+#endif

--- /dev/null

+++ b/u/block16.h

@@ -1,0 +1,123 @@

+LEnter16_16:		// entry for 16 texels: falls through all 8 two-texel units below

+	movb	(%esi),%al			// first source texel of the pair -> index bits 0-7

+	movb	(%esi,%ebx,),%cl	// second source texel, %ebx bytes further on

+	movb	%dh,%ah				// %dh supplies index bits 8-15 for the first texel

+	addl	%ebp,%edx			// step %edx by %ebp between texels

+	movb	%dh,%ch				// index bits 8-15 for the second texel

+	leal	(%esi,%ebx,2),%esi	// advance source by two strides (flags untouched)

+	movw	0x12345678(,%eax,2),%ax	// 16-bit table lookup; 0x12345678 is a placeholder address,
								// presumably patched at run time via LBPatch0 - TODO confirm

+LBPatch0:			// marks the end of the instruction above (its address field is the 4 bytes before)

+	addl	%ebp,%edx			// step again for the next pair

+	movw	%ax,(%edi)			// store first 16-bit result

+	movw	0x12345678(,%ecx,2),%cx	// same lookup for the second texel (uses all of %ecx as index)

+LBPatch1:

+	movw	%cx,2(%edi)			// store second 16-bit result

+	addl	$0x4,%edi			// two 16-bit results written per unit

+	movb	(%esi),%al

+	movb	(%esi,%ebx,),%cl

+	movb	%dh,%ah

+	addl	%ebp,%edx

+	movb	%dh,%ch

+	leal	(%esi,%ebx,2),%esi

+	movw	0x12345678(,%eax,2),%ax

+LBPatch2:

+	addl	%ebp,%edx

+	movw	%ax,(%edi)

+	movw	0x12345678(,%ecx,2),%cx

+LBPatch3:

+	movw	%cx,2(%edi)

+	addl	$0x4,%edi

+	movb	(%esi),%al

+	movb	(%esi,%ebx,),%cl

+	movb	%dh,%ah

+	addl	%ebp,%edx

+	movb	%dh,%ch

+	leal	(%esi,%ebx,2),%esi

+	movw	0x12345678(,%eax,2),%ax

+LBPatch4:

+	addl	%ebp,%edx

+	movw	%ax,(%edi)

+	movw	0x12345678(,%ecx,2),%cx

+LBPatch5:

+	movw	%cx,2(%edi)

+	addl	$0x4,%edi

+	movb	(%esi),%al

+	movb	(%esi,%ebx,),%cl

+	movb	%dh,%ah

+	addl	%ebp,%edx

+	movb	%dh,%ch

+	leal	(%esi,%ebx,2),%esi

+	movw	0x12345678(,%eax,2),%ax

+LBPatch6:

+	addl	%ebp,%edx

+	movw	%ax,(%edi)

+	movw	0x12345678(,%ecx,2),%cx

+LBPatch7:

+	movw	%cx,2(%edi)

+	addl	$0x4,%edi

+LEnter8_16:			// entry for 8 texels: the remaining 4 units

+	movb	(%esi),%al

+	movb	(%esi,%ebx,),%cl

+	movb	%dh,%ah

+	addl	%ebp,%edx

+	movb	%dh,%ch

+	leal	(%esi,%ebx,2),%esi

+	movw	0x12345678(,%eax,2),%ax

+LBPatch8:

+	addl	%ebp,%edx

+	movw	%ax,(%edi)

+	movw	0x12345678(,%ecx,2),%cx

+LBPatch9:

+	movw	%cx,2(%edi)

+	addl	$0x4,%edi

+	movb	(%esi),%al

+	movb	(%esi,%ebx,),%cl

+	movb	%dh,%ah

+	addl	%ebp,%edx

+	movb	%dh,%ch

+	leal	(%esi,%ebx,2),%esi

+	movw	0x12345678(,%eax,2),%ax

+LBPatch10:

+	addl	%ebp,%edx

+	movw	%ax,(%edi)

+	movw	0x12345678(,%ecx,2),%cx

+LBPatch11:

+	movw	%cx,2(%edi)

+	addl	$0x4,%edi

+LEnter4_16:			// entry for 4 texels: the remaining 2 units

+	movb	(%esi),%al

+	movb	(%esi,%ebx,),%cl

+	movb	%dh,%ah

+	addl	%ebp,%edx

+	movb	%dh,%ch

+	leal	(%esi,%ebx,2),%esi

+	movw	0x12345678(,%eax,2),%ax

+LBPatch12:

+	addl	%ebp,%edx

+	movw	%ax,(%edi)

+	movw	0x12345678(,%ecx,2),%cx

+LBPatch13:

+	movw	%cx,2(%edi)

+	addl	$0x4,%edi

+LEnter2_16:			// entry for 2 texels: the final unit only

+	movb	(%esi),%al

+	movb	(%esi,%ebx,),%cl

+	movb	%dh,%ah

+	addl	%ebp,%edx

+	movb	%dh,%ch

+	leal	(%esi,%ebx,2),%esi

+	movw	0x12345678(,%eax,2),%ax

+LBPatch14:

+	addl	%ebp,%edx

+	movw	%ax,(%edi)

+	movw	0x12345678(,%ecx,2),%cx

+LBPatch15:

+	movw	%cx,2(%edi)

+	addl	$0x4,%edi			// no ret: execution continues in the including file

--- /dev/null

+++ b/u/d_draw.s

@@ -1,0 +1,1018 @@

+//

+// d_draw.s

+// x86 assembly-language horizontal 8-bpp span-drawing code.

+//

+#include "asm_i386.h"

+#include "quakeasm.h"

+#include "asm_draw.h"

+#include "d_ifacea.h"

+#ifdef	id386

+//----------------------------------------------------------------------

+// 8-bpp horizontal span drawing code for polygons, with no transparency.

+//

+// Assumes there is at least one span in pspans, and that every span

+// contains at least one pixel

+//----------------------------------------------------------------------

+	.text

+// out-of-line, rarely-needed clamping code; each stub jumps back to its LClampReentryN

+LClampHigh0:

+	movl	C(bbextents),%esi	// s above range: clamp to bbextents

+	jmp		LClampReentry0

+LClampHighOrLow0:

+	jg		LClampHigh0			// flags are still those of the cmpl before the ja that got us here

+	xorl	%esi,%esi			// failed the unsigned test but not signed-greater: clamp s to 0

+	jmp		LClampReentry0

+LClampHigh1:

+	movl	C(bbextentt),%edx	// t above range: clamp to bbextentt

+	jmp		LClampReentry1

+LClampHighOrLow1:

+	jg		LClampHigh1			// same signed/unsigned split as LClampHighOrLow0, for t

+	xorl	%edx,%edx			// clamp t to 0

+	jmp		LClampReentry1

+LClampLow2:

+	movl	$2048,%ebp			// snext below 2048: clamp to 2048

+	jmp		LClampReentry2

+LClampHigh2:

+	movl	C(bbextents),%ebp	// snext above bbextents

+	jmp		LClampReentry2

+LClampLow3:

+	movl	$2048,%ecx			// tnext below 2048: clamp to 2048

+	jmp		LClampReentry3

+LClampHigh3:

+	movl	C(bbextentt),%ecx	// tnext above bbextentt

+	jmp		LClampReentry3

+LClampLow4:

+	movl	$2048,%eax			// last-segment snext below 2048

+	jmp		LClampReentry4

+LClampHigh4:

+	movl	C(bbextents),%eax	// last-segment snext above bbextents

+	jmp		LClampReentry4

+LClampLow5:

+	movl	$2048,%ebx			// last-segment tnext below 2048

+	jmp		LClampReentry5

+LClampHigh5:

+	movl	C(bbextentt),%ebx	// last-segment tnext above bbextentt

+	jmp		LClampReentry5

+#define pspans	4+16	// stack offset of the argument: return address (4) + four pushed registers (16)

+	.align 4

+.globl C(D_DrawSpans8)

+C(D_DrawSpans8):

+	pushl	%ebp				// preserve caller's stack frame

+	pushl	%edi

+	pushl	%esi				// preserve register variables

+	pushl	%ebx

+//

+// set up scaled-by-8 steps, for 8-long segments; also set up cacheblock

+// and span list pointers

+//

+// TODO: any overlap from rearranging?

+	flds	C(d_sdivzstepu)

+	fmuls	fp_8				// d_sdivzstepu*8

+	movl	C(cacheblock),%edx

+	flds	C(d_tdivzstepu)

+	fmuls	fp_8				// d_tdivzstepu*8 | d_sdivzstepu*8

+	movl	pspans(%esp),%ebx	// point to the first span descriptor

+	flds	C(d_zistepu)

+	fmuls	fp_8				// d_zistepu*8 | d_tdivzstepu*8 | d_sdivzstepu*8

+	movl	%edx,pbase			// pbase = cacheblock

+	fstps	zi8stepu			// pop the three products in reverse push order

+	fstps	tdivz8stepu

+	fstps	sdivz8stepu			// FP stack is empty again here

+LSpanLoop:			// once per span; %ebx = span descriptor

+//

+// set up the initial s/z, t/z, and 1/z on the FP stack, and generate the

+// initial s and t values

+//

+// FIXME: pipeline FILD?

+	fildl	espan_t_v(%ebx)	// dv

+	fildl	espan_t_u(%ebx)	// du | dv

+	fld		%st(1)			// dv | du | dv

+	fmuls	C(d_sdivzstepv)	// dv*d_sdivzstepv | du | dv

+	fld		%st(1)			// du | dv*d_sdivzstepv | du | dv

+	fmuls	C(d_sdivzstepu)	// du*d_sdivzstepu | dv*d_sdivzstepv | du | dv

+	fld		%st(2)			// du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv

+	fmuls	C(d_tdivzstepu)	// du*d_tdivzstepu | du*d_sdivzstepu |

+							//  dv*d_sdivzstepv | du | dv

+	fxch	%st(1)			// du*d_sdivzstepu | du*d_tdivzstepu |

+							//  dv*d_sdivzstepv | du | dv

+	faddp	%st(0),%st(2)	// du*d_tdivzstepu |

+							//  du*d_sdivzstepu + dv*d_sdivzstepv | du | dv

+	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |

+							//  du*d_tdivzstepu | du | dv

+	fld		%st(3)			// dv | du*d_sdivzstepu + dv*d_sdivzstepv |

+							//  du*d_tdivzstepu | du | dv

+	fmuls	C(d_tdivzstepv)	// dv*d_tdivzstepv |

+							//  du*d_sdivzstepu + dv*d_sdivzstepv |

+							//  du*d_tdivzstepu | du | dv

+	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |

+							//  dv*d_tdivzstepv | du*d_tdivzstepu | du | dv

+	fadds	C(d_sdivzorigin)	// sdivz = d_sdivzorigin + dv*d_sdivzstepv +

+							//  du*d_sdivzstepu; stays in %st(2) at end

+	fxch	%st(4)			// dv | dv*d_tdivzstepv | du*d_tdivzstepu | du |

+							//  s/z

+	fmuls	C(d_zistepv)		// dv*d_zistepv | dv*d_tdivzstepv |

+							//  du*d_tdivzstepu | du | s/z

+	fxch	%st(1)			// dv*d_tdivzstepv |  dv*d_zistepv |

+							//  du*d_tdivzstepu | du | s/z

+	faddp	%st(0),%st(2)	// dv*d_zistepv |

+							//  dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z

+	fxch	%st(2)			// du | dv*d_tdivzstepv + du*d_tdivzstepu |

+							//  dv*d_zistepv | s/z

+	fmuls	C(d_zistepu)		// du*d_zistepu |

+							//  dv*d_tdivzstepv + du*d_tdivzstepu |

+							//  dv*d_zistepv | s/z

+	fxch	%st(1)			// dv*d_tdivzstepv + du*d_tdivzstepu |

+							//  du*d_zistepu | dv*d_zistepv | s/z

+	fadds	C(d_tdivzorigin)	// tdivz = d_tdivzorigin + dv*d_tdivzstepv +

+							//  du*d_tdivzstepu; stays in %st(1) at end

+	fxch	%st(2)			// dv*d_zistepv | du*d_zistepu | t/z | s/z

+	faddp	%st(0),%st(1)	// dv*d_zistepv + du*d_zistepu | t/z | s/z

+	flds	fp_64k			// fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z

+	fxch	%st(1)			// dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z

+	fadds	C(d_ziorigin)		// zi = d_ziorigin + dv*d_zistepv +

+							//  du*d_zistepu; stays in %st(0) at end

+							// 1/z | fp_64k | t/z | s/z

+//

+// calculate and clamp s & t

+//

+	fdivr	%st(0),%st(1)	// 1/z | z*64k | t/z | s/z

+//

+// point %edi to the first pixel in the span

+//

+	movl	C(d_viewbuffer),%ecx

+	movl	espan_t_v(%ebx),%eax

+	movl	%ebx,pspantemp	// preserve spans pointer

+	movl	C(tadjust),%edx	// %edx accumulates t: tadjust now, + t at LFDIVInFlight1

+	movl	C(sadjust),%esi	// %esi accumulates s: sadjust now, + s at LFDIVInFlight1

+	movl	C(d_scantable)(,%eax,4),%edi	// v * screenwidth

+	addl	%ecx,%edi

+	movl	espan_t_u(%ebx),%ecx

+	addl	%ecx,%edi				// pdest = &pdestspan[scans->u];

+	movl	espan_t_count(%ebx),%ecx

+//

+// now start the FDIV for the end of the span

+//

+	cmpl	$8,%ecx

+	ja		LSetupNotLast1	// more than 8 pixels: %ecx keeps the full count

+	decl	%ecx			// 8 or fewer: %ecx = count - 1 from here on

+	jz		LCleanup1		// if only one pixel, no need to start an FDIV

+	movl	%ecx,spancountminus1

+// finish up the s and t calcs

+	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

+	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z

+	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z

+	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z

+	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z

+	fxch	%st(1)			// s | t | 1/z | t/z | s/z

+	fistpl	s				// 1/z | t | t/z | s/z

+	fistpl	t				// 1/z | t/z | s/z

+	fildl	spancountminus1

+	flds	C(d_tdivzstepu)	// C(d_tdivzstepu) | spancountminus1

+	flds	C(d_zistepu)		// C(d_zistepu) | C(d_tdivzstepu) | spancountminus1

+	fmul	%st(2),%st(0)	// C(d_zistepu)*scm1 | C(d_tdivzstepu) | scm1

+	fxch	%st(1)			// C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1

+	fmul	%st(2),%st(0)	// C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1

+	fxch	%st(2)			// scm1 | C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1

+	fmuls	C(d_sdivzstepu)	// C(d_sdivzstepu)*scm1 | C(d_zistepu)*scm1 |

+							//  C(d_tdivzstepu)*scm1

+	fxch	%st(1)			// C(d_zistepu)*scm1 | C(d_sdivzstepu)*scm1 |

+							//  C(d_tdivzstepu)*scm1

+	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1

+	fxch	%st(1)			// C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1

+	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1

+	faddp	%st(0),%st(3)	// 1/z | t/z | s/z, each advanced by scm1 steps

+	flds	fp_64k

+	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to

+							//  overlap

+	jmp		LFDIVInFlight1

+LCleanup1:

+// finish up the s and t calcs

+	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

+	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z

+	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z

+	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z

+	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z

+	fxch	%st(1)			// s | t | 1/z | t/z | s/z

+	fistpl	s				// 1/z | t | t/z | s/z

+	fistpl	t				// 1/z | t/z | s/z

+	jmp		LFDIVInFlight1

+	.align	4

+LSetupNotLast1:

+// finish up the s and t calcs

+	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

+	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z

+	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z

+	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z

+	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z

+	fxch	%st(1)			// s | t | 1/z | t/z | s/z

+	fistpl	s				// 1/z | t | t/z | s/z

+	fistpl	t				// 1/z | t/z | s/z

+	fadds	zi8stepu		// 1/z advanced 8 pixels | t/z | s/z

+	fxch	%st(2)			// s/z | t/z | 1/z

+	fadds	sdivz8stepu		// s/z advanced 8 pixels | t/z | 1/z

+	fxch	%st(2)			// 1/z | t/z | s/z

+	flds	tdivz8stepu		// tdivz8stepu | 1/z | t/z | s/z

+	faddp	%st(0),%st(2)	// 1/z | t/z advanced 8 pixels | s/z

+	flds	fp_64k			// fp_64k | 1/z | t/z | s/z

+	fdiv	%st(1),%st(0)	// z = 1/1/z

+							// this is what we've gone to all this trouble to

+							//  overlap

+LFDIVInFlight1:		// integer work below overlaps any FDIV started above

+	addl	s,%esi					// %esi = sadjust + s

+	addl	t,%edx					// %edx = tadjust + t

+	movl	C(bbextents),%ebx

+	movl	C(bbextentt),%ebp

+	cmpl	%ebx,%esi

+	ja		LClampHighOrLow0		// unsigned test: catches both negative and > bbextents

+LClampReentry0:

+	movl	%esi,s

+	movl	pbase,%ebx

+	shll	$16,%esi				// keep only the low 16 bits of s, left-justified

+	cmpl	%ebp,%edx

+	movl	%esi,sfracf				// movl leaves the cmpl flags intact for the ja below

+	ja		LClampHighOrLow1

+LClampReentry1:

+	movl	%edx,t

+	movl	s,%esi					// sfrac = scans->sfrac;

+	shll	$16,%edx				// keep only the low 16 bits of t, left-justified

+	movl	t,%eax					// tfrac = scans->tfrac;

+	sarl	$16,%esi				// %esi = s >> 16

+	movl	%edx,tfracf

+//

+// calculate the texture starting address

+//

+	sarl	$16,%eax				// %eax = t >> 16

+	movl	C(cachewidth),%edx

+	imull	%edx,%eax				// (tfrac >> 16) * cachewidth

+	addl	%ebx,%esi

+	addl	%eax,%esi				// psource = pbase + (sfrac >> 16) +

+									//           ((tfrac >> 16) * cachewidth);

+//

+// determine whether last span or not

+//

+	cmpl	$8,%ecx					// %ecx is the full count if > 8, else count - 1

+	jna		LLastSegment

+//

+// not the last segment; do full 8-wide segment

+//

+LNotLastSegment:

+//

+// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to

+// get there

+//

+// pick up after the FDIV that was left in flight previously

+	fld		%st(0)			// duplicate it

+	fmul	%st(4),%st(0)	// s = s/z * z

+	fxch	%st(1)

+	fmul	%st(3),%st(0)	// t = t/z * z

+	fxch	%st(1)

+	fistpl	snext

+	fistpl	tnext

+	movl	snext,%eax

+	movl	tnext,%edx

+	movb	(%esi),%bl	// get first source texel

+	subl	$8,%ecx		// count off this segment's pixels

+	movl	C(sadjust),%ebp

+	movl	%ecx,counttemp	// remember count of remaining pixels

+	movl	C(tadjust),%ecx

+	movb	%bl,(%edi)	// store first dest pixel

+	addl	%eax,%ebp	// %ebp = sadjust + snext

+	addl	%edx,%ecx	// %ecx = tadjust + tnext

+	movl	C(bbextents),%eax

+	movl	C(bbextentt),%edx

+	cmpl	$2048,%ebp

+	jl		LClampLow2

+	cmpl	%eax,%ebp

+	ja		LClampHigh2

+LClampReentry2:

+	cmpl	$2048,%ecx

+	jl		LClampLow3

+	cmpl	%edx,%ecx

+	ja		LClampHigh3

+LClampReentry3:

+	movl	%ebp,snext

+	movl	%ecx,tnext

+	subl	s,%ebp		// %ebp = s delta across the 8-pixel segment

+	subl	t,%ecx		// %ecx = t delta across the 8-pixel segment

+//

+// set up advancetable

+//

+	movl	%ecx,%eax

+	movl	%ebp,%edx

+	sarl	$19,%eax			// tstep >>= 19: the >>16 plus /8 for a per-pixel whole step

+	jz		LZero				// zero whole-texel t step: skip the multiply

+	sarl	$19,%edx			// sstep >>= 19: the >>16 plus /8 for a per-pixel whole step

+	movl	C(cachewidth),%ebx

+	imull	%ebx,%eax

+	jmp		LSetUp1

+LZero:

+	sarl	$19,%edx			// sstep >>= 19: the >>16 plus /8 for a per-pixel whole step

+	movl	C(cachewidth),%ebx

+LSetUp1:

+	addl	%edx,%eax			// add in sstep

+								// (tstep >> 16) * cachewidth + (sstep >> 16);

+	movl	tfracf,%edx

+	movl	%eax,advancetable+4	// advance base in t

+	addl	%ebx,%eax			// ((tstep >> 16) + 1) * cachewidth +

+								//  (sstep >> 16);

+	shll	$13,%ebp			// left-justify sstep fractional part

+	movl	sfracf,%ebx

+	shll	$13,%ecx			// left-justify tstep fractional part

+	movl	%eax,advancetable	// advance extra in t

+	movl	%ecx,tstep

+	addl	%ecx,%edx			// advance tfrac fractional part by tstep frac

+	sbbl	%ecx,%ecx			// turn tstep carry into -1 (0 if none)

+	addl	%ebp,%ebx			// advance sfrac fractional part by sstep frac

+	adcl	advancetable+4(,%ecx,4),%esi	// point to next source texel

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	movb	(%esi),%al

+	addl	%ebp,%ebx

+	movb	%al,1(%edi)

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	movb	%al,2(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	movb	%al,3(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi	// needs CF from the addl just above; do not reorder

+//

+// start FDIV for end of next segment in flight, so it can overlap

+//

+	movl	counttemp,%ecx

+	cmpl	$8,%ecx			// more than one segment after this?

+	ja		LSetupNotLast2	// yes

+	decl	%ecx			// %ecx = remaining count - 1

+	jz		LFDIVInFlight2	// if only one pixel, no need to start an FDIV

+	movl	%ecx,spancountminus1

+	fildl	spancountminus1

+	flds	C(d_zistepu)		// C(d_zistepu) | spancountminus1

+	fmul	%st(1),%st(0)	// C(d_zistepu)*scm1 | scm1

+	flds	C(d_tdivzstepu)	// C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1

+	fmul	%st(2),%st(0)	// C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1

+	fxch	%st(1)			// C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 | scm1

+	faddp	%st(0),%st(3)	// C(d_tdivzstepu)*scm1 | scm1

+	fxch	%st(1)			// scm1 | C(d_tdivzstepu)*scm1

+	fmuls	C(d_sdivzstepu)	// C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1

+	fxch	%st(1)			// C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1

+	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1

+	flds	fp_64k			// 64k | C(d_sdivzstepu)*scm1

+	fxch	%st(1)			// C(d_sdivzstepu)*scm1 | 64k

+	faddp	%st(0),%st(4)	// 64k

+	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to

+							//  overlap

+	jmp		LFDIVInFlight2

+	.align	4

+LSetupNotLast2:

+	fadds	zi8stepu		// advance 1/z by 8 pixels

+	fxch	%st(2)

+	fadds	sdivz8stepu		// advance s/z by 8 pixels

+	fxch	%st(2)

+	flds	tdivz8stepu

+	faddp	%st(0),%st(2)	// advance t/z by 8 pixels

+	flds	fp_64k

+	fdiv	%st(1),%st(0)	// z = 1/1/z

+							// this is what we've gone to all this trouble to

+							//  overlap

+LFDIVInFlight2:

+	movl	%ecx,counttemp	// full remaining count if > 8, else remaining - 1

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	movb	%al,4(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	movb	%al,5(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	movb	%al,6(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	$8,%edi			// eighth pixel is stored below at -1(%edi)

+	movl	%edx,tfracf

+	movl	snext,%edx

+	movl	%ebx,sfracf

+	movl	tnext,%ebx

+	movl	%edx,s			// segment end becomes the next segment's start

+	movl	%ebx,t

+	movl	counttemp,%ecx		// retrieve count

+//

+// determine whether last span or not

+//

+	cmpl	$8,%ecx				// are there multiple segments remaining?

+	movb	%al,-1(%edi)		// movb leaves the cmpl flags intact for the ja

+	ja		LNotLastSegment		// yes

+//

+// last segment of scan

+//

+LLastSegment:		// on entry %ecx = pixels left in the span minus 1 (0..7)

+//

+// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to

+// get there. The number of pixels left is variable, and we want to land on the

+// last pixel, not step one past it, so we can't run into arithmetic problems

+//

+	testl	%ecx,%ecx

+	jz		LNoSteps		// just draw the last pixel and we're done

+// pick up after the FDIV that was left in flight previously

+	fld		%st(0)			// duplicate it

+	fmul	%st(4),%st(0)	// s = s/z * z

+	fxch	%st(1)

+	fmul	%st(3),%st(0)	// t = t/z * z

+	fxch	%st(1)

+	fistpl	snext

+	fistpl	tnext

+	movb	(%esi),%al		// load first texel in segment

+	movl	C(tadjust),%ebx

+	movb	%al,(%edi)		// store first pixel in segment

+	movl	C(sadjust),%eax

+	addl	snext,%eax		// %eax = sadjust + snext

+	addl	tnext,%ebx		// %ebx = tadjust + tnext

+	movl	C(bbextents),%ebp

+	movl	C(bbextentt),%edx

+	cmpl	$2048,%eax

+	jl		LClampLow4

+	cmpl	%ebp,%eax

+	ja		LClampHigh4

+LClampReentry4:

+	movl	%eax,snext

+	cmpl	$2048,%ebx

+	jl		LClampLow5

+	cmpl	%edx,%ebx

+	ja		LClampHigh5

+LClampReentry5:

+	cmpl	$1,%ecx			// don't bother

+	je		LOnlyOneStep	// if two pixels in segment, there's only one step,

+							//  of the segment length

+	subl	s,%eax			// %eax = s delta to the last pixel

+	subl	t,%ebx			// %ebx = t delta to the last pixel

+	addl	%eax,%eax		// convert to 15.17 format so multiply by 1.31

+	addl	%ebx,%ebx		//  reciprocal yields 16.48

+	imull	reciprocal_table-8(,%ecx,4) // sstep = (snext - s) / (spancount-1)

+	movl	%edx,%ebp		// one-operand imull leaves the high half of the product in %edx

+	movl	%ebx,%eax

+	imull	reciprocal_table-8(,%ecx,4) // tstep = (tnext - t) / (spancount-1)

+LSetEntryvec:		// here %ebp = sstep, %edx = tstep, %ecx = pixels left minus 1

+//

+// set up advancetable

+//

+	movl	entryvec_table(,%ecx,4),%ebx

+	movl	%edx,%eax

+	movl	%ebx,jumptemp		// entry point into code for RET later

+	movl	%ebp,%ecx

+	sarl	$16,%edx			// tstep >>= 16;

+	movl	C(cachewidth),%ebx

+	sarl	$16,%ecx			// sstep >>= 16;

+	imull	%ebx,%edx

+	addl	%ecx,%edx			// add in sstep

+								// (tstep >> 16) * cachewidth + (sstep >> 16);

+	movl	tfracf,%ecx

+	movl	%edx,advancetable+4	// advance base in t

+	addl	%ebx,%edx			// ((tstep >> 16) + 1) * cachewidth +

+								//  (sstep >> 16);

+	shll	$16,%ebp			// left-justify sstep fractional part

+	movl	sfracf,%ebx

+	shll	$16,%eax			// left-justify tstep fractional part

+	movl	%edx,advancetable	// advance extra in t

+	movl	%eax,tstep			// %eax also keeps tstep; the EntryN_8 code adds it directly

+	movl	%ecx,%edx

+	addl	%eax,%edx			// advance tfrac by the tstep fraction

+	sbbl	%ecx,%ecx			// -1 if that carried, else 0

+	addl	%ebp,%ebx			// advance sfrac by the sstep fraction

+	adcl	advancetable+4(,%ecx,4),%esi	// step source pointer to the second texel

+	jmp		*jumptemp			// jump to the number-of-pixels handler

+//----------------------------------------

+LNoSteps:			// exactly one pixel left in the span

+	movb	(%esi),%al		// load first texel in segment

+	subl	$7,%edi			// adjust for hardwired offset

+	jmp		LEndSpan		// LEndSpan stores %al at 7(%edi), i.e. at the original %edi

+LOnlyOneStep:		// exactly two pixels left: the step is the whole delta, no divide

+	subl	s,%eax

+	subl	t,%ebx

+	movl	%eax,%ebp		// sstep = snext - s

+	movl	%ebx,%edx		// tstep = tnext - t

+	jmp		LSetEntryvec

+//---------------------------------------- EntryN_8: draw the rest of an N-pixel last segment

+.globl	Entry2_8

+Entry2_8:

+	subl	$6,%edi		// adjust for hardwired offsets

+	movb	(%esi),%al	// second (last) texel; stored at 7(%edi) by LEndSpan

+	jmp		LLEntry2_8

+//----------------------------------------

+.globl	Entry3_8

+Entry3_8:

+	subl	$5,%edi		// adjust for hardwired offsets

+	addl	%eax,%edx	// %eax still holds the tstep fraction set up at LSetEntryvec

+	movb	(%esi),%al

+	sbbl	%ecx,%ecx

+	addl	%ebp,%ebx

+	adcl	advancetable+4(,%ecx,4),%esi

+	jmp		LLEntry3_8

+//----------------------------------------

+.globl	Entry4_8

+Entry4_8:

+	subl	$4,%edi		// adjust for hardwired offsets

+	addl	%eax,%edx

+	movb	(%esi),%al

+	sbbl	%ecx,%ecx

+	addl	%ebp,%ebx

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx	// CF from this add is consumed by the sbbl at LLEntry4_8

+	jmp		LLEntry4_8

+//----------------------------------------

+.globl	Entry5_8

+Entry5_8:

+	subl	$3,%edi		// adjust for hardwired offsets

+	addl	%eax,%edx

+	movb	(%esi),%al

+	sbbl	%ecx,%ecx

+	addl	%ebp,%ebx

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	jmp		LLEntry5_8

+//----------------------------------------

+.globl	Entry6_8

+Entry6_8:

+	subl	$2,%edi		// adjust for hardwired offsets

+	addl	%eax,%edx

+	movb	(%esi),%al

+	sbbl	%ecx,%ecx

+	addl	%ebp,%ebx

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	jmp		LLEntry6_8

+//----------------------------------------

+.globl	Entry7_8

+Entry7_8:

+	decl	%edi		// adjust for hardwired offsets

+	addl	%eax,%edx

+	movb	(%esi),%al

+	sbbl	%ecx,%ecx

+	addl	%ebp,%ebx

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	jmp		LLEntry7_8

+//----------------------------------------

+.globl	Entry8_8

+Entry8_8:			// 8 pixels: no %edi adjustment, falls through every step below

+	addl	%eax,%edx

+	movb	(%esi),%al

+	sbbl	%ecx,%ecx

+	addl	%ebp,%ebx

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	movb	%al,1(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+LLEntry7_8:			// each LLEntryN_8 expects CF from the preceding addl tstep,%edx

+	sbbl	%ecx,%ecx

+	movb	%al,2(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+LLEntry6_8:

+	sbbl	%ecx,%ecx

+	movb	%al,3(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+LLEntry5_8:

+	sbbl	%ecx,%ecx

+	movb	%al,4(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+LLEntry4_8:

+	sbbl	%ecx,%ecx

+	movb	%al,5(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+LLEntry3_8:

+	movb	%al,6(%edi)

+	movb	(%esi),%al	// final texel; stored at 7(%edi) by LEndSpan

+LLEntry2_8:

+LEndSpan:			// %al = last pixel of the span, still to be stored at 7(%edi)

+//

+// clear s/z, t/z, 1/z from FP stack

+//

+	fstp %st(0)

+	fstp %st(0)

+	fstp %st(0)

+	movl	pspantemp,%ebx				// restore spans pointer

+	movl	espan_t_pnext(%ebx),%ebx	// point to next span

+	testl	%ebx,%ebx			// any more spans?

+	movb	%al,7(%edi)			// store the last pixel; movb leaves the testl flags intact

+	jnz		LSpanLoop			// more spans

+	popl	%ebx				// restore register variables

+	popl	%esi

+	popl	%edi

+	popl	%ebp				// restore the caller's stack frame

+	ret

+//----------------------------------------------------------------------

+// 8-bpp horizontal span z drawing code for polygons, with no transparency.

+// Entry: pzspans (the one stack argument) points to the first span of a list.

+// Assumes there is at least one span in pzspans, and that every span

+// contains at least one pixel

+//----------------------------------------------------------------------

+	.text

+// z-clamp for the add-stepped loop (LFSpanLoop); reached when 1/z > 0.5

+LClamp:

+	movl	$0x40000000,%edx	// start value for the clamped span (2^30)

+	xorl	%ebx,%ebx			// zero step for this span only; the loop top reloads izistep

+	fstp	%st(0)				// discard the unconverted 1/z

+	jmp		LZDraw

+// z-clamp for the subtract-stepped loop (LFNegSpanLoop)

+LClampNeg:

+	movl	$0x40000000,%edx	// start value for the clamped span (2^30)

+	xorl	%ebx,%ebx			// zero step for this span only; the loop top reloads izistep

+	fstp	%st(0)				// discard the unconverted 1/z

+	jmp		LZDrawNeg

+#define pzspans	4+16	// stack offset of the argument: return address (4) + four pushed registers (16)

+.globl C(D_DrawZSpans)

+C(D_DrawZSpans):

+	pushl	%ebp				// preserve caller's stack frame

+	pushl	%edi

+	pushl	%esi				// preserve register variables

+	pushl	%ebx

+	flds	C(d_zistepu)

+	movl	C(d_zistepu),%eax	// raw bits of the float, for the zero test below

+	movl	pzspans(%esp),%esi

+	testl	%eax,%eax

+	jz		LFNegSpan			// NOTE: jz, not js - only a zero gradient takes the "Neg" path;
							// negative gradients use LFSpanLoop with a negative izistep

+	fmuls	Float2ToThe31nd

+	fistpl	izistep		// note: we are relying on FP exceptions being turned

+						// off here to avoid range problems

+LFSpanLoop:

+	movl	izistep,%ebx	// reloaded for every span, because LClamp zeroes %ebx

+// set up the initial 1/z value

+	fildl	espan_t_v(%esi)

+	fildl	espan_t_u(%esi)

+	movl	espan_t_v(%esi),%ecx

+	movl	C(d_pzbuffer),%edi

+	fmuls	C(d_zistepu)

+	fxch	%st(1)

+	fmuls	C(d_zistepv)

+	fxch	%st(1)

+	fadds	C(d_ziorigin)

+	imull	C(d_zrowbytes),%ecx	// byte offset of row v in the z buffer

+	faddp	%st(0),%st(1)		// 1/z = d_ziorigin + u*d_zistepu + v*d_zistepv

+// clamp if z is nearer than 2 (1/z > 0.5)

+	fcoms	float_point5

+	addl	%ecx,%edi

+	movl	espan_t_u(%esi),%edx

+	addl	%edx,%edx				// word count

+	movl	espan_t_count(%esi),%ecx

+	addl	%edx,%edi				// pdest = &pdestspan[scans->u];

+	pushl	%esi		// preserve spans pointer

+	fnstsw	%ax

+	testb	$0x45,%ah	// C3|C2|C0 all clear means 1/z > 0.5

+	jz		LClamp

+	fmuls	Float2ToThe31nd

+	fistpl	izi			// note: we are relying on FP exceptions being turned

+						// off here to avoid problems when the span is closer

+						// than 1/(2**31)

+	movl	izi,%edx

+// at this point:

+// %ebx = izistep

+// %ecx = count

+// %edx = izi

+// %edi = pdest

+LZDraw:

+// do a single pixel up front, if necessary to dword align the destination

+	testl	$2,%edi

+	jz		LFMiddle

+	movl	%edx,%eax

+	addl	%ebx,%edx

+	shrl	$16,%eax			// the stored z is the top 16 bits of izi

+	decl	%ecx

+	movw	%ax,(%edi)

+	addl	$2,%edi

+// do middle a pair of aligned dwords at a time

+LFMiddle:

+	pushl	%ecx				// save count; its low bit is tested at LFLast

+	shrl	$1,%ecx				// count / 2

+	jz		LFLast				// no aligned dwords to do

+	shrl	$1,%ecx				// (count / 2) / 2

+	jnc		LFMiddleLoop		// even number of aligned dwords to do

+	movl	%edx,%eax

+	addl	%ebx,%edx

+	shrl	$16,%eax			// first pixel's z in the low word

+	movl	%edx,%esi			// %esi is scratch here; the span pointer is on the stack

+	addl	%ebx,%edx

+	andl	$0xFFFF0000,%esi	// second pixel's z in the high word

+	orl		%esi,%eax

+	movl	%eax,(%edi)

+	addl	$4,%edi

+	andl	%ecx,%ecx			// any dword pairs left?

+	jz		LFLast

+LFMiddleLoop:

+	movl	%edx,%eax

+	addl	%ebx,%edx

+	shrl	$16,%eax

+	movl	%edx,%esi

+	addl	%ebx,%edx

+	andl	$0xFFFF0000,%esi

+	orl		%esi,%eax

+	movl	%edx,%ebp

+	movl	%eax,(%edi)

+	addl	%ebx,%edx

+	shrl	$16,%ebp

+	movl	%edx,%esi

+	addl	%ebx,%edx

+	andl	$0xFFFF0000,%esi

+	orl		%esi,%ebp

+	movl	%ebp,4(%edi)	// FIXME: eliminate register contention

+	addl	$8,%edi

+	decl	%ecx

+	jnz		LFMiddleLoop

+LFLast:

+	popl	%ecx			// retrieve count

+	popl	%esi			// retrieve span pointer

+// do the last, unaligned pixel, if there is one

+	andl	$1,%ecx			// is there an odd pixel left to do?

+	jz		LFSpanDone		// no

+	shrl	$16,%edx

+	movw	%dx,(%edi)		// do the final pixel's z

+LFSpanDone:

+	movl	espan_t_pnext(%esi),%esi

+	testl	%esi,%esi

+	jnz		LFSpanLoop

+	jmp		LFDone

+LFNegSpan:

+	fmuls	FloatMinus2ToThe31nd

+	fistpl	izistep		// note: we are relying on FP exceptions being turned

+						// off here to avoid range problems

+LFNegSpanLoop:

+	movl	izistep,%ebx	// reloaded for every span, because LClampNeg zeroes %ebx

+// set up the initial 1/z value

+	fildl	espan_t_v(%esi)

+	fildl	espan_t_u(%esi)

+	movl	espan_t_v(%esi),%ecx

+	movl	C(d_pzbuffer),%edi

+	fmuls	C(d_zistepu)

+	fxch	%st(1)

+	fmuls	C(d_zistepv)

+	fxch	%st(1)

+	fadds	C(d_ziorigin)

+	imull	C(d_zrowbytes),%ecx	// byte offset of row v in the z buffer

+	faddp	%st(0),%st(1)		// 1/z = d_ziorigin + u*d_zistepu + v*d_zistepv

+// clamp if z is nearer than 2 (1/z > 0.5)

+	fcoms	float_point5

+	addl	%ecx,%edi

+	movl	espan_t_u(%esi),%edx

+	addl	%edx,%edx				// word count

+	movl	espan_t_count(%esi),%ecx

+	addl	%edx,%edi				// pdest = &pdestspan[scans->u];

+	pushl	%esi		// preserve spans pointer

+	fnstsw	%ax

+	testb	$0x45,%ah	// C3|C2|C0 all clear means 1/z > 0.5

+	jz		LClampNeg

+	fmuls	Float2ToThe31nd

+	fistpl	izi			// note: we are relying on FP exceptions being turned

+						// off here to avoid problems when the span is closer

+						// than 1/(2**31)

+	movl	izi,%edx

+// at this point:

+// %ebx = izistep

+// %ecx = count

+// %edx = izi

+// %edi = pdest

+LZDrawNeg:

+// do a single pixel up front, if necessary to dword align the destination

+	testl	$2,%edi

+	jz		LFNegMiddle

+	movl	%edx,%eax

+	subl	%ebx,%edx			// this path steps by subtracting izistep

+	shrl	$16,%eax

+	decl	%ecx

+	movw	%ax,(%edi)

+	addl	$2,%edi

+// do middle a pair of aligned dwords at a time

+LFNegMiddle:

+	pushl	%ecx				// save count; its low bit is tested at LFNegLast

+	shrl	$1,%ecx				// count / 2

+	jz		LFNegLast			// no aligned dwords to do

+	shrl	$1,%ecx				// (count / 2) / 2

+	jnc		LFNegMiddleLoop		// even number of aligned dwords to do

+	movl	%edx,%eax

+	subl	%ebx,%edx

+	shrl	$16,%eax			// first pixel's z in the low word

+	movl	%edx,%esi			// %esi is scratch here; the span pointer is on the stack

+	subl	%ebx,%edx

+	andl	$0xFFFF0000,%esi	// second pixel's z in the high word

+	orl		%esi,%eax

+	movl	%eax,(%edi)

+	addl	$4,%edi

+	andl	%ecx,%ecx			// any dword pairs left?

+	jz		LFNegLast

+LFNegMiddleLoop:

+	movl	%edx,%eax

+	subl	%ebx,%edx

+	shrl	$16,%eax

+	movl	%edx,%esi

+	subl	%ebx,%edx

+	andl	$0xFFFF0000,%esi

+	orl		%esi,%eax

+	movl	%edx,%ebp

+	movl	%eax,(%edi)

+	subl	%ebx,%edx

+	shrl	$16,%ebp

+	movl	%edx,%esi

+	subl	%ebx,%edx

+	andl	$0xFFFF0000,%esi

+	orl		%esi,%ebp

+	movl	%ebp,4(%edi)	// FIXME: eliminate register contention

+	addl	$8,%edi

+	decl	%ecx

+	jnz		LFNegMiddleLoop

+LFNegLast:

+	popl	%ecx			// retrieve count

+	popl	%esi			// retrieve span pointer

+// do the last, unaligned pixel, if there is one

+	andl	$1,%ecx			// is there an odd pixel left to do?

+	jz		LFNegSpanDone	// no

+	shrl	$16,%edx

+	movw	%dx,(%edi)		// do the final pixel's z

+LFNegSpanDone:

+	movl	espan_t_pnext(%esi),%esi

+	testl	%esi,%esi

+	jnz		LFNegSpanLoop

+LFDone:

+	popl	%ebx				// restore register variables

+	popl	%esi

+	popl	%edi

+	popl	%ebp				// restore the caller's stack frame

+	ret

+#endif	// id386

--- /dev/null

+++ b/u/d_draw16.s

@@ -1,0 +1,955 @@

+//

+// d_draw16.s

+// x86 assembly-language horizontal 8-bpp span-drawing code, with 16-pixel

+// subdivision.

+//

+#include "asm_i386.h"

+#include "quakeasm.h"

+#include "asm_draw.h"

+#include "d_ifacea.h"

+#ifdef	id386

+//----------------------------------------------------------------------

+// 8-bpp horizontal span drawing code for polygons, with no transparency and

+// 16-pixel subdivision.

+//

+// Assumes there is at least one span in pspans, and that every span

+// contains at least one pixel

+//----------------------------------------------------------------------

+	.data

+	.text

+// out-of-line, rarely-needed clamping code; each stub loads the clamp value and jumps back to its LClampReentryN label

+LClampHigh0:

+	movl	C(bbextents),%esi	// s above range: clamp to bbextents

+	jmp		LClampReentry0

+LClampHighOrLow0:

+	jg		LClampHigh0	// flags still come from the cmpl before the ja that got us here

+	xorl	%esi,%esi	// unsigned-above but not signed-greater: clamp s to 0

+	jmp		LClampReentry0

+LClampHigh1:

+	movl	C(bbextentt),%edx	// t above range: clamp to bbextentt

+	jmp		LClampReentry1

+LClampHighOrLow1:

+	jg		LClampHigh1	// flags still come from the cmpl before the ja that got us here

+	xorl	%edx,%edx	// unsigned-above but not signed-greater: clamp t to 0

+	jmp		LClampReentry1

+LClampLow2:

+	movl	$4096,%ebp	// snext (full segment) below 4096: clamp up to 4096

+	jmp		LClampReentry2

+LClampHigh2:

+	movl	C(bbextents),%ebp	// snext (full segment) above bbextents

+	jmp		LClampReentry2

+LClampLow3:

+	movl	$4096,%ecx	// tnext (full segment) below 4096: clamp up to 4096

+	jmp		LClampReentry3

+LClampHigh3:

+	movl	C(bbextentt),%ecx	// tnext (full segment) above bbextentt

+	jmp		LClampReentry3

+LClampLow4:

+	movl	$4096,%eax	// snext (last segment) below 4096: clamp up to 4096

+	jmp		LClampReentry4

+LClampHigh4:

+	movl	C(bbextents),%eax	// snext (last segment) above bbextents

+	jmp		LClampReentry4

+LClampLow5:

+	movl	$4096,%ebx	// end-of-segment t (last segment) below 4096: clamp up to 4096

+	jmp		LClampReentry5

+LClampHigh5:

+	movl	C(bbextentt),%ebx	// end-of-segment t (last segment) above bbextentt

+	jmp		LClampReentry5

+#define pspans	4+16	// %esp offset of the pspans argument: return address (4) + the four registers pushed below (16)

+	.align 4

+.globl C(D_DrawSpans16)

+C(D_DrawSpans16):

+	pushl	%ebp				// preserve caller's stack frame

+	pushl	%edi

+	pushl	%esi				// preserve register variables

+	pushl	%ebx

+//

+// set up scaled-by-16 steps, for 16-long segments; also set up cacheblock

+// and span list pointers

+//

+// TODO: any overlap from rearranging?

+	flds	C(d_sdivzstepu)

+	fmuls	fp_16	// d_sdivzstepu*16

+	movl	C(cacheblock),%edx

+	flds	C(d_tdivzstepu)

+	fmuls	fp_16	// d_tdivzstepu*16 | d_sdivzstepu*16

+	movl	pspans(%esp),%ebx	// point to the first span descriptor

+	flds	C(d_zistepu)

+	fmuls	fp_16	// d_zistepu*16 | d_tdivzstepu*16 | d_sdivzstepu*16

+	movl	%edx,pbase			// pbase = cacheblock

+	fstps	zi16stepu	// pop in reverse order of the loads above

+	fstps	tdivz16stepu

+	fstps	sdivz16stepu	// FP stack is empty again here

+LSpanLoop:

+//

+// set up the initial s/z, t/z, and 1/z on the FP stack, and generate the

+// initial s and t values

+//

+// FIXME: pipeline FILD?

+	fildl	espan_t_v(%ebx)	// dv

+	fildl	espan_t_u(%ebx)	// du | dv

+	fld		%st(1)			// dv | du | dv

+	fmuls	C(d_sdivzstepv)	// dv*d_sdivzstepv | du | dv

+	fld		%st(1)			// du | dv*d_sdivzstepv | du | dv

+	fmuls	C(d_sdivzstepu)	// du*d_sdivzstepu | dv*d_sdivzstepv | du | dv

+	fld		%st(2)			// du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv

+	fmuls	C(d_tdivzstepu)	// du*d_tdivzstepu | du*d_sdivzstepu |

+							//  dv*d_sdivzstepv | du | dv

+	fxch	%st(1)			// du*d_sdivzstepu | du*d_tdivzstepu |

+							//  dv*d_sdivzstepv | du | dv

+	faddp	%st(0),%st(2)	// du*d_tdivzstepu |

+							//  du*d_sdivzstepu + dv*d_sdivzstepv | du | dv

+	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |

+							//  du*d_tdivzstepu | du | dv

+	fld		%st(3)			// dv | du*d_sdivzstepu + dv*d_sdivzstepv |

+							//  du*d_tdivzstepu | du | dv

+	fmuls	C(d_tdivzstepv)	// dv*d_tdivzstepv |

+							//  du*d_sdivzstepu + dv*d_sdivzstepv |

+							//  du*d_tdivzstepu | du | dv

+	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |

+							//  dv*d_tdivzstepv | du*d_tdivzstepu | du | dv

+	fadds	C(d_sdivzorigin)	// sdivz = d_sdivzorigin + dv*d_sdivzstepv +

+							//  du*d_sdivzstepu; stays in %st(2) at end

+	fxch	%st(4)			// dv | dv*d_tdivzstepv | du*d_tdivzstepu | du |

+							//  s/z

+	fmuls	C(d_zistepv)		// dv*d_zistepv | dv*d_tdivzstepv |

+							//  du*d_tdivzstepu | du | s/z

+	fxch	%st(1)			// dv*d_tdivzstepv |  dv*d_zistepv |

+							//  du*d_tdivzstepu | du | s/z

+	faddp	%st(0),%st(2)	// dv*d_zistepv |

+							//  dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z

+	fxch	%st(2)			// du | dv*d_tdivzstepv + du*d_tdivzstepu |

+							//  dv*d_zistepv | s/z

+	fmuls	C(d_zistepu)		// du*d_zistepu |

+							//  dv*d_tdivzstepv + du*d_tdivzstepu |

+							//  dv*d_zistepv | s/z

+	fxch	%st(1)			// dv*d_tdivzstepv + du*d_tdivzstepu |

+							//  du*d_zistepu | dv*d_zistepv | s/z

+	fadds	C(d_tdivzorigin)	// tdivz = d_tdivzorigin + dv*d_tdivzstepv +

+							//  du*d_tdivzstepu; stays in %st(1) at end

+	fxch	%st(2)			// dv*d_zistepv | du*d_zistepu | t/z | s/z

+	faddp	%st(0),%st(1)	// dv*d_zistepv + du*d_zistepu | t/z | s/z

+	flds	fp_64k			// fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z

+	fxch	%st(1)			// dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z

+	fadds	C(d_ziorigin)		// zi = d_ziorigin + dv*d_zistepv +

+							//  du*d_zistepu; stays in %st(0) at end

+							// 1/z | fp_64k | t/z | s/z

+//

+// calculate and clamp s & t

+//

+	fdivr	%st(0),%st(1)	// 1/z | z*64k | t/z | s/z

+//

+// point %edi to the first pixel in the span

+//

+	movl	C(d_viewbuffer),%ecx

+	movl	espan_t_v(%ebx),%eax

+	movl	%ebx,pspantemp	// preserve spans pointer

+	movl	C(tadjust),%edx	// %edx = tadjust; t is added in at LFDIVInFlight1

+	movl	C(sadjust),%esi	// %esi = sadjust; s is added in at LFDIVInFlight1

+	movl	C(d_scantable)(,%eax,4),%edi	// v * screenwidth

+	addl	%ecx,%edi

+	movl	espan_t_u(%ebx),%ecx

+	addl	%ecx,%edi				// pdest = &pdestspan[scans->u];

+	movl	espan_t_count(%ebx),%ecx

+//

+// now start the FDIV for the end of the span

+//

+	cmpl	$16,%ecx

+	ja		LSetupNotLast1	// more than 16 pixels: first segment is a full one

+	decl	%ecx	// %ecx = count-1 = number of steps in this (only) segment

+	jz		LCleanup1		// if only one pixel, no need to start an FDIV

+	movl	%ecx,spancountminus1

+// finish up the s and t calcs

+	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

+	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z

+	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z

+	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z

+	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z

+	fxch	%st(1)			// s | t | 1/z | t/z | s/z

+	fistpl	s				// 1/z | t | t/z | s/z

+	fistpl	t				// 1/z | t/z | s/z

+	fildl	spancountminus1

+	flds	C(d_tdivzstepu)	// C(d_tdivzstepu) | spancountminus1

+	flds	C(d_zistepu)		// C(d_zistepu) | C(d_tdivzstepu) | spancountminus1

+	fmul	%st(2),%st(0)	// C(d_zistepu)*scm1 | C(d_tdivzstepu) | scm1

+	fxch	%st(1)			// C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1

+	fmul	%st(2),%st(0)	// C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1

+	fxch	%st(2)			// scm1 | C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1

+	fmuls	C(d_sdivzstepu)	// C(d_sdivzstepu)*scm1 | C(d_zistepu)*scm1 |

+							//  C(d_tdivzstepu)*scm1

+	fxch	%st(1)			// C(d_zistepu)*scm1 | C(d_sdivzstepu)*scm1 |

+							//  C(d_tdivzstepu)*scm1

+	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1

+	fxch	%st(1)			// C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1

+	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1

+	faddp	%st(0),%st(3)	// 1/z | t/z | s/z, all advanced by scm1 steps

+	flds	fp_64k

+	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to

+							//  overlap

+	jmp		LFDIVInFlight1

+LCleanup1:

+// finish up the s and t calcs

+	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

+	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z

+	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z

+	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z

+	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z

+	fxch	%st(1)			// s | t | 1/z | t/z | s/z

+	fistpl	s				// 1/z | t | t/z | s/z

+	fistpl	t				// 1/z | t/z | s/z

+	jmp		LFDIVInFlight1	// no FDIV started; %ecx is 0 so LLastSegment takes LNoSteps

+	.align	4

+LSetupNotLast1:

+// finish up the s and t calcs

+	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

+	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z

+	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z

+	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z

+	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z

+	fxch	%st(1)			// s | t | 1/z | t/z | s/z

+	fistpl	s				// 1/z | t | t/z | s/z

+	fistpl	t				// 1/z | t/z | s/z

+	fadds	zi16stepu	// advance 1/z by 16 pixels

+	fxch	%st(2)

+	fadds	sdivz16stepu	// advance s/z by 16 pixels

+	fxch	%st(2)

+	flds	tdivz16stepu

+	faddp	%st(0),%st(2)	// advance t/z by 16 pixels

+	flds	fp_64k

+	fdiv	%st(1),%st(0)	// z = 1/(1/z), scaled by 64k

+							// this is what we've gone to all this trouble to

+							//  overlap

+LFDIVInFlight1:

+	addl	s,%esi	// %esi = sadjust + s

+	addl	t,%edx	// %edx = tadjust + t

+	movl	C(bbextents),%ebx

+	movl	C(bbextentt),%ebp

+	cmpl	%ebx,%esi

+	ja		LClampHighOrLow0	// unsigned compare: taken for both > bbextents and negative s

+LClampReentry0:

+	movl	%esi,s

+	movl	pbase,%ebx

+	shll	$16,%esi

+	cmpl	%ebp,%edx

+	movl	%esi,sfracf	// movl leaves the flags from the cmpl intact for the ja below

+	ja		LClampHighOrLow1

+LClampReentry1:

+	movl	%edx,t

+	movl	s,%esi					// sfrac = scans->sfrac;

+	shll	$16,%edx

+	movl	t,%eax					// tfrac = scans->tfrac;

+	sarl	$16,%esi

+	movl	%edx,tfracf

+//

+// calculate the texture starting address

+//

+	sarl	$16,%eax

+	movl	C(cachewidth),%edx

+	imull	%edx,%eax				// (tfrac >> 16) * cachewidth

+	addl	%ebx,%esi

+	addl	%eax,%esi				// psource = pbase + (sfrac >> 16) +

+									//           ((tfrac >> 16) * cachewidth);

+//

+// determine whether last segment or not

+//

+	cmpl	$16,%ecx	// %ecx = count if count > 16, else count-1 (decremented above)

+	jna		LLastSegment

+//

+// not the last segment; do full 16-wide segment

+//

+LNotLastSegment:

+//

+// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to

+// get there

+//

+// pick up after the FDIV that was left in flight previously

+	fld		%st(0)			// duplicate it

+	fmul	%st(4),%st(0)	// s = s/z * z

+	fxch	%st(1)

+	fmul	%st(3),%st(0)	// t = t/z * z

+	fxch	%st(1)

+	fistpl	snext

+	fistpl	tnext

+	movl	snext,%eax

+	movl	tnext,%edx

+	movb	(%esi),%bl	// get first source texel

+	subl	$16,%ecx		// count off this segment's pixels

+	movl	C(sadjust),%ebp

+	movl	%ecx,counttemp	// remember count of remaining pixels

+	movl	C(tadjust),%ecx

+	movb	%bl,(%edi)	// store first dest pixel

+	addl	%eax,%ebp	// %ebp = sadjust + snext

+	addl	%edx,%ecx	// %ecx = tadjust + tnext

+	movl	C(bbextents),%eax

+	movl	C(bbextentt),%edx

+	cmpl	$4096,%ebp

+	jl		LClampLow2

+	cmpl	%eax,%ebp

+	ja		LClampHigh2

+LClampReentry2:

+	cmpl	$4096,%ecx

+	jl		LClampLow3

+	cmpl	%edx,%ecx

+	ja		LClampHigh3

+LClampReentry3:

+	movl	%ebp,snext

+	movl	%ecx,tnext

+	subl	s,%ebp	// %ebp = snext - s over the 16-pixel segment

+	subl	t,%ecx	// %ecx = tnext - t over the 16-pixel segment

+//

+// set up advancetable

+//

+	movl	%ecx,%eax

+	movl	%ebp,%edx

+	sarl	$20,%eax			// integer part of per-pixel tstep: (tnext - t) >> 20 (>>4 for /16, >>16 for 16.16)

+	jz		LZero

+	sarl	$20,%edx			// integer part of per-pixel sstep: (snext - s) >> 20

+	movl	C(cachewidth),%ebx

+	imull	%ebx,%eax

+	jmp		LSetUp1

+LZero:

+	sarl	$20,%edx			// integer part of per-pixel sstep: (snext - s) >> 20

+	movl	C(cachewidth),%ebx

+LSetUp1:

+	addl	%edx,%eax			// add in sstep

+								// (tstep >> 16) * cachewidth + (sstep >> 16);

+	movl	tfracf,%edx

+	movl	%eax,advancetable+4	// advance base in t

+	addl	%ebx,%eax			// ((tstep >> 16) + 1) * cachewidth +

+								//  (sstep >> 16);

+	shll	$12,%ebp			// left-justify sstep fractional part

+	movl	sfracf,%ebx

+	shll	$12,%ecx			// left-justify tstep fractional part

+	movl	%eax,advancetable	// advance extra in t

+	movl	%ecx,tstep

+	addl	%ecx,%edx			// advance tfrac fractional part by tstep frac

+	sbbl	%ecx,%ecx			// turn tstep carry into -1 (0 if none)

+	addl	%ebp,%ebx			// advance sfrac fractional part by sstep frac

+	adcl	advancetable+4(,%ecx,4),%esi	// point to next source texel

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	movb	(%esi),%al

+	addl	%ebp,%ebx

+	movb	%al,1(%edi)

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	movb	%al,2(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	movb	%al,3(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	movb	%al,4(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	movb	%al,5(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	movb	%al,6(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	movb	%al,7(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+//

+// start FDIV for end of next segment in flight, so it can overlap

+//

+	movl	counttemp,%ecx

+	cmpl	$16,%ecx			// more than one segment after this?

+	ja		LSetupNotLast2	// yes

+	decl	%ecx	// %ecx = remaining count - 1 = steps in the final segment

+	jz		LFDIVInFlight2	// if only one pixel, no need to start an FDIV

+	movl	%ecx,spancountminus1

+	fildl	spancountminus1

+	flds	C(d_zistepu)		// C(d_zistepu) | spancountminus1

+	fmul	%st(1),%st(0)	// C(d_zistepu)*scm1 | scm1

+	flds	C(d_tdivzstepu)	// C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1

+	fmul	%st(2),%st(0)	// C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1

+	fxch	%st(1)			// C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 | scm1

+	faddp	%st(0),%st(3)	// C(d_tdivzstepu)*scm1 | scm1

+	fxch	%st(1)			// scm1 | C(d_tdivzstepu)*scm1

+	fmuls	C(d_sdivzstepu)	// C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1

+	fxch	%st(1)			// C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1

+	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1

+	flds	fp_64k			// 64k | C(d_sdivzstepu)*scm1

+	fxch	%st(1)			// C(d_sdivzstepu)*scm1 | 64k

+	faddp	%st(0),%st(4)	// 64k

+	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to

+							//  overlap

+	jmp		LFDIVInFlight2

+	.align	4

+LSetupNotLast2:

+	fadds	zi16stepu	// advance 1/z by 16 pixels

+	fxch	%st(2)

+	fadds	sdivz16stepu	// advance s/z by 16 pixels

+	fxch	%st(2)

+	flds	tdivz16stepu

+	faddp	%st(0),%st(2)	// advance t/z by 16 pixels

+	flds	fp_64k

+	fdiv	%st(1),%st(0)	// z = 1/(1/z), scaled by 64k

+							// this is what we've gone to all this trouble to

+							//  overlap

+LFDIVInFlight2:

+	movl	%ecx,counttemp	// save count (already decremented if this is the final segment)

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	movb	%al,8(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	movb	%al,9(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	movb	%al,10(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	movb	%al,11(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	movb	%al,12(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	movb	%al,13(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	movb	%al,14(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	$16,%edi	// advance pdest past this 16-pixel segment

+	movl	%edx,tfracf

+	movl	snext,%edx

+	movl	%ebx,sfracf

+	movl	tnext,%ebx

+	movl	%edx,s	// s = snext: this segment's end is the next one's start

+	movl	%ebx,t	// t = tnext

+	movl	counttemp,%ecx		// retrieve count

+//

+// determine whether last segment or not

+//

+	cmpl	$16,%ecx				// are there multiple segments remaining?

+	movb	%al,-1(%edi)	// store the segment's 16th pixel (movb leaves the cmpl flags intact)

+	ja		LNotLastSegment		// yes

+//

+// last segment of scan

+//

+LLastSegment:

+//

+// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to

+// get there. The number of pixels left is variable, and we want to land on the

+// last pixel, not step one past it, so we can't run into arithmetic problems

+//

+	testl	%ecx,%ecx	// %ecx = pixels in this segment minus one (number of steps)

+	jz		LNoSteps		// just draw the last pixel and we're done

+// pick up after the FDIV that was left in flight previously

+	fld		%st(0)			// duplicate it

+	fmul	%st(4),%st(0)	// s = s/z * z

+	fxch	%st(1)

+	fmul	%st(3),%st(0)	// t = t/z * z

+	fxch	%st(1)

+	fistpl	snext

+	fistpl	tnext

+	movb	(%esi),%al		// load first texel in segment

+	movl	C(tadjust),%ebx

+	movb	%al,(%edi)		// store first pixel in segment

+	movl	C(sadjust),%eax

+	addl	snext,%eax	// %eax = sadjust + snext

+	addl	tnext,%ebx	// %ebx = tadjust + tnext

+	movl	C(bbextents),%ebp

+	movl	C(bbextentt),%edx

+	cmpl	$4096,%eax

+	jl		LClampLow4

+	cmpl	%ebp,%eax

+	ja		LClampHigh4

+LClampReentry4:

+	movl	%eax,snext

+	cmpl	$4096,%ebx

+	jl		LClampLow5

+	cmpl	%edx,%ebx

+	ja		LClampHigh5

+LClampReentry5:

+	cmpl	$1,%ecx			// don't bother

+	je		LOnlyOneStep	// if two pixels in segment, there's only one step,

+							//  of the segment length

+	subl	s,%eax

+	subl	t,%ebx

+	addl	%eax,%eax		// convert to 15.17 format so multiply by 1.31

+	addl	%ebx,%ebx		//  reciprocal yields 16.48

+	imull	reciprocal_table_16-8(,%ecx,4)	// sstep = (snext - s) /

+											//  (spancount-1)

+	movl	%edx,%ebp	// %ebp = sstep (high dword of the %edx:%eax product)

+	movl	%ebx,%eax

+	imull	reciprocal_table_16-8(,%ecx,4)	// tstep = (tnext - t) /

+											//  (spancount-1)

+LSetEntryvec:

+//

+// set up advancetable

+//

+	movl	entryvec_table_16(,%ecx,4),%ebx

+	movl	%edx,%eax

+	movl	%ebx,jumptemp		// entry point into code for RET later

+	movl	%ebp,%ecx

+	sarl	$16,%edx			// tstep >>= 16;

+	movl	C(cachewidth),%ebx

+	sarl	$16,%ecx			// sstep >>= 16;

+	imull	%ebx,%edx

+	addl	%ecx,%edx			// add in sstep

+								// (tstep >> 16) * cachewidth + (sstep >> 16);

+	movl	tfracf,%ecx

+	movl	%edx,advancetable+4	// advance base in t

+	addl	%ebx,%edx			// ((tstep >> 16) + 1) * cachewidth +

+								//  (sstep >> 16);

+	shll	$16,%ebp			// left-justify sstep fractional part

+	movl	sfracf,%ebx

+	shll	$16,%eax			// left-justify tstep fractional part

+	movl	%edx,advancetable	// advance extra in t

+	movl	%eax,tstep

+	movl	%ecx,%edx

+	addl	%eax,%edx	// advance tfrac fractional part by tstep frac

+	sbbl	%ecx,%ecx	// turn tstep carry into -1 (0 if none)

+	addl	%ebp,%ebx	// advance sfrac fractional part by sstep frac

+	adcl	advancetable+4(,%ecx,4),%esi	// point to next source texel

+	jmp		*jumptemp			// jump to the number-of-pixels handler

+//----------------------------------------

+LNoSteps:

+	movb	(%esi),%al		// load first texel in segment

+	subl	$15,%edi			// adjust for hardwired offset

+	jmp		LEndSpan

+LOnlyOneStep:

+	subl	s,%eax

+	subl	t,%ebx

+	movl	%eax,%ebp	// sstep = snext - s (single step, no division needed)

+	movl	%ebx,%edx	// tstep = end-of-segment t - t

+	jmp		LSetEntryvec

+//----------------------------------------

+.globl	Entry2_16, Entry3_16, Entry4_16, Entry5_16

+.globl	Entry6_16, Entry7_16, Entry8_16, Entry9_16

+.globl	Entry10_16, Entry11_16, Entry12_16, Entry13_16

+.globl	Entry14_16, Entry15_16, Entry16_16

+Entry2_16:

+	subl	$14,%edi		// adjust for hardwired offsets

+	movb	(%esi),%al	// second (final) texel; stored at 15(%edi) in LEndSpan

+	jmp		LEntry2_16

+//----------------------------------------

+Entry3_16:

+	subl	$13,%edi		// adjust for hardwired offsets

+	addl	%eax,%edx	// on entry %eax = tstep frac, %edx = tfrac frac (set in LSetEntryvec)

+	movb	(%esi),%al

+	sbbl	%ecx,%ecx

+	addl	%ebp,%ebx

+	adcl	advancetable+4(,%ecx,4),%esi

+	jmp		LEntry3_16

+//----------------------------------------

+Entry4_16:

+	subl	$12,%edi		// adjust for hardwired offsets

+	addl	%eax,%edx

+	movb	(%esi),%al

+	sbbl	%ecx,%ecx

+	addl	%ebp,%ebx

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	jmp		LEntry4_16

+//----------------------------------------

+Entry5_16:

+	subl	$11,%edi		// adjust for hardwired offsets

+	addl	%eax,%edx

+	movb	(%esi),%al

+	sbbl	%ecx,%ecx

+	addl	%ebp,%ebx

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	jmp		LEntry5_16

+//----------------------------------------

+Entry6_16:

+	subl	$10,%edi		// adjust for hardwired offsets

+	addl	%eax,%edx

+	movb	(%esi),%al

+	sbbl	%ecx,%ecx

+	addl	%ebp,%ebx

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	jmp		LEntry6_16

+//----------------------------------------

+Entry7_16:

+	subl	$9,%edi		// adjust for hardwired offsets

+	addl	%eax,%edx

+	movb	(%esi),%al

+	sbbl	%ecx,%ecx

+	addl	%ebp,%ebx

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	jmp		LEntry7_16

+//----------------------------------------

+Entry8_16:

+	subl	$8,%edi		// adjust for hardwired offsets

+	addl	%eax,%edx

+	movb	(%esi),%al

+	sbbl	%ecx,%ecx

+	addl	%ebp,%ebx

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	jmp		LEntry8_16

+//----------------------------------------

+Entry9_16:

+	subl	$7,%edi		// adjust for hardwired offsets

+	addl	%eax,%edx

+	movb	(%esi),%al

+	sbbl	%ecx,%ecx

+	addl	%ebp,%ebx

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	jmp		LEntry9_16

+//----------------------------------------

+Entry10_16:

+	subl	$6,%edi		// adjust for hardwired offsets

+	addl	%eax,%edx

+	movb	(%esi),%al

+	sbbl	%ecx,%ecx

+	addl	%ebp,%ebx

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	jmp		LEntry10_16

+//----------------------------------------

+Entry11_16:

+	subl	$5,%edi		// adjust for hardwired offsets

+	addl	%eax,%edx

+	movb	(%esi),%al

+	sbbl	%ecx,%ecx

+	addl	%ebp,%ebx

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	jmp		LEntry11_16

+//----------------------------------------

+Entry12_16:

+	subl	$4,%edi		// adjust for hardwired offsets

+	addl	%eax,%edx

+	movb	(%esi),%al

+	sbbl	%ecx,%ecx

+	addl	%ebp,%ebx

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	jmp		LEntry12_16

+//----------------------------------------

+Entry13_16:

+	subl	$3,%edi		// adjust for hardwired offsets

+	addl	%eax,%edx

+	movb	(%esi),%al

+	sbbl	%ecx,%ecx

+	addl	%ebp,%ebx

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	jmp		LEntry13_16

+//----------------------------------------

+Entry14_16:

+	subl	$2,%edi		// adjust for hardwired offsets

+	addl	%eax,%edx

+	movb	(%esi),%al

+	sbbl	%ecx,%ecx

+	addl	%ebp,%ebx

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	jmp		LEntry14_16

+//----------------------------------------

+Entry15_16:

+	decl	%edi		// adjust for hardwired offsets

+	addl	%eax,%edx

+	movb	(%esi),%al

+	sbbl	%ecx,%ecx

+	addl	%ebp,%ebx

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	jmp		LEntry15_16

+//----------------------------------------

+Entry16_16:

+	addl	%eax,%edx	// full 16 pixels: no %edi adjustment needed

+	movb	(%esi),%al

+	sbbl	%ecx,%ecx

+	addl	%ebp,%ebx

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+	sbbl	%ecx,%ecx

+	movb	%al,1(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx	// falls through into LEntry15_16 with the t carry pending

+LEntry15_16:

+	sbbl	%ecx,%ecx	// each LEntryN_16 is entered with CF = carry from the preceding tfrac add

+	movb	%al,2(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+LEntry14_16:

+	sbbl	%ecx,%ecx

+	movb	%al,3(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+LEntry13_16:

+	sbbl	%ecx,%ecx

+	movb	%al,4(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+LEntry12_16:

+	sbbl	%ecx,%ecx

+	movb	%al,5(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+LEntry11_16:

+	sbbl	%ecx,%ecx

+	movb	%al,6(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+LEntry10_16:

+	sbbl	%ecx,%ecx

+	movb	%al,7(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+LEntry9_16:

+	sbbl	%ecx,%ecx

+	movb	%al,8(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+LEntry8_16:

+	sbbl	%ecx,%ecx

+	movb	%al,9(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+LEntry7_16:

+	sbbl	%ecx,%ecx

+	movb	%al,10(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+LEntry6_16:

+	sbbl	%ecx,%ecx

+	movb	%al,11(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+LEntry5_16:

+	sbbl	%ecx,%ecx

+	movb	%al,12(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+	addl	tstep,%edx

+LEntry4_16:

+	sbbl	%ecx,%ecx

+	movb	%al,13(%edi)

+	addl	%ebp,%ebx

+	movb	(%esi),%al

+	adcl	advancetable+4(,%ecx,4),%esi

+LEntry3_16:

+	movb	%al,14(%edi)

+	movb	(%esi),%al

+LEntry2_16:

+LEndSpan:

+//

+// clear s/z, t/z, 1/z from FP stack

+//

+	fstp %st(0)

+	fstp %st(0)

+	fstp %st(0)

+	movl	pspantemp,%ebx				// restore spans pointer

+	movl	espan_t_pnext(%ebx),%ebx	// point to next span

+	testl	%ebx,%ebx			// any more spans?

+	movb	%al,15(%edi)	// store the span's final pixel (movb leaves the testl flags intact)

+	jnz		LSpanLoop			// more spans

+	popl	%ebx				// restore register variables

+	popl	%esi

+	popl	%edi

+	popl	%ebp				// restore the caller's stack frame

+	ret

+#endif	// id386

--- /dev/null

+++ b/u/d_ifacea.h

@@ -1,0 +1,79 @@

+//

+// d_ifacea.h

+//

+// Include file for asm driver interface.

+//

+//

+// !!! note that this file must match the corresponding C structures in

+// d_iface.h at all times !!!

+//

+// !!! if this is changed, it must be changed in r_shared.h too !!!

+#define ALIAS_ONSEAM				0x0020

+// !!! if this is changed, it must be changed in d_iface.h too !!!

+#define TURB_TEX_SIZE	64		// base turbulent texture size

+// !!! if this is changed, it must be changed in d_iface.h too !!! (asm_draw.h also defines CYCLE; keep both identical)

+#define	CYCLE	128

+// !!! if this is changed, it must be changed in r_shared.h too !!!

+#define	MAXHEIGHT	1024

+// !!! if this is changed, it must be changed in quakedef.h too !!!

+#define CACHE_SIZE	32		// used to align key data structures

+// particle_t structure (byte offsets of each field)

+// !!! if this is changed, it must be changed in d_iface.h too !!!

+// driver-usable fields

+#define pt_org				0

+#define pt_color			12

+// drivers never touch the following fields

+#define pt_next				16

+#define pt_vel				20

+#define pt_ramp				32

+#define pt_die				36

+#define pt_type				40

+#define pt_size				44

+#define PARTICLE_Z_CLIP	8.0

+// finalvert_t structure (byte offsets of each field)

+// !!! if this is changed, it must be changed in d_iface.h too !!!

+#define fv_v				0	// !!! if this is moved, cases where the !!!

+								// !!! address of this field is pushed in !!!

+								// !!! d_polysa.s must be changed !!!

+#define fv_flags			24

+#define fv_reserved			28

+#define fv_size				32

+#define fv_shift			5	// log2(fv_size); must change together with fv_size

+// stvert_t structure (byte offsets of each field)

+// !!! if this is changed, it must be changed in modelgen.h too !!!

+#define stv_onseam	0

+#define stv_s		4

+#define stv_t		8

+#define stv_size	12

+// trivertx_t structure (byte offsets of each field)

+// !!! if this is changed, it must be changed in modelgen.h too !!!

+#define tv_v				0

+#define tv_lightnormalindex	3

+#define tv_size				4

+// affinetridesc_t structure (byte offsets of each field)

+// !!! if this is changed, it must be changed in d_iface.h too !!!

+#define atd_pskin			0

+#define atd_pskindesc		4

+#define atd_skinwidth		8

+#define atd_skinheight		12

+#define atd_ptriangles		16

+#define atd_pfinalverts		20

+#define atd_numtriangles	24

+#define atd_drawtype		28

+#define atd_seamfixupX16	32

+#define atd_size			36

--- /dev/null

+++ b/u/d_parta.s

@@ -1,0 +1,458 @@

+//

+// d_parta.s

+// x86 assembly-language 8-bpp particle-drawing code.

+//

+#include "asm_i386.h"

+#include "quakeasm.h"

+#include "d_ifacea.h"

+#include "asm_draw.h"

+#ifdef	id386

+//----------------------------------------------------------------------

+// 8-bpp particle drawing code.

+//----------------------------------------------------------------------

+//FIXME: comments, full optimization

+//----------------------------------------------------------------------

+// 8-bpp particle queueing code.

+//----------------------------------------------------------------------

+	.text

+#define P	12+4

+	.align 4

+.globl C(D_DrawParticle)

+C(D_DrawParticle):	// draws one particle into d_viewbuffer / d_pzbuffer; the only argument is the particle pointer, read from the stack

+	pushl	%ebp				// preserve caller's stack frame

+	pushl	%edi				// preserve register variables

+	pushl	%ebx

+	movl	P(%esp),%edi		// edi = particle pointer; P skips the 3 pushes above plus the return address

+// FIXME: better FP overlap in general here

+// transform point

+//	VectorSubtract (p->org, r_origin, local);

+	flds	C(r_origin)			// r_origin[0]

+	fsubrs	pt_org(%edi)		// local[0] = org[0] - r_origin[0]

+	flds	pt_org+4(%edi)		// org[1] | local[0]

+	fsubs	C(r_origin)+4		// local[1] | local[0]

+	flds	pt_org+8(%edi)		// org[2] | local[1] | local[0]

+	fsubs	C(r_origin)+8		// local[2] | local[1] | local[0]

+	fxch	%st(2)			// local[0] | local[1] | local[2]

+//	transformed[2] = DotProduct(local, r_ppn);

+	flds	C(r_ppn)		// r_ppn[0] | local[0] | local[1] | local[2]

+	fmul	%st(1),%st(0)	// dot0 | local[0] | local[1] | local[2]

+	flds	C(r_ppn)+4	// r_ppn[1] | dot0 | local[0] | local[1] | local[2]

+	fmul	%st(3),%st(0)	// dot1 | dot0 | local[0] | local[1] | local[2]

+	flds	C(r_ppn)+8	// r_ppn[2] | dot1 | dot0 | local[0] |

+						//  local[1] | local[2]

+	fmul	%st(5),%st(0)	// dot2 | dot1 | dot0 | local[0] | local[1] | local[2]

+	fxch	%st(2)		// dot0 | dot1 | dot2 | local[0] | local[1] | local[2]

+	faddp	%st(0),%st(1) // dot0 + dot1 | dot2 | local[0] | local[1] |

+						  //  local[2]

+	faddp	%st(0),%st(1) // z | local[0] | local[1] | local[2]

+	fld		%st(0)		// z | z | local[0] | local[1] |

+						//  local[2]

+	fdivrs	float_1		// 1/z | z | local[0] | local[1] | local[2]

+	fxch	%st(1)		// z | 1/z | local[0] | local[1] | local[2]

+//	if (transformed[2] < PARTICLE_Z_CLIP)

+//		return;

+	fcomps	float_particle_z_clip	// 1/z | local[0] | local[1] | local[2]

+	fxch	%st(3)					// local[2] | local[0] | local[1] | 1/z

+	flds	C(r_pup)	// r_pup[0] | local[2] | local[0] | local[1] | 1/z

+	fmul	%st(2),%st(0)	// dot0 | local[2] | local[0] | local[1] | 1/z

+	flds	C(r_pup)+4	// r_pup[1] | dot0 | local[2] | local[0] |

+						//  local[1] | 1/z

+	fnstsw	%ax				// ax = FPU status word holding the result of the fcomps above

+	testb	$1,%ah			// bit 0 of ah is C0: set when z < float_particle_z_clip (or the compare was unordered)

+	jnz		LPop6AndDone	// clipped: discard the six values now on the FPU stack and return

+//	transformed[1] = DotProduct(local, r_pup);

+	fmul	%st(4),%st(0)	// dot1 | dot0 | local[2] | local[0] | local[1] | 1/z

+	flds	C(r_pup)+8	// r_pup[2] | dot1 | dot0 | local[2] |

+						//  local[0] | local[1] | 1/z

+	fmul	%st(3),%st(0)	// dot2 | dot1 | dot0 | local[2] | local[0] |

+						//  local[1] | 1/z

+	fxch	%st(2)		// dot0 | dot1 | dot2 | local[2] | local[0] |

+						//  local[1] | 1/z

+	faddp	%st(0),%st(1) // dot0 + dot1 | dot2 | local[2] | local[0] |

+						//  local[1] | 1/z

+	faddp	%st(0),%st(1) // y | local[2] | local[0] | local[1] | 1/z

+	fxch	%st(3)		// local[1] | local[2] | local[0] | y | 1/z

+//	transformed[0] = DotProduct(local, r_pright);

+	fmuls	C(r_pright)+4	// dot1 | local[2] | local[0] | y | 1/z

+	fxch	%st(2)		// local[0] | local[2] | dot1 | y | 1/z

+	fmuls	C(r_pright)	// dot0 | local[2] | dot1 | y | 1/z

+	fxch	%st(1)		// local[2] | dot0 | dot1 | y | 1/z

+	fmuls	C(r_pright)+8	// dot2 | dot0 | dot1 | y | 1/z

+	fxch	%st(2)		// dot1 | dot0 | dot2 | y | 1/z

+	faddp	%st(0),%st(1) // dot1 + dot0 | dot2 | y | 1/z

+	faddp	%st(0),%st(1)	// x | y | 1/z

+	fxch	%st(1)			// y | x | 1/z

+// project the point

+	fmul	%st(2),%st(0)	// y/z | x | 1/z

+	fxch	%st(1)			// x | y/z | 1/z

+	fmul	%st(2),%st(0)	// x/z | y/z | 1/z

+	fxch	%st(1)			// y/z | x/z | 1/z

+	fsubrs	C(ycenter)		// v | x/z | 1/z

+	fxch	%st(1)			// x/z | v | 1/z

+	fadds	C(xcenter)		// u | v | 1/z

+// FIXME: preadjust xcenter and ycenter

+	fxch	%st(1)			// v | u | 1/z

+	fadds	float_point5	// v | u | 1/z

+	fxch	%st(1)			// u | v | 1/z

+	fadds	float_point5	// u | v | 1/z

+	fxch	%st(2)			// 1/z | v | u

+	fmuls	DP_32768		// 1/z * 0x8000 | v | u

+	fxch	%st(2)			// u | v | 1/z * 0x8000

+// FIXME: use Terje's fp->int trick here?

+// FIXME: check we're getting proper rounding here

+	fistpl	DP_u			// v | 1/z * 0x8000

+	fistpl	DP_v			// 1/z * 0x8000

+	movl	DP_u,%eax		// eax = u

+	movl	DP_v,%edx		// edx = v

+// if ((v > d_vrectbottom_particle) ||

+// 	(u > d_vrectright_particle) ||

+// 	(v < d_vrecty) ||

+// 	(u < d_vrectx))

+// {

+// 	continue;

+// }

+	movl	C(d_vrectbottom_particle),%ebx

+	movl	C(d_vrectright_particle),%ecx

+	cmpl	%ebx,%edx

+	jg		LPop1AndDone	// v > d_vrectbottom_particle: pop 1/z * 0x8000 and return

+	cmpl	%ecx,%eax

+	jg		LPop1AndDone	// u > d_vrectright_particle

+	movl	C(d_vrecty),%ebx

+	movl	C(d_vrectx),%ecx

+	cmpl	%ebx,%edx

+	jl		LPop1AndDone	// v < d_vrecty

+	cmpl	%ecx,%eax

+	jl		LPop1AndDone	// u < d_vrectx

+	flds	pt_color(%edi)	// color | 1/z * 0x8000

+// FIXME: use Terje's fast fp->int trick?

+	fistpl	DP_Color		// 1/z * 0x8000

+	movl	C(d_viewbuffer),%ebx

+	addl	%eax,%ebx		// ebx = d_viewbuffer + u

+	movl	C(d_scantable)(,%edx,4),%edi		// point to the pixel

+	imull	C(d_zrowbytes),%edx		// point to the z pixel

+	leal	(%edx,%eax,2),%edx	// edx = v * d_zrowbytes + u * 2 (z entries are accessed as 16-bit words)

+	movl	C(d_pzbuffer),%eax

+	fistpl	izi				// izi = (int)(1/z * 0x8000); the FPU stack is empty from here on

+	addl	%ebx,%edi		// edi = d_viewbuffer + d_scantable[v] + u, the destination pixel

+	addl	%eax,%edx		// edx = d_pzbuffer + v * d_zrowbytes + u * 2, the matching z-buffer entry

+// pix = izi >> d_pix_shift;

+	movl	izi,%eax

+	movl	C(d_pix_shift),%ecx

+	shrl	%cl,%eax		// eax = pix

+	movl	izi,%ebp		// bp = low 16 bits of izi, compared with and stored to the z-buffer by the draw code

+// if (pix < d_pix_min)

+// 		pix = d_pix_min;

+// else if (pix > d_pix_max)

+//  	pix = d_pix_max;

+	movl	C(d_pix_min),%ebx

+	movl	C(d_pix_max),%ecx

+	cmpl	%ebx,%eax

+	jnl		LTestPixMax		// pix >= d_pix_min (signed): go check the upper bound

+	movl	%ebx,%eax		// pix = d_pix_min

+	jmp		LTestDone

+LTestPixMax:

+	cmpl	%ecx,%eax

+	jng		LTestDone		// pix <= d_pix_max (signed): already in range

+	movl	%ecx,%eax		// pix = d_pix_max

+LTestDone:	// size dispatch; in: eax = clamped pix, edx = z-buffer address, edi = pixel address, bp = izi

+	movb	DP_Color,%ch			// ch = low byte of the integer color stored in DP_Color

+	movl	C(d_y_aspect_shift),%ebx

+	testl	%ebx,%ebx

+	jnz		LDefault			// non-zero aspect shift: only the general loop applies it

+	cmpl	$4,%eax				// NOTE(review): pix == 0 is not filtered here and would index before the table; assumes d_pix_min >= 1 -- confirm

+	ja		LDefault			// pix > 4 (unsigned): use the general loop

+	jmp		*DP_EntryTable-4(,%eax,4)	// indirect jump through table slot pix-1; AT&T syntax requires '*' on an indirect jump operand

+// 1x1: single-pixel particle. In: edx = z-buffer address, edi = pixel address, bp = izi, ch = color

+.globl	DP_1x1

+DP_1x1:

+	cmpw	%bp,(%edx)		// just one pixel to do

+	jg		LDone			// stored z > izi (signed 16-bit compare): leave the pixel alone

+	movw	%bp,(%edx)		// z-buffer entry = izi

+	movb	%ch,(%edi)		// pixel = color

+	jmp		LDone

+// 2x2: four z-tested pixels. In: edx = z-buffer address, edi = pixel address, bp = izi, ch = color

+.globl	DP_2x2

+DP_2x2:

+	pushl	%esi			// esi is borrowed for the z row stride and restored before leaving

+	movl	C(screenwidth),%ebx	// ebx = offset from one pixel row to the next

+	movl	C(d_zrowbytes),%esi	// esi = offset from one z-buffer row to the next

+	cmpw	%bp,(%edx)		// row 0, column 0: skip when stored z > izi

+	jg		L2x2_1

+	movw	%bp,(%edx)

+	movb	%ch,(%edi)

+L2x2_1:

+	cmpw	%bp,2(%edx)		// row 0, column 1

+	jg		L2x2_2

+	movw	%bp,2(%edx)

+	movb	%ch,1(%edi)

+L2x2_2:

+	cmpw	%bp,(%edx,%esi,1)	// row 1, column 0

+	jg		L2x2_3

+	movw	%bp,(%edx,%esi,1)

+	movb	%ch,(%edi,%ebx,1)

+L2x2_3:

+	cmpw	%bp,2(%edx,%esi,1)	// row 1, column 1

+	jg		L2x2_4

+	movw	%bp,2(%edx,%esi,1)

+	movb	%ch,1(%edi,%ebx,1)

+L2x2_4:

+	popl	%esi

+	jmp		LDone

+// 3x3: nine z-tested pixels. In: edx = z-buffer address, edi = pixel address, bp = izi, ch = color

+.globl	DP_3x3

+DP_3x3:

+	pushl	%esi			// esi is borrowed for the z row stride and restored before leaving

+	movl	C(screenwidth),%ebx	// ebx = offset from one pixel row to the next

+	movl	C(d_zrowbytes),%esi	// esi = offset from one z-buffer row to the next

+	cmpw	%bp,(%edx)		// row 0, column 0: skip when stored z > izi

+	jg		L3x3_1

+	movw	%bp,(%edx)

+	movb	%ch,(%edi)

+L3x3_1:

+	cmpw	%bp,2(%edx)		// row 0, column 1

+	jg		L3x3_2

+	movw	%bp,2(%edx)

+	movb	%ch,1(%edi)

+L3x3_2:

+	cmpw	%bp,4(%edx)		// row 0, column 2

+	jg		L3x3_3

+	movw	%bp,4(%edx)

+	movb	%ch,2(%edi)

+L3x3_3:

+	cmpw	%bp,(%edx,%esi,1)	// row 1, column 0

+	jg		L3x3_4

+	movw	%bp,(%edx,%esi,1)

+	movb	%ch,(%edi,%ebx,1)

+L3x3_4:

+	cmpw	%bp,2(%edx,%esi,1)	// row 1, column 1

+	jg		L3x3_5

+	movw	%bp,2(%edx,%esi,1)

+	movb	%ch,1(%edi,%ebx,1)

+L3x3_5:

+	cmpw	%bp,4(%edx,%esi,1)	// row 1, column 2

+	jg		L3x3_6

+	movw	%bp,4(%edx,%esi,1)

+	movb	%ch,2(%edi,%ebx,1)

+L3x3_6:

+	cmpw	%bp,(%edx,%esi,2)	// row 2, column 0 (stride scaled by 2)

+	jg		L3x3_7

+	movw	%bp,(%edx,%esi,2)

+	movb	%ch,(%edi,%ebx,2)

+L3x3_7:

+	cmpw	%bp,2(%edx,%esi,2)	// row 2, column 1

+	jg		L3x3_8

+	movw	%bp,2(%edx,%esi,2)

+	movb	%ch,1(%edi,%ebx,2)

+L3x3_8:

+	cmpw	%bp,4(%edx,%esi,2)	// row 2, column 2

+	jg		L3x3_9

+	movw	%bp,4(%edx,%esi,2)

+	movb	%ch,2(%edi,%ebx,2)

+L3x3_9:

+	popl	%esi

+	jmp		LDone

+// 4x4: sixteen z-tested pixels. In: edx = z-buffer address, edi = pixel address, bp = izi, ch = color

+.globl	DP_4x4

+DP_4x4:

+	pushl	%esi			// esi is borrowed for the z row stride and restored before leaving

+	movl	C(screenwidth),%ebx	// ebx = offset from one pixel row to the next

+	movl	C(d_zrowbytes),%esi	// esi = offset from one z-buffer row to the next

+	cmpw	%bp,(%edx)		// row 0, column 0: skip when stored z > izi

+	jg		L4x4_1

+	movw	%bp,(%edx)

+	movb	%ch,(%edi)

+L4x4_1:

+	cmpw	%bp,2(%edx)		// row 0, column 1

+	jg		L4x4_2

+	movw	%bp,2(%edx)

+	movb	%ch,1(%edi)

+L4x4_2:

+	cmpw	%bp,4(%edx)		// row 0, column 2

+	jg		L4x4_3

+	movw	%bp,4(%edx)

+	movb	%ch,2(%edi)

+L4x4_3:

+	cmpw	%bp,6(%edx)		// row 0, column 3

+	jg		L4x4_4

+	movw	%bp,6(%edx)

+	movb	%ch,3(%edi)

+L4x4_4:

+	cmpw	%bp,(%edx,%esi,1)	// row 1, column 0

+	jg		L4x4_5

+	movw	%bp,(%edx,%esi,1)

+	movb	%ch,(%edi,%ebx,1)

+L4x4_5:

+	cmpw	%bp,2(%edx,%esi,1)	// row 1, column 1

+	jg		L4x4_6

+	movw	%bp,2(%edx,%esi,1)

+	movb	%ch,1(%edi,%ebx,1)

+L4x4_6:

+	cmpw	%bp,4(%edx,%esi,1)	// row 1, column 2

+	jg		L4x4_7

+	movw	%bp,4(%edx,%esi,1)

+	movb	%ch,2(%edi,%ebx,1)

+L4x4_7:

+	cmpw	%bp,6(%edx,%esi,1)	// row 1, column 3

+	jg		L4x4_8

+	movw	%bp,6(%edx,%esi,1)

+	movb	%ch,3(%edi,%ebx,1)

+L4x4_8:

+	leal	(%edx,%esi,2),%edx	// advance the z pointer two rows; rows 2 and 3 reuse the row 0/1 addressing

+	leal	(%edi,%ebx,2),%edi	// advance the pixel pointer two rows

+	cmpw	%bp,(%edx)		// row 2, column 0

+	jg		L4x4_9

+	movw	%bp,(%edx)

+	movb	%ch,(%edi)

+L4x4_9:

+	cmpw	%bp,2(%edx)		// row 2, column 1

+	jg		L4x4_10

+	movw	%bp,2(%edx)

+	movb	%ch,1(%edi)

+L4x4_10:

+	cmpw	%bp,4(%edx)		// row 2, column 2

+	jg		L4x4_11

+	movw	%bp,4(%edx)

+	movb	%ch,2(%edi)

+L4x4_11:

+	cmpw	%bp,6(%edx)		// row 2, column 3

+	jg		L4x4_12

+	movw	%bp,6(%edx)

+	movb	%ch,3(%edi)

+L4x4_12:

+	cmpw	%bp,(%edx,%esi,1)	// row 3, column 0

+	jg		L4x4_13

+	movw	%bp,(%edx,%esi,1)

+	movb	%ch,(%edi,%ebx,1)

+L4x4_13:

+	cmpw	%bp,2(%edx,%esi,1)	// row 3, column 1

+	jg		L4x4_14

+	movw	%bp,2(%edx,%esi,1)

+	movb	%ch,1(%edi,%ebx,1)

+L4x4_14:

+	cmpw	%bp,4(%edx,%esi,1)	// row 3, column 2

+	jg		L4x4_15

+	movw	%bp,4(%edx,%esi,1)

+	movb	%ch,2(%edi,%ebx,1)

+L4x4_15:

+	cmpw	%bp,6(%edx,%esi,1)	// row 3, column 3

+	jg		L4x4_16

+	movw	%bp,6(%edx,%esi,1)

+	movb	%ch,3(%edi,%ebx,1)

+L4x4_16:

+	popl	%esi

+	jmp		LDone

+// default case, handling any size particle. In: eax = pix, edx = z-buffer address, edi = pixel address, bp = izi, ch = color

+LDefault:

+// count = pix << d_y_aspect_shift;

+	movl	%eax,%ebx

+	movl	%eax,DP_Pix			// keep pix in memory so every row can reload its column counter

+	movb	C(d_y_aspect_shift),%cl	// only cl is loaded, so the color held in ch is preserved

+	shll	%cl,%ebx			// ebx = count (number of rows)

+// for ( ; count ; count--, pz += d_zwidth, pdest += screenwidth)

+// {

+// 	for (i=0 ; i<pix ; i++)

+// 	{

+// 		if (pz[i] <= izi)

+// 		{

+// 			pz[i] = izi;

+// 			pdest[i] = color;

+// 		}

+// 	}

+// }

+LGenRowLoop:

+	movl	DP_Pix,%eax			// eax counts pix down to 1, so columns are visited from i = pix-1 to i = 0

+LGenColLoop:

+	cmpw	%bp,-2(%edx,%eax,2)	// the -2 bias turns counter eax into z index eax-1

+	jg		LGSkip				// stored z > izi: leave this pixel alone

+	movw	%bp,-2(%edx,%eax,2)

+	movb	%ch,-1(%edi,%eax,1)	// the -1 bias turns counter eax into pixel index eax-1

+LGSkip:

+	decl	%eax			// --pix

+	jnz		LGenColLoop

+	addl	C(d_zrowbytes),%edx	// next z-buffer row; edx is a byte address, so the stride added is d_zrowbytes

+	addl	C(screenwidth),%edi	// next pixel row

+	decl	%ebx			// --count

+	jnz		LGenRowLoop

+LDone:	// common exit of D_DrawParticle: undo the three entry pushes in reverse order

+	popl	%ebx				// restore register variables

+	popl	%edi

+	popl	%ebp				// restore the caller's stack frame

+	ret

+LPop6AndDone:	// z-clip reject path: six values are on the FPU stack; pop five here, then fall through for the sixth

+	fstp	%st(0)

+	fstp	%st(0)

+	fstp	%st(0)

+	fstp	%st(0)

+	fstp	%st(0)

+LPop1AndDone:	// screen-rect reject path: only 1/z * 0x8000 is left on the FPU stack

+	fstp	%st(0)			// discard it so the FPU stack is empty on return

+	jmp		LDone

+#endif	// id386

--- /dev/null

+++ b/u/d_polysa.s

@@ -1,0 +1,1723 @@

+//

+// d_polysa.s

+// x86 assembly-language polygon model drawing code

+//

+#include "asm_i386.h"

+#include "quakeasm.h"

+#include "asm_draw.h"

+#include "d_ifacea.h"

+#ifdef	id386

+// !!! if this is changed, it must be changed in d_polyse.c too !!!

+#define DPS_MAXSPANS			MAXHEIGHT+1	// expands without parentheses; wrap it in () if it is ever used inside a larger expression

+									// 1 extra for spanpackage that marks end

+//#define	SPAN_SIZE	(((DPS_MAXSPANS + 1 + ((CACHE_SIZE - 1) / spanpackage_t_size)) + 1) * spanpackage_t_size)

+#define SPAN_SIZE (1024+1+1+1)*32	// hand-expanded form of the formula above for MAXHEIGHT=1024, CACHE_SIZE=32, spanpackage_t_size=32 (1027*32); update by hand if any of those change

+	.data

+	.align	4

+p10_minus_p20:	.single		0	// written by D_PolysetCalcGradients (fstps) and re-read by its later fmuls

+p01_minus_p21:	.single		0	// written by D_PolysetCalcGradients (fstps) and re-read by its later fmuls

+temp0:			.single		0

+temp1:			.single		0

+Ltemp:			.single		0

+aff8entryvec_table:	.long	LDraw8, LDraw7, LDraw6, LDraw5	// eight code addresses in descending order, LDraw8 first

+				.long	LDraw4, LDraw3, LDraw2, LDraw1

+lzistepx:		.long	0

+	.text

+	.extern C(D_PolysetSetEdgeTable)

+	.extern C(D_RasterizeAliasPolySmooth)

+//----------------------------------------------------------------------

+// affine triangle gradient calculation code

+//----------------------------------------------------------------------

+#define skinwidth	4+0	// stack offset of the single argument: 4 for the return address + 0 because nothing is pushed

+.globl C(D_PolysetCalcGradients)

+C(D_PolysetCalcGradients):	// from r_p0/r_p1/r_p2 and d_xdenom, stores r_lstep*, r_sstep*, r_tstep*, r_zistep*, a_sstepxfrac, a_tstepxfrac, a_ststepxwhole

+//	p00_minus_p20 = r_p0[0] - r_p2[0];

+//	p01_minus_p21 = r_p0[1] - r_p2[1];

+//	p10_minus_p20 = r_p1[0] - r_p2[0];

+//	p11_minus_p21 = r_p1[1] - r_p2[1];

+//

+//	xstepdenominv = 1.0 / (p10_minus_p20 * p01_minus_p21 -

+//			     p00_minus_p20 * p11_minus_p21);

+//

+//	ystepdenominv = -xstepdenominv;

+	fildl	C(r_p0)+0		// r_p0[0]

+	fildl	C(r_p2)+0		// r_p2[0] | r_p0[0]

+	fildl	C(r_p0)+4		// r_p0[1] | r_p2[0] | r_p0[0]

+	fildl	C(r_p2)+4		// r_p2[1] | r_p0[1] | r_p2[0] | r_p0[0]

+	fildl	C(r_p1)+0		// r_p1[0] | r_p2[1] | r_p0[1] | r_p2[0] | r_p0[0]

+	fildl	C(r_p1)+4		// r_p1[1] | r_p1[0] | r_p2[1] | r_p0[1] |

+							//  r_p2[0] | r_p0[0]

+	fxch	%st(3)			// r_p0[1] | r_p1[0] | r_p2[1] | r_p1[1] |

+							//  r_p2[0] | r_p0[0]

+	fsub	%st(2),%st(0)	// p01_minus_p21 | r_p1[0] | r_p2[1] | r_p1[1] |

+							//  r_p2[0] | r_p0[0]

+	fxch	%st(1)			// r_p1[0] | p01_minus_p21 | r_p2[1] | r_p1[1] |

+							//  r_p2[0] | r_p0[0]

+	fsub	%st(4),%st(0)	// p10_minus_p20 | p01_minus_p21 | r_p2[1] |

+							//  r_p1[1] | r_p2[0] | r_p0[0]

+	fxch	%st(5)			// r_p0[0] | p01_minus_p21 | r_p2[1] |

+							//  r_p1[1] | r_p2[0] | p10_minus_p20

+	fsubp	%st(0),%st(4)	// p01_minus_p21 | r_p2[1] | r_p1[1] |

+							//  p00_minus_p20 | p10_minus_p20

+	fxch	%st(2)			// r_p1[1] | r_p2[1] | p01_minus_p21 |

+							//  p00_minus_p20 | p10_minus_p20

+	fsubp	%st(0),%st(1)	// p11_minus_p21 | p01_minus_p21 |

+							//  p00_minus_p20 | p10_minus_p20

+	fxch	%st(1)			// p01_minus_p21 | p11_minus_p21 |

+							//  p00_minus_p20 | p10_minus_p20

+	flds	C(d_xdenom)		// d_xdenom | p01_minus_p21 | p11_minus_p21 |

+							//  p00_minus_p20 | p10_minus_p20

+	fxch	%st(4)			// p10_minus_p20 | p01_minus_p21 | p11_minus_p21 |

+							//  p00_minus_p20 | d_xdenom

+	fstps	p10_minus_p20	// p01_minus_p21 | p11_minus_p21 |

+							//  p00_minus_p20 | d_xdenom

+	fstps	p01_minus_p21	// p11_minus_p21 | p00_minus_p20 | xstepdenominv (this is the d_xdenom value loaded above; no division is done here)

+	fxch	%st(2)			// xstepdenominv | p00_minus_p20 | p11_minus_p21

+//// ceil () for light so positive steps are exaggerated, negative steps

+//// diminished,  pushing us away from underflow toward overflow. Underflow is

+//// very visible, overflow is very unlikely, because of ambient lighting

+//	t0 = r_p0[4] - r_p2[4];

+//	t1 = r_p1[4] - r_p2[4];

+	fildl	C(r_p2)+16		// r_p2[4] | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+	fildl	C(r_p0)+16		// r_p0[4] | r_p2[4] | xstepdenominv |

+							//  p00_minus_p20 | p11_minus_p21

+	fildl	C(r_p1)+16		// r_p1[4] | r_p0[4] | r_p2[4] | xstepdenominv |

+							//  p00_minus_p20 | p11_minus_p21

+	fxch	%st(2)			// r_p2[4] | r_p0[4] | r_p1[4] | xstepdenominv |

+							//  p00_minus_p20 | p11_minus_p21

+	fld		%st(0)			// r_p2[4] | r_p2[4] | r_p0[4] | r_p1[4] |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fsubrp	%st(0),%st(2)	// r_p2[4] | t0 | r_p1[4] | xstepdenominv |

+							//  p00_minus_p20 | p11_minus_p21

+	fsubrp	%st(0),%st(2)	// t0 | t1 | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+//	r_lstepx = (int)

+//			ceil((t1 * p01_minus_p21 - t0 * p11_minus_p21) * xstepdenominv);

+//	r_lstepy = (int)

+//			ceil((t1 * p00_minus_p20 - t0 * p10_minus_p20) * ystepdenominv);

+	fld		%st(0)			// t0 | t0 | t1 | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+	fmul	%st(5),%st(0)	// t0*p11_minus_p21 | t0 | t1 | xstepdenominv |

+							//  p00_minus_p20 | p11_minus_p21

+	fxch	%st(2)			// t1 | t0 | t0*p11_minus_p21 | xstepdenominv |

+							//  p00_minus_p20 | p11_minus_p21

+	fld		%st(0)			// t1 | t1 | t0 | t0*p11_minus_p21 |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fmuls	p01_minus_p21	// t1*p01_minus_p21 | t1 | t0 | t0*p11_minus_p21 |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fxch	%st(2)			// t0 | t1 | t1*p01_minus_p21 | t0*p11_minus_p21 |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fmuls	p10_minus_p20	// t0*p10_minus_p20 | t1 | t1*p01_minus_p21 |

+							//  t0*p11_minus_p21 | xstepdenominv |

+							//  p00_minus_p20 | p11_minus_p21

+	fxch	%st(1)			// t1 | t0*p10_minus_p20 | t1*p01_minus_p21 |

+							//  t0*p11_minus_p21 | xstepdenominv |

+							//  p00_minus_p20 | p11_minus_p21

+	fmul	%st(5),%st(0)	// t1*p00_minus_p20 | t0*p10_minus_p20 |

+							//  t1*p01_minus_p21 | t0*p11_minus_p21 |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fxch	%st(2)			// t1*p01_minus_p21 | t0*p10_minus_p20 |

+							//  t1*p00_minus_p20 | t0*p11_minus_p21 |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fsubp	%st(0),%st(3)	// t0*p10_minus_p20 | t1*p00_minus_p20 |

+							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fsubrp	%st(0),%st(1)	// t1*p00_minus_p20 - t0*p10_minus_p20 |

+							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fld		%st(2)			// xstepdenominv |

+							//  t1*p00_minus_p20 - t0*p10_minus_p20 |

+							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fmuls	float_minus_1	// ystepdenominv |

+							//  t1*p00_minus_p20 - t0*p10_minus_p20 |

+							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fxch	%st(2)			// t1*p01_minus_p21 - t0*p11_minus_p21 |

+							//  t1*p00_minus_p20 - t0*p10_minus_p20 |

+							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+	fmul	%st(3),%st(0)	// (t1*p01_minus_p21 - t0*p11_minus_p21)*

+							//   xstepdenominv |

+							//  t1*p00_minus_p20 - t0*p10_minus_p20 |

+							//   | ystepdenominv | xstepdenominv |

+							//   p00_minus_p20 | p11_minus_p21

+	fxch	%st(1)			// t1*p00_minus_p20 - t0*p10_minus_p20 |

+							//  (t1*p01_minus_p21 - t0*p11_minus_p21)*

+							//   xstepdenominv | ystepdenominv |

+							//   xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fmul	%st(2),%st(0)	// (t1*p00_minus_p20 - t0*p10_minus_p20)*

+							//  ystepdenominv |

+							//  (t1*p01_minus_p21 - t0*p11_minus_p21)*

+							//  xstepdenominv | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fldcw	ceil_cw			// load control word ceil_cw for the two light-step stores (presumably round-up, matching the ceil() above -- confirm)

+	fistpl	C(r_lstepy)		// r_lstepx | ystepdenominv | xstepdenominv |

+							//  p00_minus_p20 | p11_minus_p21

+	fistpl	C(r_lstepx)		// ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+	fldcw	single_cw		// switch to control word single_cw for all remaining stores

+//	t0 = r_p0[2] - r_p2[2];

+//	t1 = r_p1[2] - r_p2[2];

+	fildl	C(r_p2)+8		// r_p2[2] | ystepdenominv | xstepdenominv |

+							//  p00_minus_p20 | p11_minus_p21

+	fildl	C(r_p0)+8		// r_p0[2] | r_p2[2] | ystepdenominv |

+							//   xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fildl	C(r_p1)+8		// r_p1[2] | r_p0[2] | r_p2[2] | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fxch	%st(2)			// r_p2[2] | r_p0[2] | r_p1[2] | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fld		%st(0)			// r_p2[2] | r_p2[2] | r_p0[2] | r_p1[2] |

+							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+	fsubrp	%st(0),%st(2)	// r_p2[2] | t0 | r_p1[2] | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fsubrp	%st(0),%st(2)	// t0 | t1 | ystepdenominv | xstepdenominv |

+							//  p00_minus_p20 | p11_minus_p21

+//	r_sstepx = (int)((t1 * p01_minus_p21 - t0 * p11_minus_p21) *

+//			xstepdenominv);

+//	r_sstepy = (int)((t1 * p00_minus_p20 - t0 * p10_minus_p20) *

+//			ystepdenominv);

+	fld		%st(0)			// t0 | t0 | t1 | ystepdenominv | xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fmul	%st(6),%st(0)	// t0*p11_minus_p21 | t0 | t1 | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fxch	%st(2)			// t1 | t0 | t0*p11_minus_p21 | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fld		%st(0)			// t1 | t1 | t0 | t0*p11_minus_p21 |

+							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+	fmuls	p01_minus_p21	// t1*p01_minus_p21 | t1 | t0 | t0*p11_minus_p21 |

+							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+	fxch	%st(2)			// t0 | t1 | t1*p01_minus_p21 | t0*p11_minus_p21 |

+							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+	fmuls	p10_minus_p20	// t0*p10_minus_p20 | t1 | t1*p01_minus_p21 |

+							//  t0*p11_minus_p21 | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fxch	%st(1)			// t1 | t0*p10_minus_p20 | t1*p01_minus_p21 |

+							//  t0*p11_minus_p21 | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fmul	%st(6),%st(0)	// t1*p00_minus_p20 | t0*p10_minus_p20 |

+							//  t1*p01_minus_p21 | t0*p11_minus_p21 |

+							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+	fxch	%st(2)			// t1*p01_minus_p21 | t0*p10_minus_p20 |

+							//  t1*p00_minus_p20 | t0*p11_minus_p21 |

+							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+	fsubp	%st(0),%st(3)	// t0*p10_minus_p20 | t1*p00_minus_p20 |

+							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

+							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+	fsubrp	%st(0),%st(1)	// t1*p00_minus_p20 - t0*p10_minus_p20 |

+							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

+							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+	fmul	%st(2),%st(0)	// (t1*p00_minus_p20 - t0*p10_minus_p20)*

+							//   ystepdenominv |

+							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

+							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+	fxch	%st(1)			// t1*p01_minus_p21 - t0*p11_minus_p21 |

+							//  (t1*p00_minus_p20 - t0*p10_minus_p20)*

+							//   ystepdenominv | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fmul	%st(3),%st(0)	// (t1*p01_minus_p21 - t0*p11_minus_p21)*

+							//  xstepdenominv |

+							//  (t1*p00_minus_p20 - t0*p10_minus_p20)*

+							//  ystepdenominv | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fxch	%st(1)			// (t1*p00_minus_p20 - t0*p10_minus_p20)*

+							//  ystepdenominv |

+							//  (t1*p01_minus_p21 - t0*p11_minus_p21)*

+							//  xstepdenominv | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fistpl	C(r_sstepy)		// r_sstepx | ystepdenominv | xstepdenominv |

+							//  p00_minus_p20 | p11_minus_p21

+	fistpl	C(r_sstepx)		// ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+//	t0 = r_p0[3] - r_p2[3];

+//	t1 = r_p1[3] - r_p2[3];

+	fildl	C(r_p2)+12		// r_p2[3] | ystepdenominv | xstepdenominv |

+							//  p00_minus_p20 | p11_minus_p21

+	fildl	C(r_p0)+12		// r_p0[3] | r_p2[3] | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fildl	C(r_p1)+12		// r_p1[3] | r_p0[3] | r_p2[3] | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fxch	%st(2)			// r_p2[3] | r_p0[3] | r_p1[3] | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fld		%st(0)			// r_p2[3] | r_p2[3] | r_p0[3] | r_p1[3] |

+							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+	fsubrp	%st(0),%st(2)	// r_p2[3] | t0 | r_p1[3] | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fsubrp	%st(0),%st(2)	// t0 | t1 | ystepdenominv | xstepdenominv |

+							//  p00_minus_p20 | p11_minus_p21

+//	r_tstepx = (int)((t1 * p01_minus_p21 - t0 * p11_minus_p21) *

+//			xstepdenominv);

+//	r_tstepy = (int)((t1 * p00_minus_p20 - t0 * p10_minus_p20) *

+//			ystepdenominv);

+	fld		%st(0)			// t0 | t0 | t1 | ystepdenominv | xstepdenominv |

+							//  p00_minus_p20 | p11_minus_p21

+	fmul	%st(6),%st(0)	// t0*p11_minus_p21 | t0 | t1 | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fxch	%st(2)			// t1 | t0 | t0*p11_minus_p21 | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fld		%st(0)			// t1 | t1 | t0 | t0*p11_minus_p21 |

+							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+	fmuls	p01_minus_p21	// t1*p01_minus_p21 | t1 | t0 | t0*p11_minus_p21 |

+							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+	fxch	%st(2)			// t0 | t1 | t1*p01_minus_p21 | t0*p11_minus_p21 |

+							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+	fmuls	p10_minus_p20	// t0*p10_minus_p20 | t1 | t1*p01_minus_p21 |

+							//  t0*p11_minus_p21 | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fxch	%st(1)			// t1 | t0*p10_minus_p20 | t1*p01_minus_p21 |

+							//  t0*p11_minus_p21 | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fmul	%st(6),%st(0)	// t1*p00_minus_p20 | t0*p10_minus_p20 |

+							//  t1*p01_minus_p21 | t0*p11_minus_p21 |

+							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+	fxch	%st(2)			// t1*p01_minus_p21 | t0*p10_minus_p20 |

+							//  t1*p00_minus_p20 | t0*p11_minus_p21 |

+							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+	fsubp	%st(0),%st(3)	// t0*p10_minus_p20 | t1*p00_minus_p20 |

+							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

+							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+	fsubrp	%st(0),%st(1)	// t1*p00_minus_p20 - t0*p10_minus_p20 |

+							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

+							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+	fmul	%st(2),%st(0)	// (t1*p00_minus_p20 - t0*p10_minus_p20)*

+							//   ystepdenominv |

+							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

+							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+	fxch	%st(1)			// t1*p01_minus_p21 - t0*p11_minus_p21 |

+							//  (t1*p00_minus_p20 - t0*p10_minus_p20)*

+							//  ystepdenominv | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fmul	%st(3),%st(0)	// (t1*p01_minus_p21 - t0*p11_minus_p21)*

+							//  xstepdenominv |

+							//  (t1*p00_minus_p20 - t0*p10_minus_p20)*

+							//  ystepdenominv | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fxch	%st(1)			// (t1*p00_minus_p20 - t0*p10_minus_p20)*

+							//  ystepdenominv |

+							//  (t1*p01_minus_p21 - t0*p11_minus_p21)*

+							//  xstepdenominv | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fistpl	C(r_tstepy)		// r_tstepx | ystepdenominv | xstepdenominv |

+							//  p00_minus_p20 | p11_minus_p21

+	fistpl	C(r_tstepx)		// ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+//	t0 = r_p0[5] - r_p2[5];

+//	t1 = r_p1[5] - r_p2[5];

+	fildl	C(r_p2)+20		// r_p2[5] | ystepdenominv | xstepdenominv |

+							//  p00_minus_p20 | p11_minus_p21

+	fildl	C(r_p0)+20		// r_p0[5] | r_p2[5] | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fildl	C(r_p1)+20		// r_p1[5] | r_p0[5] | r_p2[5] | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fxch	%st(2)			// r_p2[5] | r_p0[5] | r_p1[5] | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fld		%st(0)			// r_p2[5] | r_p2[5] | r_p0[5] | r_p1[5] |

+							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  p11_minus_p21

+	fsubrp	%st(0),%st(2)	// r_p2[5] | t0 | r_p1[5] | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 | p11_minus_p21

+	fsubrp	%st(0),%st(2)	// t0 | t1 | ystepdenominv | xstepdenominv |

+							//  p00_minus_p20 | p11_minus_p21

+//	r_zistepx = (int)((t1 * p01_minus_p21 - t0 * p11_minus_p21) *

+//			xstepdenominv);

+//	r_zistepy = (int)((t1 * p00_minus_p20 - t0 * p10_minus_p20) *

+//			ystepdenominv);

+	fld		%st(0)			// t0 | t0 | t1 | ystepdenominv | xstepdenominv |

+							//  p00_minus_p20 | p11_minus_p21

+	fmulp	%st(0),%st(6)	// t0 | t1 | ystepdenominv | xstepdenominv |

+							//  p00_minus_p20 | t0*p11_minus_p21

+	fxch	%st(1)			// t1 | t0 | ystepdenominv | xstepdenominv |

+							//  p00_minus_p20 | t0*p11_minus_p21

+	fld		%st(0)			// t1 | t1 | t0 | ystepdenominv | xstepdenominv |

+							//  p00_minus_p20 | t0*p11_minus_p21

+	fmuls	p01_minus_p21	// t1*p01_minus_p21 | t1 | t0 | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 |

+							//  t0*p11_minus_p21

+	fxch	%st(2)			// t0 | t1 | t1*p01_minus_p21 | ystepdenominv |

+							//  xstepdenominv | p00_minus_p20 |

+							//  t0*p11_minus_p21

+	fmuls	p10_minus_p20	// t0*p10_minus_p20 | t1 | t1*p01_minus_p21 |

+							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  t0*p11_minus_p21

+	fxch	%st(1)			// t1 | t0*p10_minus_p20 | t1*p01_minus_p21 |

+							//  ystepdenominv | xstepdenominv | p00_minus_p20 |

+							//  t0*p11_minus_p21

+	fmulp	%st(0),%st(5)	// t0*p10_minus_p20 | t1*p01_minus_p21 |

+							//  ystepdenominv | xstepdenominv |

+							//  t1*p00_minus_p20 | t0*p11_minus_p21

+	fxch	%st(5)			// t0*p11_minus_p21 | t1*p01_minus_p21 |

+							//  ystepdenominv | xstepdenominv |

+							//  t1*p00_minus_p20 | t0*p10_minus_p20

+	fsubrp	%st(0),%st(1)	// t1*p01_minus_p21 - t0*p11_minus_p21 |

+							//  ystepdenominv | xstepdenominv |

+							//  t1*p00_minus_p20 | t0*p10_minus_p20

+	fxch	%st(3)			// t1*p00_minus_p20 | ystepdenominv |

+							//  xstepdenominv |

+							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

+							//  t0*p10_minus_p20

+	fsubp	%st(0),%st(4)	// ystepdenominv | xstepdenominv |

+							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

+							//  t1*p00_minus_p20 - t0*p10_minus_p20

+	fxch	%st(1)			// xstepdenominv | ystepdenominv |

+							//  t1*p01_minus_p21 - t0*p11_minus_p21 |

+							//  t1*p00_minus_p20 - t0*p10_minus_p20

+	fmulp	%st(0),%st(2)	// ystepdenominv |

+							//  (t1*p01_minus_p21 - t0*p11_minus_p21) *

+							//  xstepdenominv |

+							//  t1*p00_minus_p20 - t0*p10_minus_p20

+	fmulp	%st(0),%st(2)	// (t1*p01_minus_p21 - t0*p11_minus_p21) *

+							//  xstepdenominv |

+							//  (t1*p00_minus_p20 - t0*p10_minus_p20) *

+							//  ystepdenominv

+	fistpl	C(r_zistepx)	// (t1*p00_minus_p20 - t0*p10_minus_p20) *

+							//  ystepdenominv

+	fistpl	C(r_zistepy)	// the FPU stack is empty from here on

+//	a_sstepxfrac = r_sstepx << 16;

+//	a_tstepxfrac = r_tstepx << 16;

+//

+//	a_ststepxwhole = r_affinetridesc.skinwidth * (r_tstepx >> 16) +

+//			(r_sstepx >> 16);

+	movl	C(r_sstepx),%eax

+	movl	C(r_tstepx),%edx

+	shll	$16,%eax			// eax = r_sstepx << 16

+	shll	$16,%edx			// edx = r_tstepx << 16

+	movl	%eax,C(a_sstepxfrac)

+	movl	%edx,C(a_tstepxfrac)

+	movl	C(r_sstepx),%ecx

+	movl	C(r_tstepx),%eax

+	sarl	$16,%ecx			// ecx = r_sstepx >> 16 (arithmetic shift keeps the sign)

+	sarl	$16,%eax			// eax = r_tstepx >> 16

+	imull	skinwidth(%esp)		// edx:eax = eax * the skinwidth stack argument (one-operand form; edx is clobbered)

+	addl	%ecx,%eax			// eax = skinwidth * (r_tstepx >> 16) + (r_sstepx >> 16)

+	movl	%eax,C(a_ststepxwhole)

+	ret

+//----------------------------------------------------------------------

+// recursive subdivision affine triangle drawing code: D_PolysetRecursiveTriangle (lp1, lp2, lp3)

+// splits any edge longer than one pixel in x or y at its midpoint, z-tests and plots the midpoint when the split edge is a leading edge, then recurses on the two halves

+// not C-callable because of stdcall return (the three stack arguments are popped by ret $12); lp1/lp2/lp3 below are %esp offsets valid after the four register pushes

+//----------------------------------------------------------------------

+#define lp1	4+16

+#define lp2	8+16

+#define lp3	12+16

+.globl C(D_PolysetRecursiveTriangle)

+C(D_PolysetRecursiveTriangle):

+	pushl	%ebp				// preserve caller stack frame pointer

+	pushl	%esi				// preserve register variables

+	pushl	%edi

+	pushl	%ebx

+//	int		*temp;

+//	int		d;

+//	int		new[6];

+//	int		i;

+//	int		z;

+//	short	*zbuf;

+	movl	lp2(%esp),%esi		// %esi = lp2

+	movl	lp1(%esp),%ebx		// %ebx = lp1

+	movl	lp3(%esp),%edi		// %edi = lp3

+//	d = lp2[0] - lp1[0];

+//	if (d < -1 || d > 1)

+//		goto split;

+	movl	0(%esi),%eax		// lp2[0]

+	movl	0(%ebx),%edx		// lp1[0]

+	movl	4(%esi),%ebp		// lp2[1]

+	subl	%edx,%eax			// d = lp2[0] - lp1[0]

+	movl	4(%ebx),%ecx		// lp1[1]

+	subl	%ecx,%ebp			// lp2[1] - lp1[1], tested further down

+	incl	%eax				// d + 1

+	cmpl	$2,%eax				// unsigned (d + 1) > 2 is the same as d < -1 || d > 1

+	ja		LSplit

+//	d = lp2[1] - lp1[1];

+//	if (d < -1 || d > 1)

+//		goto split;

+	movl	0(%edi),%eax		// preload lp3[0] for the next test

+	incl	%ebp

+	cmpl	$2,%ebp

+	ja		LSplit

+//	d = lp3[0] - lp2[0];

+//	if (d < -1 || d > 1)

+//		goto split2;

+	movl	0(%esi),%edx		// lp2[0]

+	movl	4(%edi),%ebp		// lp3[1]

+	subl	%edx,%eax			// d = lp3[0] - lp2[0]

+	movl	4(%esi),%ecx		// lp2[1]

+	subl	%ecx,%ebp			// lp3[1] - lp2[1], tested further down

+	incl	%eax

+	cmpl	$2,%eax

+	ja		LSplit2

+//	d = lp3[1] - lp2[1];

+//	if (d < -1 || d > 1)

+//		goto split2;

+	movl	0(%ebx),%eax		// preload lp1[0] for the next test

+	incl	%ebp

+	cmpl	$2,%ebp

+	ja		LSplit2

+//	d = lp1[0] - lp3[0];

+//	if (d < -1 || d > 1)

+//		goto split3;

+	movl	0(%edi),%edx		// lp3[0]

+	movl	4(%ebx),%ebp		// lp1[1]

+	subl	%edx,%eax			// d = lp1[0] - lp3[0]

+	movl	4(%edi),%ecx		// lp3[1]

+	subl	%ecx,%ebp			// lp1[1] - lp3[1]

+	incl	%eax

+	incl	%ebp

+	movl	%ebx,%edx			// temp = lp1, consumed at LSplit3

+	cmpl	$2,%eax

+	ja		LSplit3

+//	d = lp1[1] - lp3[1];

+//	if (d < -1 || d > 1)

+//	{

+//split3:

+//		temp = lp1;

+//		lp1 = lp3;

+//		lp3 = lp2;

+//		lp2 = temp;

+//		goto split;

+//	}

+//

+//	return;			// entire tri is filled

+//

+	cmpl	$2,%ebp

+	jna		LDone				// no edge needs splitting: return

+LSplit3:

+	movl	%edi,%ebx			// lp1 = lp3

+	movl	%esi,%edi			// lp3 = lp2

+	movl	%edx,%esi			// lp2 = temp (the old lp1)

+	jmp		LSplit

+//split2:

+LSplit2:

+//	temp = lp1;

+//	lp1 = lp2;

+//	lp2 = lp3;

+//	lp3 = temp;

+	movl	%ebx,%eax			// temp = lp1

+	movl	%esi,%ebx			// lp1 = lp2

+	movl	%edi,%esi			// lp2 = lp3

+	movl	%eax,%edi			// lp3 = temp

+//split:

+LSplit:

+	subl	$24,%esp		// allocate space for a new vertex

+//// split this edge

+//	new[0] = (lp1[0] + lp2[0]) >> 1;

+//	new[1] = (lp1[1] + lp2[1]) >> 1;

+//	new[2] = (lp1[2] + lp2[2]) >> 1;

+//	new[3] = (lp1[3] + lp2[3]) >> 1;

+//	new[5] = (lp1[5] + lp2[5]) >> 1;

+	movl	8(%ebx),%eax		// lp1[2]

+	movl	8(%esi),%edx		// lp2[2]

+	movl	12(%ebx),%ecx		// lp1[3]

+	addl	%edx,%eax

+	movl	12(%esi),%edx		// lp2[3]

+	sarl	$1,%eax

+	addl	%edx,%ecx

+	movl	%eax,8(%esp)		// new[2]

+	movl	20(%ebx),%eax		// lp1[5]

+	sarl	$1,%ecx

+	movl	20(%esi),%edx		// lp2[5]

+	movl	%ecx,12(%esp)		// new[3]

+	addl	%edx,%eax

+	movl	0(%ebx),%ecx		// lp1[0], kept in %ecx for the leading-edge test

+	movl	0(%esi),%edx		// lp2[0]

+	sarl	$1,%eax

+	addl	%ecx,%edx

+	movl	%eax,20(%esp)		// new[5]

+	movl	4(%ebx),%eax		// lp1[1], kept in %eax for the leading-edge test

+	sarl	$1,%edx

+	movl	4(%esi),%ebp		// lp2[1]

+	movl	%edx,0(%esp)		// new[0]

+	addl	%eax,%ebp

+	sarl	$1,%ebp

+	movl	%ebp,4(%esp)		// new[1]; new[4] at 16(%esp) is left unset by this routine

+//// draw the point if splitting a leading edge

+//	if (lp2[1] > lp1[1])

+//		goto nodraw;

+	cmpl	%eax,4(%esi)		// lp2[1] compared with lp1[1]

+	jg		LNoDraw

+//	if ((lp2[1] == lp1[1]) && (lp2[0] < lp1[0]))

+//		goto nodraw;

+	movl	0(%esi),%edx		// lp2[0]; movl leaves the flags of the cmpl above intact

+	jnz		LDraw				// lp2[1] != lp1[1]: draw without the x test

+	cmpl	%ecx,%edx			// lp2[0] compared with lp1[0]

+	jl		LNoDraw

+LDraw:

+// z = new[5] >> 16;

+	movl	20(%esp),%edx		// new[5]

+	movl	4(%esp),%ecx		// new[1]

+	sarl	$16,%edx			// z

+	movl	0(%esp),%ebp		// new[0]

+//	zbuf = zspantable[new[1]] + new[0];

+	movl	C(zspantable)(,%ecx,4),%eax	// zspantable[new[1]]

+//	if (z >= *zbuf)

+//	{

+	cmpw	(%eax,%ebp,2),%dx	// 16-bit signed compare of z with *zbuf

+	jnge	LNoDraw				// z < *zbuf: do not plot

+//		int		pix;

+//

+//		*zbuf = z;

+	movw	%dx,(%eax,%ebp,2)

+//		pix = d_pcolormap[skintable[new[3]>>16][new[2]>>16]];

+	movl	12(%esp),%eax		// new[3]

+	sarl	$16,%eax

+	movl	8(%esp),%edx		// new[2]

+	sarl	$16,%edx

+	subl	%ecx,%ecx			// %ecx = 0 so the byte loaded into %cl is zero-extended

+	movl	C(skintable)(,%eax,4),%eax	// skintable[new[3]>>16]

+	movl	4(%esp),%ebp		// new[1]

+	movb	(%eax,%edx,),%cl	// skin texel

+	movl	C(d_pcolormap),%edx

+	movb	(%edx,%ecx,),%dl	// pix = d_pcolormap[texel]

+	movl	0(%esp),%ecx		// new[0]

+//		d_viewbuffer[d_scantable[new[1]] + new[0]] = pix;

+	movl	C(d_scantable)(,%ebp,4),%eax	// d_scantable[new[1]]

+	addl	%eax,%ecx

+	movl	C(d_viewbuffer),%eax

+	movb	%dl,(%eax,%ecx,1)

+//	}

+//

+//nodraw:

+LNoDraw:

+//// recursively continue

+//	D_PolysetRecursiveTriangle (lp3, lp1, new);

+	pushl	%esp				// third argument: new (the value %esp held before this push)

+	pushl	%ebx				// second argument: lp1

+	pushl	%edi				// first argument: lp3

+	call	C(D_PolysetRecursiveTriangle)	// callee pops its three arguments

+//	D_PolysetRecursiveTriangle (lp3, new, lp2);

+	movl	%esp,%ebx			// %ebx = new (lp1 is no longer needed)

+	pushl	%esi				// third argument: lp2

+	pushl	%ebx				// second argument: new

+	pushl	%edi				// first argument: lp3

+	call	C(D_PolysetRecursiveTriangle)

+	addl	$24,%esp			// release the new vertex

+LDone:

+	popl	%ebx				// restore register variables

+	popl	%edi

+	popl	%esi

+	popl	%ebp				// restore caller stack frame pointer

+	ret		$12					// return and pop the three pointer arguments

+//----------------------------------------------------------------------

+// 8-bpp horizontal span drawing code for affine polygons, with smooth

+// shading and no transparency; D_PolysetDrawSpans8 takes one stack argument, a pointer to a list of spanpackage_t records that ends at a record whose count is -999999

+//----------------------------------------------------------------------

+#define pspans	4+8

+.globl C(D_PolysetAff8Start)

+C(D_PolysetAff8Start):

+.globl C(D_PolysetDrawSpans8)

+C(D_PolysetDrawSpans8):

+	pushl	%esi				// preserve register variables

+	pushl	%ebx

+	movl	pspans(%esp),%esi	// point to the first span descriptor

+	movl	C(r_zistepx),%ecx

+	pushl	%ebp				// preserve caller's stack frame

+	pushl	%edi

+	rorl	$16,%ecx			// put high 16 bits of 1/z step in low word

+	movl	spanpackage_t_count(%esi),%edx	// count of the first span

+	movl	%ecx,lzistepx		// word-swapped 1/z step used by the draw loop

+LSpanLoop:

+//		lcount = d_aspancount - pspanpackage->count;

+//

+//		errorterm += erroradjustup;

+//		if (errorterm >= 0)

+//		{

+//			d_aspancount += d_countextrastep;

+//			errorterm -= erroradjustdown;

+//		}

+//		else

+//		{

+//			d_aspancount += ubasestep;

+//		}

+	movl	C(d_aspancount),%eax

+	subl	%edx,%eax			// lcount = d_aspancount - pspanpackage->count

+	movl	C(erroradjustup),%edx

+	movl	C(errorterm),%ebx

+	addl	%edx,%ebx			// errorterm += erroradjustup

+	js		LNoTurnover			// errorterm < 0: take the base step

+	movl	C(erroradjustdown),%edx

+	movl	C(d_countextrastep),%edi

+	subl	%edx,%ebx			// errorterm -= erroradjustdown

+	movl	C(d_aspancount),%ebp

+	movl	%ebx,C(errorterm)

+	addl	%edi,%ebp			// d_aspancount += d_countextrastep

+	movl	%ebp,C(d_aspancount)

+	jmp		LRightEdgeStepped

+LNoTurnover:

+	movl	C(d_aspancount),%edi

+	movl	C(ubasestep),%edx

+	movl	%ebx,C(errorterm)

+	addl	%edx,%edi			// d_aspancount += ubasestep

+	movl	%edi,C(d_aspancount)

+LRightEdgeStepped:

+	cmpl	$1,%eax				// lcount compared with 1

+	jl		LNextSpan			// lcount < 1: nothing to draw for this span

+	jz		LExactlyOneLong		// lcount == 1 (flags still from the cmpl above)

+//

+// set up advancetable

+//

+	movl	C(a_ststepxwhole),%ecx

+	movl	C(r_affinetridesc)+atd_skinwidth,%edx

+	movl	%ecx,advancetable+4	// advance base in t

+	addl	%edx,%ecx

+	movl	%ecx,advancetable	// advance extra in t

+	movl	C(a_tstepxfrac),%ecx

+	movw	C(r_lstepx),%cx		// low word = light step, high word = tfrac step

+	movl	%eax,%edx			// count

+	movl	%ecx,tstep

+	addl	$7,%edx

+	shrl	$3,%edx				// count of full and partial loops

+	movl	spanpackage_t_sfrac(%esi),%ebx

+	movw	%dx,%bx				// loop count replaces the low word; sfrac stays in the high word

+	movl	spanpackage_t_pz(%esi),%ecx

+	negl	%eax

+	movl	spanpackage_t_pdest(%esi),%edi

+	andl	$7,%eax		// 0->0, 1->7, 2->6, ... , 7->1

+	subl	%eax,%edi	// compensate for hardwired offsets

+	subl	%eax,%ecx	// pz entries are two bytes wide, so the index is subtracted twice

+	subl	%eax,%ecx

+	movl	spanpackage_t_tfrac(%esi),%edx

+	movw	spanpackage_t_light(%esi),%dx	// light replaces the low word; tfrac stays in the high word

+	movl	spanpackage_t_zi(%esi),%ebp

+	rorl	$16,%ebp	// put high 16 bits of 1/z in low word

+	pushl	%esi		// save the span pointer; %esi is reused for ptex

+	movl	spanpackage_t_ptex(%esi),%esi

+	jmp		aff8entryvec_table(,%eax,4)	// enter the unrolled loop through the table entry selected by (-lcount) & 7 (table is defined outside this excerpt)

+// %bx = count of full and partial loops

+// %ebx high word = sfrac

+// %ecx = pz

+// %dx = light

+// %edx high word = tfrac

+// %esi = ptex

+// %edi = pdest

+// %ebp = 1/z

+// tstep low word = C(r_lstepx)

+// tstep high word = C(a_tstepxfrac)

+// C(a_sstepxfrac) low word = 0

+// C(a_sstepxfrac) high word = C(a_sstepxfrac)

+LDrawLoop:

+// FIXME: do we need to clamp light? We may need at least a buffer bit to

+// keep it from poking into tfrac and causing problems

+LDraw8:

+	cmpw	(%ecx),%bp			// low word of %ebp (1/z) against the z-buffer entry

+	jl		Lp1					// behind the stored value: skip the pixel, still step

+	xorl	%eax,%eax

+	movb	%dh,%ah				// high byte of light forms bits 8-15 of the lookup index

+	movb	(%esi),%al			// skin texel forms bits 0-7 of the lookup index

+	movw	%bp,(%ecx)			// update the z-buffer

+	movb	0x12345678(%eax),%al	// placeholder base address, overwritten at run time by D_Aff8Patch

+LPatch8:

+	movb	%al,(%edi)

+Lp1:

+	addl	tstep,%edx			// step light (low word) and tfrac (high word); carry out = tfrac overflow

+	sbbl	%eax,%eax			// %eax = -1 if tfrac carried, else 0

+	addl	lzistepx,%ebp

+	adcl	$0,%ebp				// add the carry out of the high word back into the low word

+	addl	C(a_sstepxfrac),%ebx	// step sfrac (high word); carry out = sfrac overflow

+	adcl	advancetable+4(,%eax,4),%esi	// ptex += base step (%eax = 0) or extra step (%eax = -1), plus the sfrac carry

+LDraw7:	// pixel at pdest offset 1; same sequence as LDraw8

+	cmpw	2(%ecx),%bp

+	jl		Lp2

+	xorl	%eax,%eax

+	movb	%dh,%ah

+	movb	(%esi),%al

+	movw	%bp,2(%ecx)

+	movb	0x12345678(%eax),%al

+LPatch7:

+	movb	%al,1(%edi)

+Lp2:

+	addl	tstep,%edx

+	sbbl	%eax,%eax

+	addl	lzistepx,%ebp

+	adcl	$0,%ebp

+	addl	C(a_sstepxfrac),%ebx

+	adcl	advancetable+4(,%eax,4),%esi

+LDraw6:	// pixel at pdest offset 2

+	cmpw	4(%ecx),%bp

+	jl		Lp3

+	xorl	%eax,%eax

+	movb	%dh,%ah

+	movb	(%esi),%al

+	movw	%bp,4(%ecx)

+	movb	0x12345678(%eax),%al

+LPatch6:

+	movb	%al,2(%edi)

+Lp3:

+	addl	tstep,%edx

+	sbbl	%eax,%eax

+	addl	lzistepx,%ebp

+	adcl	$0,%ebp

+	addl	C(a_sstepxfrac),%ebx

+	adcl	advancetable+4(,%eax,4),%esi

+LDraw5:	// pixel at pdest offset 3

+	cmpw	6(%ecx),%bp

+	jl		Lp4

+	xorl	%eax,%eax

+	movb	%dh,%ah

+	movb	(%esi),%al

+	movw	%bp,6(%ecx)

+	movb	0x12345678(%eax),%al

+LPatch5:

+	movb	%al,3(%edi)

+Lp4:

+	addl	tstep,%edx

+	sbbl	%eax,%eax

+	addl	lzistepx,%ebp

+	adcl	$0,%ebp

+	addl	C(a_sstepxfrac),%ebx

+	adcl	advancetable+4(,%eax,4),%esi

+LDraw4:	// pixel at pdest offset 4

+	cmpw	8(%ecx),%bp

+	jl		Lp5

+	xorl	%eax,%eax

+	movb	%dh,%ah

+	movb	(%esi),%al

+	movw	%bp,8(%ecx)

+	movb	0x12345678(%eax),%al

+LPatch4:

+	movb	%al,4(%edi)

+Lp5:

+	addl	tstep,%edx

+	sbbl	%eax,%eax

+	addl	lzistepx,%ebp

+	adcl	$0,%ebp

+	addl	C(a_sstepxfrac),%ebx

+	adcl	advancetable+4(,%eax,4),%esi

+LDraw3:	// pixel at pdest offset 5

+	cmpw	10(%ecx),%bp

+	jl		Lp6

+	xorl	%eax,%eax

+	movb	%dh,%ah

+	movb	(%esi),%al

+	movw	%bp,10(%ecx)

+	movb	0x12345678(%eax),%al

+LPatch3:

+	movb	%al,5(%edi)

+Lp6:

+	addl	tstep,%edx

+	sbbl	%eax,%eax

+	addl	lzistepx,%ebp

+	adcl	$0,%ebp

+	addl	C(a_sstepxfrac),%ebx

+	adcl	advancetable+4(,%eax,4),%esi

+LDraw2:	// pixel at pdest offset 6

+	cmpw	12(%ecx),%bp

+	jl		Lp7

+	xorl	%eax,%eax

+	movb	%dh,%ah

+	movb	(%esi),%al

+	movw	%bp,12(%ecx)

+	movb	0x12345678(%eax),%al

+LPatch2:

+	movb	%al,6(%edi)

+Lp7:

+	addl	tstep,%edx

+	sbbl	%eax,%eax

+	addl	lzistepx,%ebp

+	adcl	$0,%ebp

+	addl	C(a_sstepxfrac),%ebx

+	adcl	advancetable+4(,%eax,4),%esi

+LDraw1:	// pixel at pdest offset 7

+	cmpw	14(%ecx),%bp

+	jl		Lp8

+	xorl	%eax,%eax

+	movb	%dh,%ah

+	movb	(%esi),%al

+	movw	%bp,14(%ecx)

+	movb	0x12345678(%eax),%al

+LPatch1:

+	movb	%al,7(%edi)

+Lp8:

+	addl	tstep,%edx

+	sbbl	%eax,%eax

+	addl	lzistepx,%ebp

+	adcl	$0,%ebp

+	addl	C(a_sstepxfrac),%ebx

+	adcl	advancetable+4(,%eax,4),%esi

+	addl	$8,%edi				// advance pdest past the eight pixel slots just handled

+	addl	$16,%ecx			// advance pz past eight two-byte entries

+	decw	%bx					// count down the loops in the low word; sfrac in the high word is untouched

+	jnz		LDrawLoop

+	popl	%esi				// restore spans pointer

+LNextSpan:

+	addl	$(spanpackage_t_size),%esi	// point to next span

+LNextSpanESISet:

+	movl	spanpackage_t_count(%esi),%edx	// count consumed at the top of LSpanLoop

+	cmpl	$-999999,%edx		// any more spans? (a count of -999999 ends the list)

+	jnz		LSpanLoop			// yes

+	popl	%edi

+	popl	%ebp				// restore the caller's stack frame

+	popl	%ebx				// restore register variables

+	popl	%esi

+	ret

+// draw a one-long span

+LExactlyOneLong:

+	movl	spanpackage_t_pz(%esi),%ecx

+	movl	spanpackage_t_zi(%esi),%ebp

+	rorl	$16,%ebp	// put high 16 bits of 1/z in low word

+	movl	spanpackage_t_ptex(%esi),%ebx

+	cmpw	(%ecx),%bp			// z test for the single pixel

+	jl		LNextSpan			// hidden: go on to the next span

+	xorl	%eax,%eax

+	movl	spanpackage_t_pdest(%esi),%edi

+	movb	spanpackage_t_light+1(%esi),%ah	// second byte of the light field forms bits 8-15 of the lookup index

+	addl	$(spanpackage_t_size),%esi	// point to next span

+	movb	(%ebx),%al			// skin texel forms bits 0-7 of the lookup index

+	movw	%bp,(%ecx)			// update the z-buffer

+	movb	0x12345678(%eax),%al	// placeholder base address, overwritten at run time by D_Aff8Patch

+LPatch9:

+	movb	%al,(%edi)

+	jmp		LNextSpanESISet		// %esi was already advanced above

+.globl C(D_PolysetAff8End)

+C(D_PolysetAff8End):

+#define pcolormap		4

+.globl C(D_Aff8Patch)

+C(D_Aff8Patch):

+	movl	pcolormap(%esp),%eax	// %eax = the single stack argument (colormap address)

+	movl	%eax,LPatch9-4			// lookup in the one-long span path

+	movl	%eax,LPatch8-4			// lookups of the unrolled loop, in code order:

+	movl	%eax,LPatch7-4			// each store overwrites the four bytes that end

+	movl	%eax,LPatch6-4			// at the LPatchN label, i.e. the 0x12345678

+	movl	%eax,LPatch5-4			// placeholder displacement of the preceding movb

+	movl	%eax,LPatch4-4

+	movl	%eax,LPatch3-4

+	movl	%eax,LPatch2-4

+	movl	%eax,LPatch1-4

+	ret

+//----------------------------------------------------------------------

+// Alias model polygon dispatching code, combined with subdivided affine

+// triangle drawing code; reserves the SPAN_SIZE stack buffer published through a_spans, then either tail-jumps to D_DrawNonSubdiv (drawtype == 0) or draws every triangle with D_PolysetRecursiveTriangle

+//----------------------------------------------------------------------

+.globl C(D_PolysetDraw)

+C(D_PolysetDraw):

+//	spanpackage_t	spans[DPS_MAXSPANS + 1 +

+//			((CACHE_SIZE - 1) / sizeof(spanpackage_t)) + 1];

+//						// one extra because of cache line pretouching

+//

+//	a_spans = (spanpackage_t *)

+//			(((intptr)&spans[0] + CACHE_SIZE - 1) & ~(CACHE_SIZE - 1));

+	subl	$(SPAN_SIZE),%esp	// reserve the spans buffer on the stack

+	movl	%esp,%eax

+	addl	$(CACHE_SIZE - 1),%eax

+	andl	$(~(CACHE_SIZE - 1)),%eax	// round up to a CACHE_SIZE boundary

+	movl	%eax,C(a_spans)

+//	if (r_affinetridesc.drawtype)

+//		D_DrawSubdiv ();

+//	else

+//		D_DrawNonSubdiv ();

+	movl	C(r_affinetridesc)+atd_drawtype,%eax

+	testl	%eax,%eax

+	jz		C(D_DrawNonSubdiv)	// jump, not call: D_DrawNonSubdiv releases the spans buffer and returns to our caller

+	pushl	%ebp				// preserve caller stack frame pointer

+//	lnumtriangles = r_affinetridesc.numtriangles;

+	movl	C(r_affinetridesc)+atd_numtriangles,%ebp

+	pushl	%esi				// preserve register variables

+	shll	$(mtri_shift),%ebp	// byte offset just past the last triangle; NOTE(review): a count of 0 is not guarded against here

+	pushl	%ebx

+//	ptri = r_affinetridesc.ptriangles;

+	movl	C(r_affinetridesc)+atd_ptriangles,%ebx

+	pushl	%edi

+//	mtriangle_t		*ptri;

+//	finalvert_t		*pfv, *index0, *index1, *index2;

+//	int				i;

+//	int				lnumtriangles;

+//	int				s0, s1, s2;

+//	pfv = r_affinetridesc.pfinalverts;

+	movl	C(r_affinetridesc)+atd_pfinalverts,%edi

+//	for (i=0 ; i<lnumtriangles ; i++)

+//	{

+Llooptop:

+//		index0 = pfv + ptri[i].vertindex[0];

+//		index1 = pfv + ptri[i].vertindex[1];

+//		index2 = pfv + ptri[i].vertindex[2];

+	movl	mtri_vertindex+0-mtri_size(%ebx,%ebp,),%ecx	// %ebp counts down, so triangles are visited from the last to the first

+	movl	mtri_vertindex+4-mtri_size(%ebx,%ebp,),%esi

+	shll	$(fv_shift),%ecx

+	movl	mtri_vertindex+8-mtri_size(%ebx,%ebp,),%edx

+	shll	$(fv_shift),%esi

+	addl	%edi,%ecx			// %ecx = index0

+	shll	$(fv_shift),%edx

+	addl	%edi,%esi			// %esi = index1

+	addl	%edi,%edx			// %edx = index2

+//		if (((index0->v[1]-index1->v[1]) *

+//				(index0->v[0]-index2->v[0]) -

+//				(index0->v[0]-index1->v[0])*(index0->v[1]-index2->v[1])) >= 0)

+//		{

+//			continue;

+//		}

+//

+//		d_pcolormap = &((byte *)acolormap)[index0->v[4] & 0xFF00];

+	fildl	fv_v+4(%ecx)	// i0v1

+	fildl	fv_v+4(%esi)	// i1v1 | i0v1

+	fildl	fv_v+0(%ecx)	// i0v0 | i1v1 | i0v1

+	fildl	fv_v+0(%edx)	// i2v0 | i0v0 | i1v1 | i0v1

+	fxch	%st(2)			// i1v1 | i0v0 | i2v0 | i0v1

+	fsubr	%st(3),%st(0)	// i0v1-i1v1 | i0v0 | i2v0 | i0v1

+	fildl	fv_v+0(%esi)	// i1v0 | i0v1-i1v1 | i0v0 | i2v0 | i0v1

+	fxch	%st(2)			// i0v0 | i0v1-i1v1 | i1v0 | i2v0 | i0v1

+	fsub	%st(0),%st(3)	// i0v0 | i0v1-i1v1 | i1v0 | i0v0-i2v0 | i0v1

+	fildl	fv_v+4(%edx)	// i2v1 | i0v0 | i0v1-i1v1 | i1v0 | i0v0-i2v0| i0v1

+	fxch	%st(1)			// i0v0 | i2v1 | i0v1-i1v1 | i1v0 | i0v0-i2v0| i0v1

+	fsubp	%st(0),%st(3)	// i2v1 | i0v1-i1v1 | i0v0-i1v0 | i0v0-i2v0 | i0v1

+	fxch	%st(1)			// i0v1-i1v1 | i2v1 | i0v0-i1v0 | i0v0-i2v0 | i0v1

+	fmulp	%st(0),%st(3)	// i2v1 | i0v0-i1v0 | i0v1-i1v1*i0v0-i2v0 | i0v1

+	fsubrp	%st(0),%st(3)	// i0v0-i1v0 | i0v1-i1v1*i0v0-i2v0 | i0v1-i2v1

+	movl	fv_v+16(%ecx),%eax	// index0->v[4]

+	andl	$0xFF00,%eax

+	fmulp	%st(0),%st(2)	// i0v1-i1v1*i0v0-i2v0 | i0v0-i1v0*i0v1-i2v1

+	addl	C(acolormap),%eax

+	fsubp	%st(0),%st(1)	// (i0v1-i1v1)*(i0v0-i2v0)-(i0v0-i1v0)*(i0v1-i2v1)

+	movl	%eax,C(d_pcolormap)

+	fstps	Ltemp				// store the result as a single-precision float

+	movl	Ltemp,%eax			// and examine its bit pattern with integer instructions

+	subl	$0x80000001,%eax	// no borrow only when the bits are >= 0x80000001, i.e. negative and not -0

+	jc		Lskip				// result >= 0: skip this triangle

+//		if (ptri[i].facesfront)

+//		{

+//			D_PolysetRecursiveTriangle(index0->v, index1->v, index2->v);

+	movl	mtri_facesfront-mtri_size(%ebx,%ebp,),%eax

+	testl	%eax,%eax

+	jz		Lfacesback

+	pushl	%edx				// third argument: index2

+	pushl	%esi				// second argument: index1

+	pushl	%ecx				// first argument: index0

+	call	C(D_PolysetRecursiveTriangle)	// callee pops the arguments

+	subl	$(mtri_size),%ebp	// step back one triangle

+	jnz		Llooptop

+	jmp		Ldone2

+//		}

+//		else

+//		{

+Lfacesback:

+//			s0 = index0->v[2];

+//			s1 = index1->v[2];

+//			s2 = index2->v[2];

+	movl	fv_v+8(%ecx),%eax

+	pushl	%eax				// save s0

+	movl	fv_v+8(%esi),%eax

+	pushl	%eax				// save s1

+	movl	fv_v+8(%edx),%eax

+	pushl	%eax				// save s2

+	pushl	%ecx				// save index0 and index2 across the call, which

+	pushl	%edx				// overwrites %ecx and %edx (%esi is preserved by the callee)

+//			if (index0->flags & ALIAS_ONSEAM)

+//				index0->v[2] += r_affinetridesc.seamfixupX16;

+	movl	C(r_affinetridesc)+atd_seamfixupX16,%eax

+	testl	$(ALIAS_ONSEAM),fv_flags(%ecx)

+	jz		Lp11

+	addl	%eax,fv_v+8(%ecx)

+Lp11:

+//			if (index1->flags & ALIAS_ONSEAM)

+//				index1->v[2] += r_affinetridesc.seamfixupX16;

+	testl	$(ALIAS_ONSEAM),fv_flags(%esi)

+	jz		Lp12

+	addl	%eax,fv_v+8(%esi)

+Lp12:

+//			if (index2->flags & ALIAS_ONSEAM)

+//				index2->v[2] += r_affinetridesc.seamfixupX16;

+	testl	$(ALIAS_ONSEAM),fv_flags(%edx)

+	jz		Lp13

+	addl	%eax,fv_v+8(%edx)

+Lp13:

+//			D_PolysetRecursiveTriangle(index0->v, index1->v, index2->v);

+	pushl	%edx

+	pushl	%esi

+	pushl	%ecx

+	call	C(D_PolysetRecursiveTriangle)

+//			index0->v[2] = s0;

+//			index1->v[2] = s1;

+//			index2->v[2] = s2;

+	popl	%edx				// index2

+	popl	%ecx				// index0

+	popl	%eax				// s2

+	movl	%eax,fv_v+8(%edx)

+	popl	%eax				// s1

+	movl	%eax,fv_v+8(%esi)

+	popl	%eax				// s0

+	movl	%eax,fv_v+8(%ecx)

+//		}

+//	}

+Lskip:

+	subl	$(mtri_size),%ebp	// step back one triangle

+	jnz		Llooptop

+Ldone2:

+	popl	%edi				// restore register variables

+	popl	%ebx

+	popl	%esi

+	popl	%ebp				// restore the caller stack frame pointer

+	addl	$(SPAN_SIZE),%esp	// release the spans buffer

+	ret

+//----------------------------------------------------------------------

+// Alias model triangle left-edge scanning code: D_PolysetScanLeftEdge (height) writes one spanpackage_t per scanline at d_pedgespanpackage and steps the d_* edge state, for as many lines as the low 16 bits of height

+//----------------------------------------------------------------------

+#define height	4+16

+.globl C(D_PolysetScanLeftEdge)

+C(D_PolysetScanLeftEdge):

+	pushl	%ebp				// preserve caller stack frame pointer

+	pushl	%esi				// preserve register variables

+	pushl	%edi

+	pushl	%ebx

+	movl	height(%esp),%eax

+	movl	C(d_sfrac),%ecx

+	andl	$0xFFFF,%eax		// only the low 16 bits of height are used as the count

+	movl	C(d_ptex),%ebx

+	orl		%eax,%ecx			// merge the count into the low word next to d_sfrac; presumably the low word of d_sfrac is zero here - confirm against the caller

+	movl	C(d_pedgespanpackage),%esi

+	movl	C(d_tfrac),%edx

+	movl	C(d_light),%edi

+	movl	C(d_zi),%ebp

+// %eax: scratch

+// %ebx: d_ptex

+// %ecx: d_sfrac in high word, count in low word

+// %edx: d_tfrac

+// %esi: d_pedgespanpackage, errorterm, scratch alternately

+// %edi: d_light

+// %ebp: d_zi

+//	do

+//	{

+LScanLoop:

+//		d_pedgespanpackage->ptex = ptex;

+//		d_pedgespanpackage->pdest = d_pdest;

+//		d_pedgespanpackage->pz = d_pz;

+//		d_pedgespanpackage->count = d_aspancount;

+//		d_pedgespanpackage->light = d_light;

+//		d_pedgespanpackage->zi = d_zi;

+//		d_pedgespanpackage->sfrac = d_sfrac << 16;

+//		d_pedgespanpackage->tfrac = d_tfrac << 16;

+	movl	%ebx,spanpackage_t_ptex(%esi)

+	movl	C(d_pdest),%eax

+	movl	%eax,spanpackage_t_pdest(%esi)

+	movl	C(d_pz),%eax

+	movl	%eax,spanpackage_t_pz(%esi)

+	movl	C(d_aspancount),%eax

+	movl	%eax,spanpackage_t_count(%esi)

+	movl	%edi,spanpackage_t_light(%esi)

+	movl	%ebp,spanpackage_t_zi(%esi)

+	movl	%ecx,spanpackage_t_sfrac(%esi)	// stored with the loop count still in the low word

+	movl	%edx,spanpackage_t_tfrac(%esi)

+// pretouch the next cache line

+	movb	spanpackage_t_size(%esi),%al	// the loaded byte itself is not used

+//		d_pedgespanpackage++;

+	addl	$(spanpackage_t_size),%esi

+	movl	C(erroradjustup),%eax

+	movl	%esi,C(d_pedgespanpackage)	// save the advanced pointer; %esi is reused for errorterm

+//		errorterm += erroradjustup;

+	movl	C(errorterm),%esi

+	addl	%eax,%esi			// sets the sign flag tested by the js below

+	movl	C(d_pdest),%eax		// movl leaves the flags intact

+//		if (errorterm >= 0)

+//		{

+	js		LNoLeftEdgeTurnover

+//			errorterm -= erroradjustdown;

+//			d_pdest += d_pdestextrastep;

+	subl	C(erroradjustdown),%esi

+	addl	C(d_pdestextrastep),%eax

+	movl	%esi,C(errorterm)

+	movl	%eax,C(d_pdest)

+//			d_pz += d_pzextrastep;

+//			d_aspancount += d_countextrastep;

+//			d_ptex += d_ptexextrastep;

+//			d_sfrac += d_sfracextrastep;

+//			d_ptex += d_sfrac >> 16;

+//			d_sfrac &= 0xFFFF;

+//			d_tfrac += d_tfracextrastep;

+	movl	C(d_pz),%eax

+	movl	C(d_aspancount),%esi

+	addl	C(d_pzextrastep),%eax

+	addl	C(d_sfracextrastep),%ecx	// carry out = overflow of sfrac in the high word

+	adcl	C(d_ptexextrastep),%ebx	// d_ptex += d_ptexextrastep plus the sfrac carry

+	addl	C(d_countextrastep),%esi

+	movl	%eax,C(d_pz)

+	movl	C(d_tfracextrastep),%eax

+	movl	%esi,C(d_aspancount)

+	addl	%eax,%edx			// carry out stands in for the (d_tfrac & 0x10000) test

+//			if (d_tfrac & 0x10000)

+//			{

+	jnc		LSkip1

+//				d_ptex += r_affinetridesc.skinwidth;

+//				d_tfrac &= 0xFFFF;

+	addl	C(r_affinetridesc)+atd_skinwidth,%ebx

+//			}

+LSkip1:

+//			d_light += d_lightextrastep;

+//			d_zi += d_ziextrastep;

+	addl	C(d_lightextrastep),%edi

+	addl	C(d_ziextrastep),%ebp

+//		}

+	movl	C(d_pedgespanpackage),%esi	// reload the span pointer for the next line

+	decl	%ecx				// count down the line counter in the low word

+	testl	$0xFFFF,%ecx		// continue while the low-word count is nonzero

+	jnz		LScanLoop

+	popl	%ebx

+	popl	%edi

+	popl	%esi

+	popl	%ebp

+	ret

+//		else

+//		{

+LNoLeftEdgeTurnover:

+	movl	%esi,C(errorterm)

+//			d_pdest += d_pdestbasestep;

+	addl	C(d_pdestbasestep),%eax

+	movl	%eax,C(d_pdest)

+//			d_pz += d_pzbasestep;

+//			d_aspancount += ubasestep;

+//			d_ptex += d_ptexbasestep;

+//			d_sfrac += d_sfracbasestep;

+//			d_ptex += d_sfrac >> 16;

+//			d_sfrac &= 0xFFFF;

+	movl	C(d_pz),%eax

+	movl	C(d_aspancount),%esi

+	addl	C(d_pzbasestep),%eax

+	addl	C(d_sfracbasestep),%ecx	// carry out = overflow of sfrac in the high word

+	adcl	C(d_ptexbasestep),%ebx	// d_ptex += d_ptexbasestep plus the sfrac carry

+	addl	C(ubasestep),%esi

+	movl	%eax,C(d_pz)

+	movl	%esi,C(d_aspancount)

+//			d_tfrac += d_tfracbasestep;

+	movl	C(d_tfracbasestep),%esi

+	addl	%esi,%edx			// carry out stands in for the (d_tfrac & 0x10000) test

+//			if (d_tfrac & 0x10000)

+//			{

+	jnc		LSkip2

+//				d_ptex += r_affinetridesc.skinwidth;

+//				d_tfrac &= 0xFFFF;

+	addl	C(r_affinetridesc)+atd_skinwidth,%ebx

+//			}

+LSkip2:

+//			d_light += d_lightbasestep;

+//			d_zi += d_zibasestep;

+	addl	C(d_lightbasestep),%edi

+	addl	C(d_zibasestep),%ebp

+//		}

+//	} while (--height);

+	movl	C(d_pedgespanpackage),%esi	// reload the span pointer for the next line

+	decl	%ecx				// count down the line counter in the low word

+	testl	$0xFFFF,%ecx		// continue while the low-word count is nonzero

+	jnz		LScanLoop

+	popl	%ebx

+	popl	%edi

+	popl	%esi

+	popl	%ebp

+	ret

+//----------------------------------------------------------------------

+// Alias model vertex drawing code: D_PolysetDrawFinalVerts (fv, numverts) z-tests and plots one pixel per vertex; fv and numverts below are %esp offsets valid after the first two pushes only

+//----------------------------------------------------------------------

+#define fv			4+8

+#define	numverts	8+8

+.globl C(D_PolysetDrawFinalVerts)

+C(D_PolysetDrawFinalVerts):

+	pushl	%ebp				// preserve caller stack frame pointer

+	pushl	%ebx

+//	int		i, z;

+//	short	*zbuf;

+	movl	numverts(%esp),%ecx	// %ecx = remaining vertex count

+	movl	fv(%esp),%ebx		// %ebx = current vertex

+	pushl	%esi				// preserve register variables

+	pushl	%edi

+LFVLoop:

+//	for (i=0 ; i<numverts ; i++, fv++)

+//	{

+//	// valid triangle coordinates for filling can include the bottom and

+//	// right clip edges, due to the fill rule; these shouldn't be drawn

+//		if ((fv->v[0] < r_refdef.vrectright) &&

+//			(fv->v[1] < r_refdef.vrectbottom))

+//		{

+	movl	fv_v+0(%ebx),%eax	// fv->v[0], kept in %eax for the whole iteration

+	movl	C(r_refdef)+rd_vrectright,%edx

+	cmpl	%edx,%eax

+	jge		LNextVert

+	movl	fv_v+4(%ebx),%esi	// fv->v[1]

+	movl	C(r_refdef)+rd_vrectbottom,%edx

+	cmpl	%edx,%esi

+	jge		LNextVert

+//			zbuf = zspantable[fv->v[1]] + fv->v[0];

+	movl	C(zspantable)(,%esi,4),%edi

+//			z = fv->v[5]>>16;

+	movl	fv_v+20(%ebx),%edx

+	shrl	$16,%edx			// only the low 16 bits (%dx) are used below

+//			if (z >= *zbuf)

+//			{

+//				int		pix;

+	cmpw	(%edi,%eax,2),%dx	// 16-bit signed compare of z with *zbuf

+	jl		LNextVert

+//				*zbuf = z;

+	movw	%dx,(%edi,%eax,2)

+//				pix = skintable[fv->v[3]>>16][fv->v[2]>>16];

+	movl	fv_v+12(%ebx),%edi

+	shrl	$16,%edi

+	movl	C(skintable)(,%edi,4),%edi

+	movl	fv_v+8(%ebx),%edx

+	shrl	$16,%edx

+	movb	(%edi,%edx),%dl		// skin texel in %dl

+//				pix = ((byte *)acolormap)[pix + (fv->v[4] & 0xFF00)];

+	movl	fv_v+16(%ebx),%edi

+	andl	$0xFF00,%edi

+	andl	$0x00FF,%edx		// keep only the texel byte just loaded into %dl

+	addl	%edx,%edi

+	movl	C(acolormap),%edx

+	movb	(%edx,%edi,1),%dl

+//				d_viewbuffer[d_scantable[fv->v[1]] + fv->v[0]] = pix;

+	movl	C(d_scantable)(,%esi,4),%edi

+	movl	C(d_viewbuffer),%esi

+	addl	%eax,%edi

+	movb	%dl,(%esi,%edi)

+//			}

+//		}

+//	}

+LNextVert:

+	addl	$(fv_size),%ebx		// fv++

+	decl	%ecx				// NOTE(review): the count is tested only after the body, so numverts == 0 is not handled - confirm callers never pass 0

+	jnz		LFVLoop

+	popl	%edi

+	popl	%esi

+	popl	%ebx

+	popl	%ebp

+	ret

+//----------------------------------------------------------------------

+// Alias model non-subdivided polygon dispatching code: for each triangle with negative d_xdenom, copies its three vertices to r_p0/r_p1/r_p2, applies the seam fixup for back-facing triangles and calls D_PolysetSetEdgeTable and D_RasterizeAliasPolySmooth

+//

+// not C-callable because of stack buffer cleanup (entered by a jump from D_PolysetDraw; releases the SPAN_SIZE buffer reserved there before returning)

+//----------------------------------------------------------------------

+.globl C(D_DrawNonSubdiv)

+C(D_DrawNonSubdiv):

+	pushl	%ebp				// preserve caller stack frame pointer

+	movl	C(r_affinetridesc)+atd_numtriangles,%ebp

+	pushl	%ebx

+	shll	$(mtri_shift),%ebp	// byte offset just past the last triangle; NOTE(review): a count of 0 is not guarded against here

+	pushl	%esi				// preserve register variables

+	movl	C(r_affinetridesc)+atd_ptriangles,%esi

+	pushl	%edi

+//	mtriangle_t		*ptri;

+//	finalvert_t		*pfv, *index0, *index1, *index2;

+//	int				i;

+//	int				lnumtriangles;

+//	pfv = r_affinetridesc.pfinalverts;

+//	ptri = r_affinetridesc.ptriangles;

+//	lnumtriangles = r_affinetridesc.numtriangles;

+LNDLoop:

+//	for (i=0 ; i<lnumtriangles ; i++, ptri++)

+//	{

+//		index0 = pfv + ptri->vertindex[0];

+//		index1 = pfv + ptri->vertindex[1];

+//		index2 = pfv + ptri->vertindex[2];

+	movl	C(r_affinetridesc)+atd_pfinalverts,%edi

+	movl	mtri_vertindex+0-mtri_size(%esi,%ebp,1),%ecx	// %ebp counts down, so triangles are visited from the last to the first

+	shll	$(fv_shift),%ecx

+	movl	mtri_vertindex+4-mtri_size(%esi,%ebp,1),%edx

+	shll	$(fv_shift),%edx

+	movl	mtri_vertindex+8-mtri_size(%esi,%ebp,1),%ebx

+	shll	$(fv_shift),%ebx

+	addl	%edi,%ecx			// %ecx = index0

+	addl	%edi,%edx			// %edx = index1

+	addl	%edi,%ebx			// %ebx = index2

+//		d_xdenom = (index0->v[1]-index1->v[1]) *

+//				(index0->v[0]-index2->v[0]) -

+//				(index0->v[0]-index1->v[0])*(index0->v[1]-index2->v[1]);

+	movl	fv_v+4(%ecx),%eax

+	movl	fv_v+0(%ecx),%esi	// %esi (ptriangles) is overwritten here and reloaded at LNextTri

+	subl	fv_v+4(%edx),%eax	// index0->v[1] - index1->v[1]

+	subl	fv_v+0(%ebx),%esi	// index0->v[0] - index2->v[0]

+	imull	%esi,%eax

+	movl	fv_v+0(%ecx),%esi

+	movl	fv_v+4(%ecx),%edi

+	subl	fv_v+0(%edx),%esi	// index0->v[0] - index1->v[0]

+	subl	fv_v+4(%ebx),%edi	// index0->v[1] - index2->v[1]

+	imull	%esi,%edi

+	subl	%edi,%eax			// d_xdenom; sets the sign flag tested by the jns below

+//		if (d_xdenom >= 0)

+//		{

+//			continue;

+	jns		LNextTri

+//		}

+	movl	%eax,C(d_xdenom)

+	fildl	C(d_xdenom)			// load the integer d_xdenom onto the FPU stack

+//		r_p0[0] = index0->v[0];		// u

+//		r_p0[1] = index0->v[1];		// v

+//		r_p0[2] = index0->v[2];		// s

+//		r_p0[3] = index0->v[3];		// t

+//		r_p0[4] = index0->v[4];		// light

+//		r_p0[5] = index0->v[5];		// iz

+	movl	fv_v+0(%ecx),%eax

+	movl	fv_v+4(%ecx),%esi

+	movl	%eax,C(r_p0)+0

+	movl	%esi,C(r_p0)+4

+	movl	fv_v+8(%ecx),%eax

+	movl	fv_v+12(%ecx),%esi

+	movl	%eax,C(r_p0)+8

+	movl	%esi,C(r_p0)+12

+	movl	fv_v+16(%ecx),%eax

+	movl	fv_v+20(%ecx),%esi

+	movl	%eax,C(r_p0)+16

+	movl	%esi,C(r_p0)+20

+	fdivrs	float_1				// float_1 / d_xdenom; float_1 is defined outside this excerpt, presumably 1.0

+//		r_p1[0] = index1->v[0];

+//		r_p1[1] = index1->v[1];

+//		r_p1[2] = index1->v[2];

+//		r_p1[3] = index1->v[3];

+//		r_p1[4] = index1->v[4];

+//		r_p1[5] = index1->v[5];

+	movl	fv_v+0(%edx),%eax

+	movl	fv_v+4(%edx),%esi

+	movl	%eax,C(r_p1)+0

+	movl	%esi,C(r_p1)+4

+	movl	fv_v+8(%edx),%eax

+	movl	fv_v+12(%edx),%esi

+	movl	%eax,C(r_p1)+8

+	movl	%esi,C(r_p1)+12

+	movl	fv_v+16(%edx),%eax

+	movl	fv_v+20(%edx),%esi

+	movl	%eax,C(r_p1)+16

+	movl	%esi,C(r_p1)+20

+//		r_p2[0] = index2->v[0];

+//		r_p2[1] = index2->v[1];

+//		r_p2[2] = index2->v[2];

+//		r_p2[3] = index2->v[3];

+//		r_p2[4] = index2->v[4];

+//		r_p2[5] = index2->v[5];

+	movl	fv_v+0(%ebx),%eax

+	movl	fv_v+4(%ebx),%esi

+	movl	%eax,C(r_p2)+0

+	movl	%esi,C(r_p2)+4

+	movl	fv_v+8(%ebx),%eax

+	movl	fv_v+12(%ebx),%esi

+	movl	%eax,C(r_p2)+8

+	movl	%esi,C(r_p2)+12

+	movl	fv_v+16(%ebx),%eax

+	movl	fv_v+20(%ebx),%esi

+	movl	%eax,C(r_p2)+16

+	movl	C(r_affinetridesc)+atd_ptriangles,%edi

+	movl	%esi,C(r_p2)+20

+	movl	mtri_facesfront-mtri_size(%edi,%ebp,1),%eax

+//		if (!ptri->facesfront)

+//		{

+	testl	%eax,%eax

+	jnz		LFacesFront

+//			if (index0->flags & ALIAS_ONSEAM)

+//				r_p0[2] += r_affinetridesc.seamfixupX16;

+	movl	fv_flags(%ecx),%eax

+	movl	fv_flags(%edx),%esi

+	movl	fv_flags(%ebx),%edi

+	testl	$(ALIAS_ONSEAM),%eax

+	movl	C(r_affinetridesc)+atd_seamfixupX16,%eax	// movl leaves the flags of the testl above intact

+	jz		LOnseamDone0

+	addl	%eax,C(r_p0)+8

+LOnseamDone0:

+//			if (index1->flags & ALIAS_ONSEAM)

+// 				r_p1[2] += r_affinetridesc.seamfixupX16;

+	testl	$(ALIAS_ONSEAM),%esi

+	jz		LOnseamDone1

+	addl	%eax,C(r_p1)+8

+LOnseamDone1:

+//			if (index2->flags & ALIAS_ONSEAM)

+//				r_p2[2] += r_affinetridesc.seamfixupX16;

+	testl	$(ALIAS_ONSEAM),%edi

+	jz		LOnseamDone2

+	addl	%eax,C(r_p2)+8

+LOnseamDone2:

+//		}

+LFacesFront:

+	fstps	C(d_xdenom)			// d_xdenom now holds the single-precision quotient produced by the fdivrs above

+//		D_PolysetSetEdgeTable ();

+//		D_RasterizeAliasPolySmooth ();

+		call	C(D_PolysetSetEdgeTable)

+		call	C(D_RasterizeAliasPolySmooth)

+LNextTri:

+		movl	C(r_affinetridesc)+atd_ptriangles,%esi	// restore %esi, which the loop body overwrites

+		subl	$(mtri_size),%ebp	// step back one triangle

+		jnz		LNDLoop

+//	}

+	popl	%edi

+	popl	%esi

+	popl	%ebx

+	popl	%ebp

+	addl	$(SPAN_SIZE),%esp	// release the spans buffer reserved by D_PolysetDraw

+	ret

+#endif	// id386

--- /dev/null

+++ b/u/d_scana.s

@@ -1,0 +1,70 @@

+//

+// d_scana.s

+// x86 assembly-language turbulent texture mapping code

+//

+#include "asm_i386.h"

+#include "quakeasm.h"

+#include "asm_draw.h"

+#include "d_ifacea.h"

+#ifdef id386

+	.data

+	.text

+//----------------------------------------------------------------------

+// turbulent texture mapping code: D_DrawTurbulent8Span takes no stack arguments; it reads r_turb_s, r_turb_t, r_turb_pdest and r_turb_spancount, writes r_turb_spancount pixels and stores the advanced destination back to r_turb_pdest

+//----------------------------------------------------------------------

+	.align 4

+.globl C(D_DrawTurbulent8Span)

+C(D_DrawTurbulent8Span):

+	pushl	%ebp				// preserve caller's stack frame pointer

+	pushl	%esi				// preserve register variables

+	pushl	%edi

+	pushl	%ebx

+	movl	C(r_turb_s),%esi	// %esi = s

+	movl	C(r_turb_t),%ecx	// %ecx = t

+	movl	C(r_turb_pdest),%edi	// %edi = destination pointer

+	movl	C(r_turb_spancount),%ebx	// %ebx = pixels left; NOTE(review): tested only after the body, so 0 is not handled - confirm callers

+Llp:

+	movl	%ecx,%eax			// t

+	movl	%esi,%edx			// s

+	sarl	$16,%eax			// t >> 16

+	movl	C(r_turb_turb),%ebp	// base of a table of 4-byte entries

+	sarl	$16,%edx			// s >> 16

+	andl	$(CYCLE-1),%eax		// wrap the table index

+	andl	$(CYCLE-1),%edx

+	movl	(%ebp,%eax,4),%eax	// table entry selected by t

+	movl	(%ebp,%edx,4),%edx	// table entry selected by s

+	addl	%esi,%eax			// s + entry selected by t

+	sarl	$16,%eax

+	addl	%ecx,%edx			// t + entry selected by s

+	sarl	$16,%edx

+	andl	$(TURB_TEX_SIZE-1),%eax	// wrapped column

+	andl	$(TURB_TEX_SIZE-1),%edx	// wrapped row

+	shll	$6,%edx				// row * 64; NOTE(review): hard-coded stride, presumably TURB_TEX_SIZE is 64 - confirm

+	movl	C(r_turb_pbase),%ebp

+	addl	%eax,%edx			// texel offset = row * 64 + column

+	incl	%edi				// advance the destination early; the store below uses -1(%edi)

+	addl	C(r_turb_sstep),%esi

+	addl	C(r_turb_tstep),%ecx

+	movb	(%ebp,%edx,1),%dl	// fetch the texel

+	decl	%ebx				// sets the zero flag tested by the jnz below

+	movb	%dl,-1(%edi)		// movb leaves the flags intact

+	jnz		Llp

+	movl	%edi,C(r_turb_pdest)

+	popl	%ebx				// restore register variables

+	popl	%edi

+	popl	%esi

+	popl	%ebp				// restore caller's stack frame pointer

+	ret

+#endif	// id386

--- /dev/null

+++ b/u/d_spr8.s

@@ -1,0 +1,881 @@

+//

+// d_spr8.s

+// x86 assembly-language horizontal 8-bpp transparent span-drawing code.

+//

+#include "asm_i386.h"

+#include "quakeasm.h"

+#include "asm_draw.h"

+#ifdef id386

+//----------------------------------------------------------------------

+// 8-bpp horizontal span drawing code for polygons, with transparency.

+//----------------------------------------------------------------------

+	.text

+// out-of-line, rarely-needed clamping code

+LClampHigh0:					// initial s is past bbextents: clamp it down

+	movl	C(bbextents),%esi

+	jmp		LClampReentry0

+LClampHighOrLow0:				// entered by ja after cmpl %ebx,%esi; those flags are still live

+	jg		LClampHigh0			// signed greater: s really is past bbextents

+	xorl	%esi,%esi			// otherwise s is negative (it only looked large unsigned): clamp to 0

+	jmp		LClampReentry0

+LClampHigh1:					// initial t is past bbextentt: clamp it down

+	movl	C(bbextentt),%edx

+	jmp		LClampReentry1

+LClampHighOrLow1:				// entered by ja after cmpl %ebp,%edx; same signed/unsigned split as for s

+	jg		LClampHigh1

+	xorl	%edx,%edx			// negative t: clamp to 0

+	jmp		LClampReentry1

+LClampLow2:						// full segment, end-of-segment s (%ebp) below 2048: raise to 2048

+	movl	$2048,%ebp

+	jmp		LClampReentry2

+LClampHigh2:					// full segment, end-of-segment s above bbextents

+	movl	C(bbextents),%ebp

+	jmp		LClampReentry2

+LClampLow3:						// full segment, end-of-segment t (%ecx) below 2048

+	movl	$2048,%ecx

+	jmp		LClampReentry3

+LClampHigh3:					// full segment, end-of-segment t above bbextentt

+	movl	C(bbextentt),%ecx

+	jmp		LClampReentry3

+LClampLow4:						// last segment, end s (%eax) below 2048

+	movl	$2048,%eax

+	jmp		LClampReentry4

+LClampHigh4:					// last segment, end s above bbextents

+	movl	C(bbextents),%eax

+	jmp		LClampReentry4

+LClampLow5:						// last segment, end t (%ebx) below 2048

+	movl	$2048,%ebx

+	jmp		LClampReentry5

+LClampHigh5:					// last segment, end t above bbextentt

+	movl	C(bbextentt),%ebx

+	jmp		LClampReentry5

+#define pspans	4+16	// %esp offset of the pspans argument once the four registers below are pushed: return address (4) + saves (16)

+	.align 4

+.globl C(D_SpriteDrawSpans)

+C(D_SpriteDrawSpans):	// one stack argument (pspans): pointer to the first span descriptor (sspan_t layout)

+	pushl	%ebp				// preserve caller's stack frame

+	pushl	%edi

+	pushl	%esi				// preserve register variables

+	pushl	%ebx

+//

+// set up scaled-by-8 steps, for 8-long segments; also set up cacheblock

+// and span list pointers, and 1/z step in 0.32 fixed-point

+//

+// FIXME: any overlap from rearranging?

+	flds	C(d_sdivzstepu)

+	fmuls	fp_8				// sdivz*8

+	movl	C(cacheblock),%edx

+	flds	C(d_tdivzstepu)

+	fmuls	fp_8				// tdivz*8 | sdivz*8

+	movl	pspans(%esp),%ebx	// point to the first span descriptor

+	flds	C(d_zistepu)

+	fmuls	fp_8				// zi*8 | tdivz*8 | sdivz*8

+	movl	%edx,pbase			// pbase = cacheblock

+	flds	C(d_zistepu)

+	fmuls	fp_64kx64k			// zi scaled for the integer 1/z step | zi*8 | tdivz*8 | sdivz*8

+	fxch	%st(3)				// sdivz*8 | zi*8 | tdivz*8 | scaled zi

+	fstps	sdivz8stepu

+	fstps	zi8stepu

+	fstps	tdivz8stepu

+	fistpl	izistep				// integer 1/z step; FP stack is empty again

+	movl	izistep,%eax

+	rorl	$16,%eax		// put upper 16 bits in low word

+	movl	sspan_t_count(%ebx),%ecx

+	movl	%eax,izistep		// stored rotated so the per-pixel addl/adcl pair can carry from the high half into the low word

+	cmpl	$0,%ecx

+	jle		LNextSpan			// first span has count <= 0: go straight to the span-advance code

+LSpanLoop:

+//

+// set up the initial s/z, t/z, and 1/z on the FP stack, and generate the

+// initial s and t values

+//

+// FIXME: pipeline FILD?

+	fildl	sspan_t_v(%ebx)

+	fildl	sspan_t_u(%ebx)

+	fld		%st(1)			// dv | du | dv

+	fmuls	C(d_sdivzstepv)	// dv*d_sdivzstepv | du | dv

+	fld		%st(1)			// du | dv*d_sdivzstepv | du | dv

+	fmuls	C(d_sdivzstepu)	// du*d_sdivzstepu | dv*d_sdivzstepv | du | dv

+	fld		%st(2)			// du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv

+	fmuls	C(d_tdivzstepu)	// du*d_tdivzstepu | du*d_sdivzstepu |

+							//  dv*d_sdivzstepv | du | dv

+	fxch	%st(1)			// du*d_sdivzstepu | du*d_tdivzstepu |

+							//  dv*d_sdivzstepv | du | dv

+	faddp	%st(0),%st(2)	// du*d_tdivzstepu |

+							//  du*d_sdivzstepu + dv*d_sdivzstepv | du | dv

+	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |

+							//  du*d_tdivzstepu | du | dv

+	fld		%st(3)			// dv | du*d_sdivzstepu + dv*d_sdivzstepv |

+							//  du*d_tdivzstepu | du | dv

+	fmuls	C(d_tdivzstepv)	// dv*d_tdivzstepv |

+							//  du*d_sdivzstepu + dv*d_sdivzstepv |

+							//  du*d_tdivzstepu | du | dv

+	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |

+							//  dv*d_tdivzstepv | du*d_tdivzstepu | du | dv

+	fadds	C(d_sdivzorigin) // sdivz = d_sdivzorigin + dv*d_sdivzstepv +

+							//  du*d_sdivzstepu; stays in %st(2) at end

+	fxch	%st(4)			// dv | dv*d_tdivzstepv | du*d_tdivzstepu | du |

+							//  s/z

+	fmuls	C(d_zistepv)		// dv*d_zistepv | dv*d_tdivzstepv |

+							//  du*d_tdivzstepu | du | s/z

+	fxch	%st(1)			// dv*d_tdivzstepv |  dv*d_zistepv |

+							//  du*d_tdivzstepu | du | s/z

+	faddp	%st(0),%st(2)	// dv*d_zistepv |

+							//  dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z

+	fxch	%st(2)			// du | dv*d_tdivzstepv + du*d_tdivzstepu |

+							//  dv*d_zistepv | s/z

+	fmuls	C(d_zistepu)		// du*d_zistepu |

+							//  dv*d_tdivzstepv + du*d_tdivzstepu |

+							//  dv*d_zistepv | s/z

+	fxch	%st(1)			// dv*d_tdivzstepv + du*d_tdivzstepu |

+							//  du*d_zistepu | dv*d_zistepv | s/z

+	fadds	C(d_tdivzorigin)	// tdivz = d_tdivzorigin + dv*d_tdivzstepv +

+							//  du*d_tdivzstepu; stays in %st(1) at end

+	fxch	%st(2)			// dv*d_zistepv | du*d_zistepu | t/z | s/z

+	faddp	%st(0),%st(1)	// dv*d_zistepv + du*d_zistepu | t/z | s/z

+	flds	fp_64k			// fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z

+	fxch	%st(1)			// dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z

+	fadds	C(d_ziorigin)		// zi = d_ziorigin + dv*d_zistepv +

+							//  du*d_zistepu; stays in %st(0) at end

+							// 1/z | fp_64k | t/z | s/z

+	fld		%st(0)			// FIXME: get rid of stall on FMUL?

+	fmuls	fp_64kx64k

+	fxch	%st(1)

+//

+// calculate and clamp s & t

+//

+	fdivr	%st(0),%st(2)	// 1/z | z*64k | t/z | s/z

+	fxch	%st(1)

+	fistpl	izi				// 0.32 fixed-point 1/z

+	movl	izi,%ebp

+//

+// set pz to point to the first z-buffer pixel in the span

+//

+	rorl	$16,%ebp		// put upper 16 bits in low word

+	movl	sspan_t_v(%ebx),%eax

+	movl	%ebp,izi

+	movl	sspan_t_u(%ebx),%ebp

+	imull	C(d_zrowbytes)

+	shll	$1,%ebp					// a word per pixel

+	addl	C(d_pzbuffer),%eax

+	addl	%ebp,%eax

+	movl	%eax,pz

+//

+// point %edi to the first pixel in the span

+//

+	movl	C(d_viewbuffer),%ebp

+	movl	sspan_t_v(%ebx),%eax

+	pushl	%ebx		// preserve spans pointer

+	movl	C(tadjust),%edx

+	movl	C(sadjust),%esi

+	movl	C(d_scantable)(,%eax,4),%edi	// v * screenwidth

+	addl	%ebp,%edi

+	movl	sspan_t_u(%ebx),%ebp

+	addl	%ebp,%edi				// pdest = &pdestspan[scans->u];

+//

+// now start the FDIV for the end of the span

+//

+	cmpl	$8,%ecx

+	ja		LSetupNotLast1

+	decl	%ecx

+	jz		LCleanup1		// if only one pixel, no need to start an FDIV

+	movl	%ecx,spancountminus1

+// finish up the s and t calcs

+	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

+	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z

+	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z

+	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z

+	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z

+	fxch	%st(1)			// s | t | 1/z | t/z | s/z

+	fistpl	s				// 1/z | t | t/z | s/z

+	fistpl	t				// 1/z | t/z | s/z

+	fildl	spancountminus1

+	flds	C(d_tdivzstepu)	// _d_tdivzstepu | spancountminus1

+	flds	C(d_zistepu)	// _d_zistepu | _d_tdivzstepu | spancountminus1

+	fmul	%st(2),%st(0)	// _d_zistepu*scm1 | _d_tdivzstepu | scm1

+	fxch	%st(1)			// _d_tdivzstepu | _d_zistepu*scm1 | scm1

+	fmul	%st(2),%st(0)	// _d_tdivzstepu*scm1 | _d_zistepu*scm1 | scm1

+	fxch	%st(2)			// scm1 | _d_zistepu*scm1 | _d_tdivzstepu*scm1

+	fmuls	C(d_sdivzstepu)	// _d_sdivzstepu*scm1 | _d_zistepu*scm1 |

+							//  _d_tdivzstepu*scm1

+	fxch	%st(1)			// _d_zistepu*scm1 | _d_sdivzstepu*scm1 |

+							//  _d_tdivzstepu*scm1

+	faddp	%st(0),%st(3)	// _d_sdivzstepu*scm1 | _d_tdivzstepu*scm1

+	fxch	%st(1)			// _d_tdivzstepu*scm1 | _d_sdivzstepu*scm1

+	faddp	%st(0),%st(3)	// _d_sdivzstepu*scm1

+	faddp	%st(0),%st(3)

+	flds	fp_64k

+	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to

+							//  overlap

+	jmp		LFDIVInFlight1

+LCleanup1:

+// finish up the s and t calcs

+	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

+	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z

+	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z

+	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z

+	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z

+	fxch	%st(1)			// s | t | 1/z | t/z | s/z

+	fistpl	s				// 1/z | t | t/z | s/z

+	fistpl	t				// 1/z | t/z | s/z

+	jmp		LFDIVInFlight1

+	.align	4

+LSetupNotLast1:

+// finish up the s and t calcs

+	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

+	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z

+	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z

+	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z

+	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z

+	fxch	%st(1)			// s | t | 1/z | t/z | s/z

+	fistpl	s				// 1/z | t | t/z | s/z

+	fistpl	t				// 1/z | t/z | s/z

+	fadds	zi8stepu

+	fxch	%st(2)

+	fadds	sdivz8stepu

+	fxch	%st(2)

+	flds	tdivz8stepu

+	faddp	%st(0),%st(2)

+	flds	fp_64k

+	fdiv	%st(1),%st(0)	// z = 1/1/z

+							// this is what we've gone to all this trouble to

+							//  overlap

+LFDIVInFlight1:					// integer work here overlaps the FDIV started just above (if one was started)

+	addl	s,%esi					// %esi = sadjust + s

+	addl	t,%edx					// %edx = tadjust + t

+	movl	C(bbextents),%ebx

+	movl	C(bbextentt),%ebp		// %ebp now holds bbextentt, no longer anything z-related

+	cmpl	%ebx,%esi

+	ja		LClampHighOrLow0		// unsigned test catches both s > bbextents and negative s

+LClampReentry0:

+	movl	%esi,s					// save the clamped starting s

+	movl	pbase,%ebx

+	shll	$16,%esi				// fractional 16 bits of s, left-justified

+	cmpl	%ebp,%edx

+	movl	%esi,sfracf				// movl does not touch the flags the ja below consumes

+	ja		LClampHighOrLow1

+LClampReentry1:

+	movl	%edx,t					// save the clamped starting t

+	movl	s,%esi					// sfrac = scans->sfrac;

+	shll	$16,%edx				// fractional 16 bits of t, left-justified

+	movl	t,%eax					// tfrac = scans->tfrac;

+	sarl	$16,%esi				// integer part of s

+	movl	%edx,tfracf

+//

+// calculate the texture starting address

+//

+	sarl	$16,%eax				// integer part of t

+	addl	%ebx,%esi				// pbase + (sfrac >> 16)

+	imull	C(cachewidth),%eax		// (tfrac >> 16) * cachewidth

+	addl	%eax,%esi				// psource = pbase + (sfrac >> 16) +

+									//           ((tfrac >> 16) * cachewidth);

+//

+// determine whether last span or not

+//

+	cmpl	$8,%ecx					// %ecx was already decremented when the whole span is 8 pixels or fewer

+	jna		LLastSegment

+//

+// not the last segment; do full 8-wide segment

+//

+LNotLastSegment:

+//

+// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to

+// get there

+//

+// pick up after the FDIV that was left in flight previously

+	fld		%st(0)			// duplicate it

+	fmul	%st(4),%st(0)	// s = s/z * z

+	fxch	%st(1)

+	fmul	%st(3),%st(0)	// t = t/z * z

+	fxch	%st(1)

+	fistpl	snext

+	fistpl	tnext			// FP stack is back to 1/z | t/z | s/z

+	movl	snext,%eax

+	movl	tnext,%edx

+	subl	$8,%ecx		// count off this segment's pixels

+	movl	C(sadjust),%ebp

+	pushl	%ecx		// remember count of remaining pixels

+	movl	C(tadjust),%ecx

+	addl	%eax,%ebp		// %ebp = sadjust + snext

+	addl	%edx,%ecx		// %ecx = tadjust + tnext

+	movl	C(bbextents),%eax

+	movl	C(bbextentt),%edx

+	cmpl	$2048,%ebp		// clamp end-of-segment s to [2048, bbextents]

+	jl		LClampLow2

+	cmpl	%eax,%ebp

+	ja		LClampHigh2

+LClampReentry2:

+	cmpl	$2048,%ecx		// clamp end-of-segment t to [2048, bbextentt]

+	jl		LClampLow3

+	cmpl	%edx,%ecx

+	ja		LClampHigh3

+LClampReentry3:

+	movl	%ebp,snext

+	movl	%ecx,tnext

+	subl	s,%ebp			// %ebp = s delta across the 8-pixel segment

+	subl	t,%ecx			// %ecx = t delta across the 8-pixel segment

+//

+// set up advancetable

+//

+	movl	%ecx,%eax

+	movl	%ebp,%edx

+	sarl	$19,%edx			// integer per-pixel sstep: >> 16, plus >> 3 to divide the 8-pixel delta by 8

+	movl	C(cachewidth),%ebx

+	sarl	$19,%eax			// integer per-pixel tstep, same 16 + 3 shift

+	jz		LIsZero				// skip the multiply when the integer tstep is 0

+	imull	%ebx,%eax			// (tstep >> 16) * cachewidth;

+LIsZero:

+	addl	%edx,%eax			// add in sstep

+								// (tstep >> 16) * cachewidth + (sstep >> 16);

+	movl	tfracf,%edx

+	movl	%eax,advancetable+4	// advance base in t

+	addl	%ebx,%eax			// ((tstep >> 16) + 1) * cachewidth +

+								//  (sstep >> 16);

+	shll	$13,%ebp			// left-justify sstep fractional part (16 - 3: the delta covers 8 pixels)

+	movl	%ebp,sstep

+	movl	sfracf,%ebx

+	shll	$13,%ecx			// left-justify tstep fractional part

+	movl	%eax,advancetable	// advance extra in t

+	movl	%ecx,tstep

+	movl	pz,%ecx				// %ecx = z-buffer pointer for this segment

+	movl	izi,%ebp			// %ebp = rotated 1/z: integer word in %bp

+	cmpw	(%ecx),%bp			// z test: skip the pixel when 1/z is below the stored z word (signed)

+	jl		Lp1

+	movb	(%esi),%al			// get first source texel

+	cmpb	$(TRANSPARENT_COLOR),%al

+	jz		Lp1					// transparent texel: leave both z and colour untouched

+	movw	%bp,(%ecx)

+	movb	%al,(%edi)			// store first dest pixel

+Lp1:

+	addl	izistep,%ebp		// step rotated 1/z ...

+	adcl	$0,%ebp				//  ... wrapping the carry out of the top back into the low (integer) word

+	addl	tstep,%edx			// advance tfrac fractional part by tstep frac

+	sbbl	%eax,%eax			// turn tstep carry into -1 (0 if none)

+	addl	sstep,%ebx			// advance sfrac fractional part by sstep frac

+	adcl	advancetable+4(,%eax,4),%esi	// point to next source texel

+	cmpw	2(%ecx),%bp

+	jl		Lp2

+	movb	(%esi),%al

+	cmpb	$(TRANSPARENT_COLOR),%al

+	jz		Lp2

+	movw	%bp,2(%ecx)

+	movb	%al,1(%edi)

+Lp2:

+	addl	izistep,%ebp

+	adcl	$0,%ebp

+	addl	tstep,%edx

+	sbbl	%eax,%eax

+	addl	sstep,%ebx

+	adcl	advancetable+4(,%eax,4),%esi

+	cmpw	4(%ecx),%bp

+	jl		Lp3

+	movb	(%esi),%al

+	cmpb	$(TRANSPARENT_COLOR),%al

+	jz		Lp3

+	movw	%bp,4(%ecx)

+	movb	%al,2(%edi)

+Lp3:

+	addl	izistep,%ebp

+	adcl	$0,%ebp

+	addl	tstep,%edx

+	sbbl	%eax,%eax

+	addl	sstep,%ebx

+	adcl	advancetable+4(,%eax,4),%esi

+	cmpw	6(%ecx),%bp

+	jl		Lp4

+	movb	(%esi),%al

+	cmpb	$(TRANSPARENT_COLOR),%al

+	jz		Lp4

+	movw	%bp,6(%ecx)

+	movb	%al,3(%edi)

+Lp4:

+	addl	izistep,%ebp

+	adcl	$0,%ebp

+	addl	tstep,%edx

+	sbbl	%eax,%eax

+	addl	sstep,%ebx

+	adcl	advancetable+4(,%eax,4),%esi

+	cmpw	8(%ecx),%bp

+	jl		Lp5

+	movb	(%esi),%al

+	cmpb	$(TRANSPARENT_COLOR),%al

+	jz		Lp5

+	movw	%bp,8(%ecx)

+	movb	%al,4(%edi)

+Lp5:

+	addl	izistep,%ebp

+	adcl	$0,%ebp

+	addl	tstep,%edx

+	sbbl	%eax,%eax

+	addl	sstep,%ebx

+	adcl	advancetable+4(,%eax,4),%esi

+//

+// start FDIV for end of next segment in flight, so it can overlap

+//

+	popl	%eax

+	cmpl	$8,%eax			// more than one segment after this?

+	ja		LSetupNotLast2	// yes

+	decl	%eax

+	jz		LFDIVInFlight2	// if only one pixel, no need to start an FDIV

+	movl	%eax,spancountminus1

+	fildl	spancountminus1

+	flds	C(d_zistepu)		// _d_zistepu | spancountminus1

+	fmul	%st(1),%st(0)	// _d_zistepu*scm1 | scm1

+	flds	C(d_tdivzstepu)	// _d_tdivzstepu | _d_zistepu*scm1 | scm1

+	fmul	%st(2),%st(0)	// _d_tdivzstepu*scm1 | _d_zistepu*scm1 | scm1

+	fxch	%st(1)			// _d_zistepu*scm1 | _d_tdivzstepu*scm1 | scm1

+	faddp	%st(0),%st(3)	// _d_tdivzstepu*scm1 | scm1

+	fxch	%st(1)			// scm1 | _d_tdivzstepu*scm1

+	fmuls	C(d_sdivzstepu)	// _d_sdivzstepu*scm1 | _d_tdivzstepu*scm1

+	fxch	%st(1)			// _d_tdivzstepu*scm1 | _d_sdivzstepu*scm1

+	faddp	%st(0),%st(3)	// _d_sdivzstepu*scm1

+	flds	fp_64k			// 64k | _d_sdivzstepu*scm1

+	fxch	%st(1)			// _d_sdivzstepu*scm1 | 64k

+	faddp	%st(0),%st(4)	// 64k

+	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to

+							//  overlap

+	jmp		LFDIVInFlight2

+	.align	4

+LSetupNotLast2:

+	fadds	zi8stepu

+	fxch	%st(2)

+	fadds	sdivz8stepu

+	fxch	%st(2)

+	flds	tdivz8stepu

+	faddp	%st(0),%st(2)

+	flds	fp_64k

+	fdiv	%st(1),%st(0)	// z = 1/1/z

+							// this is what we've gone to all this trouble to

+							//  overlap

+LFDIVInFlight2:

+	pushl	%eax

+	cmpw	10(%ecx),%bp

+	jl		Lp6

+	movb	(%esi),%al

+	cmpb	$(TRANSPARENT_COLOR),%al

+	jz		Lp6

+	movw	%bp,10(%ecx)

+	movb	%al,5(%edi)

+Lp6:

+	addl	izistep,%ebp

+	adcl	$0,%ebp

+	addl	tstep,%edx

+	sbbl	%eax,%eax

+	addl	sstep,%ebx

+	adcl	advancetable+4(,%eax,4),%esi

+	cmpw	12(%ecx),%bp

+	jl		Lp7

+	movb	(%esi),%al

+	cmpb	$(TRANSPARENT_COLOR),%al

+	jz		Lp7

+	movw	%bp,12(%ecx)

+	movb	%al,6(%edi)

+Lp7:

+	addl	izistep,%ebp

+	adcl	$0,%ebp

+	addl	tstep,%edx

+	sbbl	%eax,%eax

+	addl	sstep,%ebx

+	adcl	advancetable+4(,%eax,4),%esi

+	cmpw	14(%ecx),%bp

+	jl		Lp8

+	movb	(%esi),%al

+	cmpb	$(TRANSPARENT_COLOR),%al

+	jz		Lp8

+	movw	%bp,14(%ecx)

+	movb	%al,7(%edi)

+Lp8:

+	addl	izistep,%ebp

+	adcl	$0,%ebp

+	addl	tstep,%edx

+	sbbl	%eax,%eax

+	addl	sstep,%ebx

+	adcl	advancetable+4(,%eax,4),%esi

+	addl	$8,%edi

+	addl	$16,%ecx

+	movl	%edx,tfracf

+	movl	snext,%edx

+	movl	%ebx,sfracf

+	movl	tnext,%ebx

+	movl	%edx,s

+	movl	%ebx,t

+	movl	%ecx,pz

+	movl	%ebp,izi

+	popl	%ecx				// retrieve count

+//

+// determine whether last span or not

+//

+	cmpl	$8,%ecx				// are there multiple segments remaining?

+	ja		LNotLastSegment		// yes

+//

+// last segment of scan

+//

+LLastSegment:

+//

+// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to

+// get there. The number of pixels left is variable, and we want to land on the

+// last pixel, not step one past it, so we can't run into arithmetic problems

+//

+	testl	%ecx,%ecx		// %ecx = pixels left minus one, i.e. the number of steps

+	jz		LNoSteps		// just draw the last pixel and we're done

+// pick up after the FDIV that was left in flight previously

+	fld		%st(0)			// duplicate it

+	fmul	%st(4),%st(0)	// s = s/z * z

+	fxch	%st(1)

+	fmul	%st(3),%st(0)	// t = t/z * z

+	fxch	%st(1)

+	fistpl	snext

+	fistpl	tnext			// FP stack is back to 1/z | t/z | s/z

+	movl	C(tadjust),%ebx

+	movl	C(sadjust),%eax

+	addl	snext,%eax		// %eax = sadjust + snext

+	addl	tnext,%ebx		// %ebx = tadjust + tnext

+	movl	C(bbextents),%ebp

+	movl	C(bbextentt),%edx

+	cmpl	$2048,%eax		// clamp end s to [2048, bbextents]

+	jl		LClampLow4

+	cmpl	%ebp,%eax

+	ja		LClampHigh4

+LClampReentry4:

+	movl	%eax,snext

+	cmpl	$2048,%ebx		// clamp end t to [2048, bbextentt]

+	jl		LClampLow5

+	cmpl	%edx,%ebx

+	ja		LClampHigh5

+LClampReentry5:

+	cmpl	$1,%ecx			// don't bother

+	je		LOnlyOneStep	// if two pixels in segment, there's only one step,

+							//  of the segment length

+	subl	s,%eax

+	subl	t,%ebx

+	addl	%eax,%eax		// convert to 15.17 format so multiply by 1.31

+	addl	%ebx,%ebx		//  reciprocal yields 16.48

+	imull	reciprocal_table-8(,%ecx,4) // sstep = (snext - s) / (spancount-1)

+	movl	%edx,%ebp		// keep the high dword of the product as sstep

+	movl	%ebx,%eax

+	imull	reciprocal_table-8(,%ecx,4) // tstep = (tnext - t) / (spancount-1)

+LSetEntryvec:				// here %ebp = sstep, %edx = tstep, %ecx = step count (1..7)

+//

+// set up advancetable

+//

+	movl	spr8entryvec_table(,%ecx,4),%ebx	// handler for this many steps

+	movl	%edx,%eax			// keep a copy of tstep for its fractional part

+	pushl	%ebx				// entry point into code for RET later

+	movl	%ebp,%ecx

+	sarl	$16,%ecx			// sstep >>= 16;

+	movl	C(cachewidth),%ebx

+	sarl	$16,%edx			// tstep >>= 16;

+	jz		LIsZeroLast			// skip the multiply when the integer tstep is 0

+	imull	%ebx,%edx			// (tstep >> 16) * cachewidth;

+LIsZeroLast:

+	addl	%ecx,%edx			// add in sstep

+								// (tstep >> 16) * cachewidth + (sstep >> 16);

+	movl	tfracf,%ecx

+	movl	%edx,advancetable+4	// advance base in t

+	addl	%ebx,%edx			// ((tstep >> 16) + 1) * cachewidth +

+								//  (sstep >> 16);

+	shll	$16,%ebp			// left-justify sstep fractional part

+	movl	sfracf,%ebx

+	shll	$16,%eax			// left-justify tstep fractional part

+	movl	%edx,advancetable	// advance extra in t

+	movl	%eax,tstep

+	movl	%ebp,sstep

+	movl	%ecx,%edx			// %edx = tfracf, as the pixel code expects

+	movl	pz,%ecx				// %ecx = z-buffer pointer

+	movl	izi,%ebp			// %ebp = rotated 1/z for the z tests

+	ret							// jump to the number-of-pixels handler

+//----------------------------------------

+LNoSteps:					// exactly one pixel left: draw it through LEndSpan

+	movl	pz,%ecx			// %ecx = z-buffer pointer for the pixel

+	movl	izi,%ebp		// reload rotated 1/z: a one-pixel span gets here with %ebp still holding bbextentt

+	subl	$7,%edi			// adjust for hardwired offset

+	subl	$14,%ecx		// LEndSpan addresses 14(%ecx), so bias the z pointer to match

+	jmp		LEndSpan

+LOnlyOneStep:				// two pixels left: the single step is the whole delta, no reciprocal multiply needed

+	subl	s,%eax

+	subl	t,%ebx

+	movl	%eax,%ebp		// sstep = snext - s

+	movl	%ebx,%edx		// tstep = tnext - t

+	jmp		LSetEntryvec

+//----------------------------------------

+.globl	Spr8Entry2_8

+Spr8Entry2_8:				// 2 pixels: joins the unrolled code at its 7th slot (offsets 6 and 12)

+	subl	$6,%edi		// adjust for hardwired offsets

+	subl	$12,%ecx		// z-buffer entries are 2 bytes, so twice the pixel bias

+	movb	(%esi),%al		// NOTE(review): appears redundant -- LLEntry2_8 reloads the texel after its z test

+	jmp		LLEntry2_8

+//----------------------------------------

+.globl	Spr8Entry3_8

+Spr8Entry3_8:				// 3 pixels: joins at the 6th slot

+	subl	$5,%edi		// adjust for hardwired offsets

+	subl	$10,%ecx

+	jmp		LLEntry3_8

+//----------------------------------------

+.globl	Spr8Entry4_8

+Spr8Entry4_8:				// 4 pixels: joins at the 5th slot

+	subl	$4,%edi		// adjust for hardwired offsets

+	subl	$8,%ecx

+	jmp		LLEntry4_8

+//----------------------------------------

+.globl	Spr8Entry5_8

+Spr8Entry5_8:				// 5 pixels: joins at the 4th slot

+	subl	$3,%edi		// adjust for hardwired offsets

+	subl	$6,%ecx

+	jmp		LLEntry5_8

+//----------------------------------------

+.globl	Spr8Entry6_8

+Spr8Entry6_8:				// 6 pixels: joins at the 3rd slot

+	subl	$2,%edi		// adjust for hardwired offsets

+	subl	$4,%ecx

+	jmp		LLEntry6_8

+//----------------------------------------

+.globl	Spr8Entry7_8

+Spr8Entry7_8:				// 7 pixels: joins at the 2nd slot

+	decl	%edi		// adjust for hardwired offsets

+	subl	$2,%ecx

+	jmp		LLEntry7_8

+//----------------------------------------

+.globl	Spr8Entry8_8

+Spr8Entry8_8:

+	cmpw	(%ecx),%bp

+	jl		Lp9

+	movb	(%esi),%al

+	cmpb	$(TRANSPARENT_COLOR),%al

+	jz		Lp9

+	movw	%bp,(%ecx)

+	movb	%al,(%edi)

+Lp9:

+	addl	izistep,%ebp

+	adcl	$0,%ebp

+	addl	tstep,%edx

+	sbbl	%eax,%eax

+	addl	sstep,%ebx

+	adcl	advancetable+4(,%eax,4),%esi

+LLEntry7_8:

+	cmpw	2(%ecx),%bp

+	jl		Lp10

+	movb	(%esi),%al

+	cmpb	$(TRANSPARENT_COLOR),%al

+	jz		Lp10

+	movw	%bp,2(%ecx)

+	movb	%al,1(%edi)

+Lp10:

+	addl	izistep,%ebp

+	adcl	$0,%ebp

+	addl	tstep,%edx

+	sbbl	%eax,%eax

+	addl	sstep,%ebx

+	adcl	advancetable+4(,%eax,4),%esi

+LLEntry6_8:

+	cmpw	4(%ecx),%bp

+	jl		Lp11

+	movb	(%esi),%al

+	cmpb	$(TRANSPARENT_COLOR),%al

+	jz		Lp11

+	movw	%bp,4(%ecx)

+	movb	%al,2(%edi)

+Lp11:

+	addl	izistep,%ebp

+	adcl	$0,%ebp

+	addl	tstep,%edx

+	sbbl	%eax,%eax

+	addl	sstep,%ebx

+	adcl	advancetable+4(,%eax,4),%esi

+LLEntry5_8:

+	cmpw	6(%ecx),%bp

+	jl		Lp12

+	movb	(%esi),%al

+	cmpb	$(TRANSPARENT_COLOR),%al

+	jz		Lp12

+	movw	%bp,6(%ecx)

+	movb	%al,3(%edi)

+Lp12:

+	addl	izistep,%ebp

+	adcl	$0,%ebp

+	addl	tstep,%edx

+	sbbl	%eax,%eax

+	addl	sstep,%ebx

+	adcl	advancetable+4(,%eax,4),%esi

+LLEntry4_8:

+	cmpw	8(%ecx),%bp

+	jl		Lp13

+	movb	(%esi),%al

+	cmpb	$(TRANSPARENT_COLOR),%al

+	jz		Lp13

+	movw	%bp,8(%ecx)

+	movb	%al,4(%edi)

+Lp13:

+	addl	izistep,%ebp

+	adcl	$0,%ebp

+	addl	tstep,%edx

+	sbbl	%eax,%eax

+	addl	sstep,%ebx

+	adcl	advancetable+4(,%eax,4),%esi

+LLEntry3_8:

+	cmpw	10(%ecx),%bp

+	jl		Lp14

+	movb	(%esi),%al

+	cmpb	$(TRANSPARENT_COLOR),%al

+	jz		Lp14

+	movw	%bp,10(%ecx)

+	movb	%al,5(%edi)

+Lp14:

+	addl	izistep,%ebp

+	adcl	$0,%ebp

+	addl	tstep,%edx

+	sbbl	%eax,%eax

+	addl	sstep,%ebx

+	adcl	advancetable+4(,%eax,4),%esi

+LLEntry2_8:

+	cmpw	12(%ecx),%bp

+	jl		Lp15

+	movb	(%esi),%al

+	cmpb	$(TRANSPARENT_COLOR),%al

+	jz		Lp15

+	movw	%bp,12(%ecx)

+	movb	%al,6(%edi)

+Lp15:

+	addl	izistep,%ebp

+	adcl	$0,%ebp

+	addl	tstep,%edx

+	sbbl	%eax,%eax

+	addl	sstep,%ebx

+	adcl	advancetable+4(,%eax,4),%esi

+LEndSpan:						// final pixel of the span; no stepping follows it

+	cmpw	14(%ecx),%bp		// z test against the word in %bp, using the hardwired last-slot offset

+	jl		Lp16

+	movb	(%esi),%al		// load the texel for the final pixel of the span

+	cmpb	$(TRANSPARENT_COLOR),%al

+	jz		Lp16				// transparent texel: write nothing

+	movw	%bp,14(%ecx)

+	movb	%al,7(%edi)

+Lp16:

+//

+// clear s/z, t/z, 1/z from FP stack

+//

+	fstp %st(0)

+	fstp %st(0)

+	fstp %st(0)

+	popl	%ebx				// restore spans pointer

+LNextSpan:

+	addl	$(sspan_t_size),%ebx // point to next span

+	movl	sspan_t_count(%ebx),%ecx

+	cmpl	$0,%ecx				// any more spans?

+	jg		LSpanLoop			// yes

+	jz		LNextSpan			// yes, but this one's empty

+	popl	%ebx				// restore register variables (reached only when count is negative, which ends the list)

+	popl	%esi

+	popl	%edi

+	popl	%ebp				// restore the caller's stack frame

+	ret

+#endif	// id386

--- /dev/null

+++ b/u/d_varsa.s

@@ -1,0 +1,186 @@

+//

+// d_varsa.s

+//

+#include "asm_i386.h"

+#include "quakeasm.h"

+#include "asm_draw.h"

+#include "d_ifacea.h"

+#ifdef	id386

+	.data

+//-------------------------------------------------------

+// global refresh variables

+//-------------------------------------------------------

+// FIXME: put all refresh variables into one contiguous block. Make into one

+// big structure, like cl or sv?

+	.align	4

+.globl	C(d_sdivzstepu)

+.globl	C(d_tdivzstepu)

+.globl	C(d_zistepu)

+.globl	C(d_sdivzstepv)

+.globl	C(d_tdivzstepv)

+.globl	C(d_zistepv)

+.globl	C(d_sdivzorigin)

+.globl	C(d_tdivzorigin)

+.globl	C(d_ziorigin)

+C(d_sdivzstepu):	.single	0

+C(d_tdivzstepu):	.single	0

+C(d_zistepu):		.single	0

+C(d_sdivzstepv):	.single	0

+C(d_tdivzstepv):	.single	0

+C(d_zistepv):		.single	0

+C(d_sdivzorigin):	.single	0

+C(d_tdivzorigin):	.single	0

+C(d_ziorigin):		.single	0

+.globl	C(sadjust)

+.globl	C(tadjust)

+.globl	C(bbextents)

+.globl	C(bbextentt)

+C(sadjust):			.long	0

+C(tadjust):			.long	0

+C(bbextents):		.long	0

+C(bbextentt):		.long	0

+.globl	C(cacheblock)

+.globl	C(d_viewbuffer)

+.globl	C(cachewidth)

+.globl	C(d_pzbuffer)

+.globl	C(d_zrowbytes)

+.globl	C(d_zwidth)

+C(cacheblock):		.long	0

+C(cachewidth):		.long	0

+C(d_viewbuffer):	.long	0

+C(d_pzbuffer):		.long	0

+C(d_zrowbytes):		.long	0

+C(d_zwidth):		.long	0

+//-------------------------------------------------------

+// ASM-only variables

+//-------------------------------------------------------

+.globl	izi

+izi:			.long	0

+.globl	pbase, s, t, sfracf, tfracf, snext, tnext

+.globl	spancountminus1, zi16stepu, sdivz16stepu, tdivz16stepu

+.globl	zi8stepu, sdivz8stepu, tdivz8stepu, pz

+s:				.long	0

+t:				.long	0

+snext:			.long	0

+tnext:			.long	0

+sfracf:			.long	0

+tfracf:			.long	0

+pbase:			.long	0

+zi8stepu:		.long	0

+sdivz8stepu:	.long	0

+tdivz8stepu:	.long	0

+zi16stepu:		.long	0

+sdivz16stepu:	.long	0

+tdivz16stepu:	.long	0

+spancountminus1: .long	0

+pz:				.long	0

+.globl	izistep

+izistep:				.long	0

+//-------------------------------------------------------

+// local variables for d_draw16.s

+//-------------------------------------------------------

+.globl	reciprocal_table_16, entryvec_table_16

+// 1/2, 1/3, 1/4, 1/5, 1/6, 1/7, 1/8, 1/9, 1/10, 1/11, 1/12, 1/13,

+// 1/14, and 1/15 in 0.32 form

+reciprocal_table_16:	.long	0x40000000, 0x2aaaaaaa, 0x20000000

+						.long	0x19999999, 0x15555555, 0x12492492

+						.long	0x10000000, 0xe38e38e, 0xccccccc, 0xba2e8ba

+						.long	0xaaaaaaa, 0x9d89d89, 0x9249249, 0x8888888

+	.extern Entry2_16

+	.extern Entry3_16

+	.extern Entry4_16

+	.extern Entry5_16

+	.extern Entry6_16

+	.extern Entry7_16

+	.extern Entry8_16

+	.extern Entry9_16

+	.extern Entry10_16

+	.extern Entry11_16

+	.extern Entry12_16

+	.extern Entry13_16

+	.extern Entry14_16

+	.extern Entry15_16

+	.extern Entry16_16

+entryvec_table_16:	.long	0, Entry2_16, Entry3_16, Entry4_16

+					.long	Entry5_16, Entry6_16, Entry7_16, Entry8_16

+					.long	Entry9_16, Entry10_16, Entry11_16, Entry12_16

+					.long	Entry13_16, Entry14_16, Entry15_16, Entry16_16

+//-------------------------------------------------------

+// local variables for d_parta.s

+//-------------------------------------------------------

+.globl	DP_Count, DP_u, DP_v, DP_32768, DP_Color, DP_Pix, DP_EntryTable

+DP_Count:		.long	0

+DP_u:			.long	0

+DP_v:			.long	0

+DP_32768:		.single	32768.0

+DP_Color:		.long	0

+DP_Pix:			.long	0

+	.extern DP_1x1

+	.extern DP_2x2

+	.extern DP_3x3

+	.extern DP_4x4

+DP_EntryTable:	.long	DP_1x1, DP_2x2, DP_3x3, DP_4x4

+//

+// advancetable is 8 bytes, but points to the middle of that range so negative

+// offsets will work

+//

+.globl	advancetable, sstep, tstep, pspantemp, counttemp, jumptemp

+advancetable:	.long	0, 0

+sstep:			.long	0

+tstep:			.long	0

+pspantemp:		.long	0

+counttemp:		.long	0

+jumptemp:		.long	0

+// 1/2, 1/3, 1/4, 1/5, 1/6, and 1/7 in 1.31 form (0x40000000 == 1/2)

+.globl	reciprocal_table, entryvec_table

+reciprocal_table:	.long	0x40000000, 0x2aaaaaaa, 0x20000000

+					.long	0x19999999, 0x15555555, 0x12492492

+	.extern Entry2_8

+	.extern Entry3_8

+	.extern Entry4_8

+	.extern Entry5_8

+	.extern Entry6_8

+	.extern Entry7_8

+	.extern Entry8_8

+entryvec_table:	.long	0, Entry2_8, Entry3_8, Entry4_8

+				.long	Entry5_8, Entry6_8, Entry7_8, Entry8_8

+	.extern Spr8Entry2_8

+	.extern Spr8Entry3_8

+	.extern Spr8Entry4_8

+	.extern Spr8Entry5_8

+	.extern Spr8Entry6_8

+	.extern Spr8Entry7_8

+	.extern Spr8Entry8_8

+.globl spr8entryvec_table

+spr8entryvec_table:	.long	0, Spr8Entry2_8, Spr8Entry3_8, Spr8Entry4_8

+					.long	Spr8Entry5_8, Spr8Entry6_8, Spr8Entry7_8, Spr8Entry8_8

+#endif	// id386

--- /dev/null

+++ b/u/math.s

@@ -1,0 +1,399 @@

+//

+// math.s

+// x86 assembly-language math routines.

+#define GLQUAKE	1	// don't include unneeded defs

+#include "asm_i386.h"

+#include "quakeasm.h"

+#ifdef	id386

+	.data

+	.align	4

+Ljmptab:	.long	Lcase0, Lcase1, Lcase2, Lcase3

+			.long	Lcase4, Lcase5, Lcase6, Lcase7

+	.text

+// TODO: rounding needed?

+// stack parameter offset

+#define	val	4

+.globl C(Invert24To16)

+C(Invert24To16):			// eax = 2^40 / val, or 0xFFFFFFFF when val <= 0x100

+	movl	val(%esp),%ecx		// ecx = val (the divisor)

+	movl	$0x100,%edx		// 0x10000000000 as dividend

+	cmpl	%edx,%ecx		// signed compare of val against 0x100

+	jle		LOutOfRange		// reject: for 0..0x100 the divl below would fault

+	subl	%eax,%eax		// edx:eax = 0x100:0x00000000 = 2^40

+	divl	%ecx			// eax = truncated quotient, edx = remainder

+	ret

+LOutOfRange:

+	movl	$0xFFFFFFFF,%eax	// fixed result for rejected inputs

+	ret

+#define	in	4			// stack offset of the first argument (no registers pushed)

+#define out	8			// stack offset of the second argument

+	.align 2

+.globl C(TransformVector)

+C(TransformVector):		// out[0]=in.vright, out[1]=in.vup, out[2]=in.vpn (3-float dot products)

+	movl	in(%esp),%eax	// eax = in

+	movl	out(%esp),%edx	// edx = out

+	flds	(%eax)		// in[0]

+	fmuls	C(vright)		// in[0]*vright[0]

+	flds	(%eax)		// in[0] | in[0]*vright[0]

+	fmuls	C(vup)		// in[0]*vup[0] | in[0]*vright[0]

+	flds	(%eax)		// in[0] | in[0]*vup[0] | in[0]*vright[0]

+	fmuls	C(vpn)		// in[0]*vpn[0] | in[0]*vup[0] | in[0]*vright[0]

+	flds	4(%eax)		// in[1] | ...

+	fmuls	C(vright)+4	// in[1]*vright[1] | ...

+	flds	4(%eax)		// in[1] | in[1]*vright[1] | ...

+	fmuls	C(vup)+4		// in[1]*vup[1] | in[1]*vright[1] | ...

+	flds	4(%eax)		// in[1] | in[1]*vup[1] | in[1]*vright[1] | ...

+	fmuls	C(vpn)+4		// in[1]*vpn[1] | in[1]*vup[1] | in[1]*vright[1] | ...

+	fxch	%st(2)		// in[1]*vright[1] | in[1]*vup[1] | in[1]*vpn[1] | ...

+	faddp	%st(0),%st(5)	// in[1]*vup[1] | in[1]*vpn[1] | ... (vright term folded in)

+	faddp	%st(0),%st(3)	// in[1]*vpn[1] | ... (vup term folded in)

+	faddp	%st(0),%st(1)	// vpn_accum | vup_accum | vright_accum

+	flds	8(%eax)		// in[2] | ...

+	fmuls	C(vright)+8	// in[2]*vright[2] | ...

+	flds	8(%eax)		// in[2] | in[2]*vright[2] | ...

+	fmuls	C(vup)+8		// in[2]*vup[2] | in[2]*vright[2] | ...

+	flds	8(%eax)		// in[2] | in[2]*vup[2] | in[2]*vright[2] | ...

+	fmuls	C(vpn)+8		// in[2]*vpn[2] | in[2]*vup[2] | in[2]*vright[2] | ...

+	fxch	%st(2)		// in[2]*vright[2] | in[2]*vup[2] | in[2]*vpn[2] | ...

+	faddp	%st(0),%st(5)	// in[2]*vup[2] | in[2]*vpn[2] | ... (vright term folded in)

+	faddp	%st(0),%st(3)	// in[2]*vpn[2] | ... (vup term folded in)

+	faddp	%st(0),%st(1)	// vpn_accum | vup_accum | vright_accum

+	fstps	8(%edx)		// out[2]

+	fstps	4(%edx)		// out[1]

+	fstps	(%edx)		// out[0]

+	ret

+#define EMINS	4+4			// argument offsets; the leading 4 skips the pushed ebx

+#define EMAXS	4+8

+#define P		4+12

+	.align 2

+.globl C(BoxOnPlaneSide)

+C(BoxOnPlaneSide):			// returns sides in eax: bit 0 = dist1 >= p->dist, bit 1 = dist2 < p->dist

+	pushl	%ebx

+	movl	P(%esp),%edx		// edx = p

+	movl	EMINS(%esp),%ecx	// ecx = emins

+	xorl	%eax,%eax			// clear eax so al can be used as a table index

+	movl	EMAXS(%esp),%ebx	// ebx = emaxs

+	movb	pl_signbits(%edx),%al

+	cmpb	$8,%al

+	jae		Lerror				// unsigned test: 0x80-0xFF must not reach the 8-entry table

+	flds	pl_normal(%edx)		// p->normal[0]

+	fld		%st(0)				// p->normal[0] | p->normal[0]

+	jmp		Ljmptab(,%eax,4)

+//dist1= p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];

+//dist2= p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];

+Lcase0:

+	fmuls	(%ebx)				// p->normal[0]*emaxs[0] | p->normal[0]

+	flds	pl_normal+4(%edx)	// p->normal[1] | p->normal[0]*emaxs[0] |

+								//  p->normal[0]

+	fxch	%st(2)				// p->normal[0] | p->normal[0]*emaxs[0] |

+								//  p->normal[1]

+	fmuls	(%ecx)				// p->normal[0]*emins[0] |

+								//  p->normal[0]*emaxs[0] | p->normal[1]

+	fxch	%st(2)				// p->normal[1] | p->normal[0]*emaxs[0] |

+								//  p->normal[0]*emins[0]

+	fld		%st(0)				// p->normal[1] | p->normal[1] |

+								//  p->normal[0]*emaxs[0] |

+								//  p->normal[0]*emins[0]

+	fmuls	4(%ebx)				// p->normal[1]*emaxs[1] | p->normal[1] |

+								//  p->normal[0]*emaxs[0] |

+								//  p->normal[0]*emins[0]

+	flds	pl_normal+8(%edx)	// p->normal[2] | p->normal[1]*emaxs[1] |

+								//  p->normal[1] | p->normal[0]*emaxs[0] |

+								//  p->normal[0]*emins[0]

+	fxch	%st(2)				// p->normal[1] | p->normal[1]*emaxs[1] |

+								//  p->normal[2] | p->normal[0]*emaxs[0] |

+								//  p->normal[0]*emins[0]

+	fmuls	4(%ecx)				// p->normal[1]*emins[1] |

+								//  p->normal[1]*emaxs[1] |

+								//  p->normal[2] | p->normal[0]*emaxs[0] |

+								//  p->normal[0]*emins[0]

+	fxch	%st(2)				// p->normal[2] | p->normal[1]*emaxs[1] |

+								//  p->normal[1]*emins[1] |

+								//  p->normal[0]*emaxs[0] |

+								//  p->normal[0]*emins[0]

+	fld		%st(0)				// p->normal[2] | p->normal[2] |

+								//  p->normal[1]*emaxs[1] |

+								//  p->normal[1]*emins[1] |

+								//  p->normal[0]*emaxs[0] |

+								//  p->normal[0]*emins[0]

+	fmuls	8(%ebx)				// p->normal[2]*emaxs[2] |

+								//  p->normal[2] |

+								//  p->normal[1]*emaxs[1] |

+								//  p->normal[1]*emins[1] |

+								//  p->normal[0]*emaxs[0] |

+								//  p->normal[0]*emins[0]

+	fxch	%st(5)				// p->normal[0]*emins[0] |

+								//  p->normal[2] |

+								//  p->normal[1]*emaxs[1] |

+								//  p->normal[1]*emins[1] |

+								//  p->normal[0]*emaxs[0] |

+								//  p->normal[2]*emaxs[2]

+	faddp	%st(0),%st(3)		//p->normal[2] |

+								// p->normal[1]*emaxs[1] |

+								// p->normal[1]*emins[1]+p->normal[0]*emins[0]|

+								// p->normal[0]*emaxs[0] |

+								// p->normal[2]*emaxs[2]

+	fmuls	8(%ecx)				//p->normal[2]*emins[2] |

+								// p->normal[1]*emaxs[1] |

+								// p->normal[1]*emins[1]+p->normal[0]*emins[0]|

+								// p->normal[0]*emaxs[0] |

+								// p->normal[2]*emaxs[2]

+	fxch	%st(1)				//p->normal[1]*emaxs[1] |

+								// p->normal[2]*emins[2] |

+								// p->normal[1]*emins[1]+p->normal[0]*emins[0]|

+								// p->normal[0]*emaxs[0] |

+								// p->normal[2]*emaxs[2]

+	faddp	%st(0),%st(3)		//p->normal[2]*emins[2] |

+								// p->normal[1]*emins[1]+p->normal[0]*emins[0]|

+								// p->normal[0]*emaxs[0]+p->normal[1]*emaxs[1]|

+								// p->normal[2]*emaxs[2]

+	fxch	%st(3)				//p->normal[2]*emaxs[2] |

+								// p->normal[1]*emins[1]+p->normal[0]*emins[0]|

+								// p->normal[0]*emaxs[0]+p->normal[1]*emaxs[1]|

+								// p->normal[2]*emins[2]

+	faddp	%st(0),%st(2)		//p->normal[1]*emins[1]+p->normal[0]*emins[0]|

+								// dist1 | p->normal[2]*emins[2]

+	jmp		LSetSides

+//dist1= p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];

+//dist2= p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];

+Lcase1:

+	fmuls	(%ecx)				// emins[0]

+	flds	pl_normal+4(%edx)

+	fxch	%st(2)

+	fmuls	(%ebx)				// emaxs[0]

+	fxch	%st(2)

+	fld		%st(0)

+	fmuls	4(%ebx)				// emaxs[1]

+	flds	pl_normal+8(%edx)

+	fxch	%st(2)

+	fmuls	4(%ecx)				// emins[1]

+	fxch	%st(2)

+	fld		%st(0)

+	fmuls	8(%ebx)				// emaxs[2]

+	fxch	%st(5)

+	faddp	%st(0),%st(3)

+	fmuls	8(%ecx)				// emins[2]

+	fxch	%st(1)

+	faddp	%st(0),%st(3)

+	fxch	%st(3)

+	faddp	%st(0),%st(2)

+	jmp		LSetSides

+//dist1= p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];

+//dist2= p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];

+Lcase2:

+	fmuls	(%ebx)				// emaxs[0]

+	flds	pl_normal+4(%edx)

+	fxch	%st(2)

+	fmuls	(%ecx)				// emins[0]

+	fxch	%st(2)

+	fld		%st(0)

+	fmuls	4(%ecx)				// emins[1]

+	flds	pl_normal+8(%edx)

+	fxch	%st(2)

+	fmuls	4(%ebx)				// emaxs[1]

+	fxch	%st(2)

+	fld		%st(0)

+	fmuls	8(%ebx)				// emaxs[2]

+	fxch	%st(5)

+	faddp	%st(0),%st(3)

+	fmuls	8(%ecx)				// emins[2]

+	fxch	%st(1)

+	faddp	%st(0),%st(3)

+	fxch	%st(3)

+	faddp	%st(0),%st(2)

+	jmp		LSetSides

+//dist1= p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];

+//dist2= p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];

+Lcase3:

+	fmuls	(%ecx)				// emins[0]

+	flds	pl_normal+4(%edx)

+	fxch	%st(2)

+	fmuls	(%ebx)				// emaxs[0]

+	fxch	%st(2)

+	fld		%st(0)

+	fmuls	4(%ecx)				// emins[1]

+	flds	pl_normal+8(%edx)

+	fxch	%st(2)

+	fmuls	4(%ebx)				// emaxs[1]

+	fxch	%st(2)

+	fld		%st(0)

+	fmuls	8(%ebx)				// emaxs[2]

+	fxch	%st(5)

+	faddp	%st(0),%st(3)

+	fmuls	8(%ecx)				// emins[2]

+	fxch	%st(1)

+	faddp	%st(0),%st(3)

+	fxch	%st(3)

+	faddp	%st(0),%st(2)

+	jmp		LSetSides

+//dist1= p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];

+//dist2= p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];

+Lcase4:

+	fmuls	(%ebx)				// emaxs[0]

+	flds	pl_normal+4(%edx)

+	fxch	%st(2)

+	fmuls	(%ecx)				// emins[0]

+	fxch	%st(2)

+	fld		%st(0)

+	fmuls	4(%ebx)				// emaxs[1]

+	flds	pl_normal+8(%edx)

+	fxch	%st(2)

+	fmuls	4(%ecx)				// emins[1]

+	fxch	%st(2)

+	fld		%st(0)

+	fmuls	8(%ecx)				// emins[2]

+	fxch	%st(5)

+	faddp	%st(0),%st(3)

+	fmuls	8(%ebx)				// emaxs[2]

+	fxch	%st(1)

+	faddp	%st(0),%st(3)

+	fxch	%st(3)

+	faddp	%st(0),%st(2)

+	jmp		LSetSides

+//dist1= p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];

+//dist2= p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];

+Lcase5:

+	fmuls	(%ecx)				// emins[0]

+	flds	pl_normal+4(%edx)

+	fxch	%st(2)

+	fmuls	(%ebx)				// emaxs[0]

+	fxch	%st(2)

+	fld		%st(0)

+	fmuls	4(%ebx)				// emaxs[1]

+	flds	pl_normal+8(%edx)

+	fxch	%st(2)

+	fmuls	4(%ecx)				// emins[1]

+	fxch	%st(2)

+	fld		%st(0)

+	fmuls	8(%ecx)				// emins[2]

+	fxch	%st(5)

+	faddp	%st(0),%st(3)

+	fmuls	8(%ebx)				// emaxs[2]

+	fxch	%st(1)

+	faddp	%st(0),%st(3)

+	fxch	%st(3)

+	faddp	%st(0),%st(2)

+	jmp		LSetSides

+//dist1= p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];

+//dist2= p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];

+Lcase6:

+	fmuls	(%ebx)				// emaxs[0]

+	flds	pl_normal+4(%edx)

+	fxch	%st(2)

+	fmuls	(%ecx)				// emins[0]

+	fxch	%st(2)

+	fld		%st(0)

+	fmuls	4(%ecx)				// emins[1]

+	flds	pl_normal+8(%edx)

+	fxch	%st(2)

+	fmuls	4(%ebx)				// emaxs[1]

+	fxch	%st(2)

+	fld		%st(0)

+	fmuls	8(%ecx)				// emins[2]

+	fxch	%st(5)

+	faddp	%st(0),%st(3)

+	fmuls	8(%ebx)				// emaxs[2]

+	fxch	%st(1)

+	faddp	%st(0),%st(3)

+	fxch	%st(3)

+	faddp	%st(0),%st(2)

+	jmp		LSetSides

+//dist1= p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];

+//dist2= p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];

+Lcase7:

+	fmuls	(%ecx)				// emins[0]

+	flds	pl_normal+4(%edx)

+	fxch	%st(2)

+	fmuls	(%ebx)				// emaxs[0]

+	fxch	%st(2)

+	fld		%st(0)

+	fmuls	4(%ecx)				// emins[1]

+	flds	pl_normal+8(%edx)

+	fxch	%st(2)

+	fmuls	4(%ebx)				// emaxs[1]

+	fxch	%st(2)

+	fld		%st(0)

+	fmuls	8(%ecx)				// emins[2]

+	fxch	%st(5)

+	faddp	%st(0),%st(3)

+	fmuls	8(%ebx)				// emaxs[2]

+	fxch	%st(1)

+	faddp	%st(0),%st(3)

+	fxch	%st(3)

+	faddp	%st(0),%st(2)

+LSetSides:

+//	sides = 0;

+//	if (dist1 >= p->dist)

+//		sides = 1;

+//	if (dist2 < p->dist)

+//		sides |= 2;

+	faddp	%st(0),%st(2)		// dist1 | dist2

+	fcomps	pl_dist(%edx)		// compare dist1 with p->dist and pop; dist2 left

+	xorl	%ecx,%ecx			// sides = 0

+	fnstsw	%ax					// ah bit 0 = C0, set when dist1 < p->dist

+	fcomps	pl_dist(%edx)		// compare dist2 with p->dist and pop; FPU stack empty

+	andb	$1,%ah				// keep only C0 of the first compare

+	xorb	$1,%ah				// invert: 1 when C0 was clear

+	addb	%ah,%cl				// sides = that bit

+	fnstsw	%ax					// ah bit 0 = C0, set when dist2 < p->dist

+	andb	$1,%ah				// keep only C0 of the second compare

+	addb	%ah,%ah				// move it to bit 1

+	addb	%ah,%cl				// sides |= 2 when set

+//	return sides;

+	popl	%ebx

+	movl	%ecx,%eax	// return status

+	ret

+Lerror:

+	call	C(BOPS_Error)		// NOTE(review): no pop/ret follows; presumably BOPS_Error never returns - confirm

+#endif	// id386

--- /dev/null

+++ b/u/quakeasm.h

@@ -1,0 +1,248 @@

+//

+// quakeasm.h: general asm header file

+//

+//#define GLQUAKE	1

+#ifdef __i386__

+#define id386

+#endif

+// !!! must be kept the same as in d_iface.h !!!

+#define TRANSPARENT_COLOR	255

+#ifndef GLQUAKE

+	.extern C(d_zistepu)

+	.extern C(d_pzbuffer)

+	.extern C(d_zistepv)

+	.extern C(d_zrowbytes)

+	.extern C(d_ziorigin)

+	.extern C(r_turb_s)

+	.extern C(r_turb_t)

+	.extern C(r_turb_pdest)

+	.extern C(r_turb_spancount)

+	.extern C(r_turb_turb)

+	.extern C(r_turb_pbase)

+	.extern C(r_turb_sstep)

+	.extern C(r_turb_tstep)

+	.extern	C(r_bmodelactive)

+	.extern	C(d_sdivzstepu)

+	.extern	C(d_tdivzstepu)

+	.extern	C(d_sdivzstepv)

+	.extern	C(d_tdivzstepv)

+	.extern	C(d_sdivzorigin)

+	.extern	C(d_tdivzorigin)

+	.extern	C(sadjust)

+	.extern	C(tadjust)

+	.extern	C(bbextents)

+	.extern	C(bbextentt)

+	.extern	C(cacheblock)

+	.extern	C(d_viewbuffer)

+	.extern	C(cachewidth)

+	.extern	C(d_pzbuffer)

+	.extern	C(d_zrowbytes)

+	.extern	C(d_zwidth)

+	.extern C(d_scantable)

+	.extern C(r_lightptr)

+	.extern C(r_numvblocks)

+	.extern C(prowdestbase)

+	.extern C(pbasesource)

+	.extern C(r_lightwidth)

+	.extern C(lightright)

+	.extern C(lightrightstep)

+	.extern C(lightdeltastep)

+	.extern C(lightdelta)

+	.extern C(lightright)

+	.extern C(lightdelta)

+	.extern C(sourcetstep)

+	.extern C(surfrowbytes)

+	.extern C(lightrightstep)

+	.extern C(lightdeltastep)

+	.extern C(r_sourcemax)

+	.extern C(r_stepback)

+	.extern C(colormap)

+	.extern C(blocksize)

+	.extern C(sourcesstep)

+	.extern C(lightleft)

+	.extern C(blockdivshift)

+	.extern C(blockdivmask)

+	.extern C(lightleftstep)

+	.extern C(r_origin)

+	.extern C(r_ppn)

+	.extern C(r_pup)

+	.extern C(r_pright)

+	.extern C(ycenter)

+	.extern C(xcenter)

+	.extern C(d_vrectbottom_particle)

+	.extern C(d_vrectright_particle)

+	.extern C(d_vrecty)

+	.extern C(d_vrectx)

+	.extern C(d_pix_shift)

+	.extern C(d_pix_min)

+	.extern C(d_pix_max)

+	.extern C(d_y_aspect_shift)

+	.extern C(screenwidth)

+	.extern C(r_leftclipped)

+	.extern C(r_leftenter)

+	.extern C(r_rightclipped)

+	.extern C(r_rightenter)

+	.extern C(modelorg)

+	.extern C(xscale)

+	.extern C(r_refdef)

+	.extern C(yscale)

+	.extern C(r_leftexit)

+	.extern C(r_rightexit)

+	.extern C(r_lastvertvalid)

+	.extern C(cacheoffset)

+	.extern C(newedges)

+	.extern C(removeedges)

+	.extern C(r_pedge)

+	.extern C(r_framecount)

+	.extern C(r_u1)

+	.extern C(r_emitted)

+	.extern C(edge_p)

+	.extern C(surface_p)

+	.extern C(surfaces)

+	.extern C(r_lzi1)

+	.extern C(r_v1)

+	.extern C(r_ceilv1)

+	.extern C(r_nearzi)

+	.extern C(r_nearzionly)

+	.extern C(edge_aftertail)

+	.extern C(edge_tail)

+	.extern C(current_iv)

+	.extern C(edge_head_u_shift20)

+	.extern C(span_p)

+	.extern C(edge_head)

+	.extern C(fv)

+	.extern C(edge_tail_u_shift20)

+	.extern C(r_apverts)

+	.extern C(r_anumverts)

+	.extern C(aliastransform)

+	.extern C(r_avertexnormals)

+	.extern C(r_plightvec)

+	.extern C(r_ambientlight)

+	.extern C(r_shadelight)

+	.extern C(aliasxcenter)

+	.extern C(aliasycenter)

+	.extern C(a_sstepxfrac)

+	.extern C(r_affinetridesc)

+	.extern C(acolormap)

+	.extern C(d_pcolormap)

+	.extern C(r_affinetridesc)

+	.extern C(d_sfrac)

+	.extern C(d_ptex)

+	.extern C(d_pedgespanpackage)

+	.extern C(d_tfrac)

+	.extern C(d_light)

+	.extern C(d_zi)

+	.extern C(d_pdest)

+	.extern C(d_pz)

+	.extern C(d_aspancount)

+	.extern C(erroradjustup)

+	.extern C(errorterm)

+	.extern C(d_xdenom)

+	.extern C(r_p0)

+	.extern C(r_p1)

+	.extern C(r_p2)

+	.extern C(a_tstepxfrac)

+	.extern C(r_sstepx)

+	.extern C(r_tstepx)

+	.extern C(a_ststepxwhole)

+	.extern C(zspantable)

+	.extern C(skintable)

+	.extern C(r_zistepx)

+	.extern C(erroradjustdown)

+	.extern C(d_countextrastep)

+	.extern C(ubasestep)

+	.extern C(a_ststepxwhole)

+	.extern C(a_tstepxfrac)

+	.extern C(r_lstepx)

+	.extern C(a_spans)

+	.extern C(erroradjustdown)

+	.extern C(d_pdestextrastep)

+	.extern C(d_pzextrastep)

+	.extern C(d_sfracextrastep)

+	.extern C(d_ptexextrastep)

+	.extern C(d_countextrastep)

+	.extern C(d_tfracextrastep)

+	.extern C(d_lightextrastep)

+	.extern C(d_ziextrastep)

+	.extern C(d_pdestbasestep)

+	.extern C(d_pzbasestep)

+	.extern C(d_sfracbasestep)

+	.extern C(d_ptexbasestep)

+	.extern C(ubasestep)

+	.extern C(d_tfracbasestep)

+	.extern C(d_lightbasestep)

+	.extern C(d_zibasestep)

+	.extern C(zspantable)

+	.extern C(r_lstepy)

+	.extern C(r_sstepy)

+	.extern C(r_tstepy)

+	.extern C(r_zistepy)

+	.extern C(D_PolysetSetEdgeTable)

+	.extern C(D_RasterizeAliasPolySmooth)

+	.extern float_point5

+	.extern Float2ToThe31nd

+	.extern izistep

+	.extern izi

+	.extern FloatMinus2ToThe31nd

+	.extern float_1

+	.extern float_particle_z_clip

+	.extern float_minus_1

+	.extern float_0

+	.extern fp_16

+	.extern fp_64k

+	.extern fp_1m

+	.extern fp_1m_minus_1

+	.extern fp_8

+	.extern entryvec_table

+	.extern advancetable

+	.extern sstep

+	.extern tstep

+	.extern pspantemp

+	.extern counttemp

+	.extern jumptemp

+	.extern reciprocal_table

+	.extern DP_Count

+	.extern DP_u

+	.extern DP_v

+	.extern DP_32768

+	.extern DP_Color

+	.extern DP_Pix

+	.extern DP_EntryTable

+	.extern	pbase

+	.extern s

+	.extern t

+	.extern sfracf

+	.extern tfracf

+	.extern snext

+	.extern tnext

+	.extern	spancountminus1

+	.extern zi16stepu

+	.extern sdivz16stepu

+	.extern tdivz16stepu

+	.extern	zi8stepu

+	.extern sdivz8stepu

+	.extern tdivz8stepu

+	.extern reciprocal_table_16

+	.extern entryvec_table_16

+	.extern ceil_cw

+	.extern single_cw

+	.extern fp_64kx64k

+	.extern pz

+	.extern spr8entryvec_table

+#endif

+	.extern C(snd_scaletable)

+	.extern C(paintbuffer)

+	.extern C(snd_linear_count)

+	.extern C(snd_p)

+	.extern C(snd_vol)

+	.extern C(snd_out)

+	.extern C(vright)

+	.extern C(vup)

+	.extern C(vpn)

+	.extern C(BOPS_Error)

--- /dev/null

+++ b/u/r_aclipa.s

@@ -1,0 +1,197 @@

+//

+// r_aclipa.s

+// x86 assembly-language Alias model edge clipping code.

+//

+#include "asm_i386.h"

+#include "quakeasm.h"

+#include "asm_draw.h"

+#include "d_ifacea.h"

+#ifdef id386

+	.data

+Ltemp0:	.long	0

+Ltemp1:	.long	0

+	.text

+#define pfv0		8+4			// argument offsets; the leading 8 skips the pushed esi and edi

+#define pfv1		8+8

+#define out			8+12

+.globl C(R_Alias_clip_bottom)

+C(R_Alias_clip_bottom):			// interpolate between *pfv0 and *pfv1 at the bottom clip value, into *out

+	pushl	%esi

+	pushl	%edi

+	movl	pfv0(%esp),%esi		// esi = pfv0

+	movl	pfv1(%esp),%edi		// edi = pfv1

+	movl	C(r_refdef)+rd_aliasvrectbottom,%eax	// eax = clip value

+LDoForwardOrBackward:			// shared entry: eax = clip value, esi/edi = the two vertices

+	movl	fv_v+4(%esi),%edx	// edx = pfv0->v[1]

+	movl	fv_v+4(%edi),%ecx	// ecx = pfv1->v[1]

+	cmpl	%ecx,%edx

+	jl		LDoForward			// pfv0->v[1] < pfv1->v[1] (signed): keep the order

+	movl	fv_v+4(%esi),%ecx	// otherwise swap the roles: ecx = pfv0->v[1]

+	movl	fv_v+4(%edi),%edx	// edx = pfv1->v[1]

+	movl	pfv0(%esp),%edi		// edi = pfv0

+	movl	pfv1(%esp),%esi		// esi = pfv1

+LDoForward:					// edx / ecx = clipped coordinate of the esi / edi vertex

+	subl	%edx,%ecx			// ecx = coordinate delta between the two vertices

+	subl	%edx,%eax			// eax = clip value minus the esi vertex coordinate

+	movl	%ecx,Ltemp1

+	movl	%eax,Ltemp0

+	fildl	Ltemp1				// delta

+	fildl	Ltemp0				// clip offset | delta

+	movl	out(%esp),%edx		// edx = out

+	movl	$2,%eax				// two passes of three ints each: v[0..2], then v[3..5]

+	fdivp	%st(0),%st(1)					// scale

+LDo3Forward:					// fv0 = vertex at esi, fv1 = vertex at edi in the comments below

+	fildl	fv_v+0(%esi)	// fv0v0 | scale

+	fildl	fv_v+0(%edi)	// fv1v0 | fv0v0 | scale

+	fildl	fv_v+4(%esi)	// fv0v1 | fv1v0 | fv0v0 | scale

+	fildl	fv_v+4(%edi)	// fv1v1 | fv0v1 | fv1v0 | fv0v0 | scale

+	fildl	fv_v+8(%esi)	// fv0v2 | fv1v1 | fv0v1 | fv1v0 | fv0v0 | scale

+	fildl	fv_v+8(%edi)	// fv1v2 | fv0v2 | fv1v1 | fv0v1 | fv1v0 | fv0v0 |

+							//  scale

+	fxch	%st(5)			// fv0v0 | fv0v2 | fv1v1 | fv0v1 | fv1v0 | fv1v2 |

+							//  scale

+	fsubr	%st(0),%st(4)	// fv0v0 | fv0v2 | fv1v1 | fv0v1 | fv1v0-fv0v0 |

+							//  fv1v2 | scale

+	fxch	%st(3)			// fv0v1 | fv0v2 | fv1v1 | fv0v0 | fv1v0-fv0v0 |

+							//  fv1v2 | scale

+	fsubr	%st(0),%st(2)	// fv0v1 | fv0v2 | fv1v1-fv0v1 | fv0v0 |

+							//  fv1v0-fv0v0 | fv1v2 | scale

+	fxch	%st(1)			// fv0v2 | fv0v1 | fv1v1-fv0v1 | fv0v0 |

+							//  fv1v0-fv0v0 | fv1v2 | scale

+	fsubr	%st(0),%st(5)	// fv0v2 | fv0v1 | fv1v1-fv0v1 | fv0v0 |

+							//  fv1v0-fv0v0 | fv1v2-fv0v2 | scale

+	fxch	%st(6)			// scale | fv0v1 | fv1v1-fv0v1 | fv0v0 |

+							//  fv1v0-fv0v0 | fv1v2-fv0v2 | fv0v2

+	fmul	%st(0),%st(4)	// scale | fv0v1 | fv1v1-fv0v1 | fv0v0 |

+							//  (fv1v0-fv0v0)*scale | fv1v2-fv0v2 | fv0v2

+	addl	$12,%edi		// advance edi to the next three ints for pass 2

+	fmul	%st(0),%st(2)	// scale | fv0v1 | (fv1v1-fv0v1)*scale | fv0v0 |

+							//  (fv1v0-fv0v0)*scale | fv1v2-fv0v2 | fv0v2

+	addl	$12,%esi		// advance esi likewise

+	addl	$12,%edx		// advance out; the stores below compensate with -12

+	fmul	%st(0),%st(5)	// scale | fv0v1 | (fv1v1-fv0v1)*scale | fv0v0 |

+							//  (fv1v0-fv0v0)*scale | (fv1v2-fv0v2)*scale |

+							//  fv0v2

+	fxch	%st(3)			// fv0v0 | fv0v1 | (fv1v1-fv0v1)*scale | scale |

+							//  (fv1v0-fv0v0)*scale | (fv1v2-fv0v2)*scale |

+							//  fv0v2

+	faddp	%st(0),%st(4)	// fv0v1 | (fv1v1-fv0v1)*scale | scale |

+							//  fv0v0+(fv1v0-fv0v0)*scale |

+							//  (fv1v2-fv0v2)*scale | fv0v2

+	faddp	%st(0),%st(1)	// fv0v1+(fv1v1-fv0v1)*scale | scale |

+							//  fv0v0+(fv1v0-fv0v0)*scale |

+							//  (fv1v2-fv0v2)*scale | fv0v2

+	fxch	%st(4)			// fv0v2 | scale | fv0v0+(fv1v0-fv0v0)*scale |

+							//  (fv1v2-fv0v2)*scale | fv0v1+(fv1v1-fv0v1)*scale

+	faddp	%st(0),%st(3)	// scale | fv0v0+(fv1v0-fv0v0)*scale |

+							//  fv0v2+(fv1v2-fv0v2)*scale |

+							//  fv0v1+(fv1v1-fv0v1)*scale

+	fxch	%st(1)			// fv0v0+(fv1v0-fv0v0)*scale | scale |

+							//  fv0v2+(fv1v2-fv0v2)*scale |

+							//  fv0v1+(fv1v1-fv0v1)*scale

+	fadds	float_point5	// add float_point5 to the v0 result before the integer store

+	fxch	%st(3)			// fv0v1+(fv1v1-fv0v1)*scale | scale |

+							//  fv0v2+(fv1v2-fv0v2)*scale |

+							//  fv0v0+(fv1v0-fv0v0)*scale

+	fadds	float_point5	// same for the v1 result

+	fxch	%st(2)			// fv0v2+(fv1v2-fv0v2)*scale | scale |

+							//  fv0v1+(fv1v1-fv0v1)*scale |

+							//  fv0v0+(fv1v0-fv0v0)*scale

+	fadds	float_point5	// same for the v2 result

+	fxch	%st(3)			// fv0v0+(fv1v0-fv0v0)*scale | scale |

+							//  fv0v1+(fv1v1-fv0v1)*scale |

+							//  fv0v2+(fv1v2-fv0v2)*scale

+	fistpl	fv_v+0-12(%edx)	// scale | fv0v1+(fv1v1-fv0v1)*scale |

+							//  fv0v2+(fv1v2-fv0v2)*scale

+	fxch	%st(1)			// fv0v1+(fv1v1-fv0v1)*scale | scale |

+							//  fv0v2+(fv1v2-fv0v2)*scale

+	fistpl	fv_v+4-12(%edx)	// scale | fv0v2+(fv1v2-fv0v2)*scale

+	fxch	%st(1)			// fv0v2+(fv1v2-fv0v2)*sc | scale

+	fistpl	fv_v+8-12(%edx)	// scale

+	decl	%eax

+	jnz		LDo3Forward		// second pass handles the next three ints

+	fstp	%st(0)			// pop scale; FPU stack is empty again

+	popl	%edi

+	popl	%esi

+	ret

+.globl C(R_Alias_clip_top)

+C(R_Alias_clip_top):			// same arguments as R_Alias_clip_bottom; only the clip value differs

+	pushl	%esi				// saved here, restored by the shared tail after LDo3Forward

+	pushl	%edi

+	movl	pfv0(%esp),%esi		// esi = pfv0

+	movl	pfv1(%esp),%edi		// edi = pfv1

+	movl	C(r_refdef)+rd_aliasvrect+4,%eax	// eax = clip value, read at rd_aliasvrect+4

+	jmp		LDoForwardOrBackward	// reuse the ordering and interpolation code

+.globl C(R_Alias_clip_right)

+C(R_Alias_clip_right):			// same arguments as R_Alias_clip_bottom; clips on v[0] instead of v[1]

+	pushl	%esi				// saved here, restored by the shared tail after LDo3Forward

+	pushl	%edi

+	movl	pfv0(%esp),%esi		// esi = pfv0

+	movl	pfv1(%esp),%edi		// edi = pfv1

+	movl	C(r_refdef)+rd_aliasvrectright,%eax	// eax = clip value

+LRightLeftEntry:				// shared entry for the right and left clippers

+	movl	fv_v+4(%esi),%edx	// vertex order is still decided by v[1]

+	movl	fv_v+4(%edi),%ecx

+	cmpl	%ecx,%edx			// flags consumed by the jl below; the movl in between leave them intact

+	movl	fv_v+0(%esi),%edx	// edx = pfv0->v[0]

+	movl	fv_v+0(%edi),%ecx	// ecx = pfv1->v[0]

+	jl		LDoForward2			// pfv0->v[1] < pfv1->v[1] (signed): keep the order

+	movl	fv_v+0(%esi),%ecx	// otherwise swap the roles: ecx = pfv0->v[0]

+	movl	fv_v+0(%edi),%edx	// edx = pfv1->v[0]

+	movl	pfv0(%esp),%edi		// edi = pfv0

+	movl	pfv1(%esp),%esi		// esi = pfv1

+LDoForward2:

+	jmp		LDoForward			// continue in the shared interpolation code

+.globl C(R_Alias_clip_left)

+C(R_Alias_clip_left):			// same arguments as R_Alias_clip_right; only the clip value differs

+	pushl	%esi				// saved here, restored by the shared tail after LDo3Forward

+	pushl	%edi

+	movl	pfv0(%esp),%esi		// esi = pfv0

+	movl	pfv1(%esp),%edi		// edi = pfv1

+	movl	C(r_refdef)+rd_aliasvrect+0,%eax	// eax = clip value, read at rd_aliasvrect+0

+	jmp		LRightLeftEntry		// reuse the v[0] clipping path

+#endif	// id386

--- /dev/null

+++ b/u/r_aliasa.s

@@ -1,0 +1,218 @@

+//

+// r_aliasa.s

+// x86 assembly-language Alias model transform and project code.

+//

+#include "asm_i386.h"

+#include "quakeasm.h"

+#include "asm_draw.h"

+#include "d_ifacea.h"

+#ifdef id386

+	.data

+Lfloat_1:	.single	1.0

+Ltemp:		.long	0

+Lcoords:	.long	0, 0, 0

+	.text

+#define fv			12+4		// argument offsets; the leading 12 skips the three pushed registers

+#define pstverts	12+8

+.globl C(R_AliasTransformAndProjectFinalVerts)

+C(R_AliasTransformAndProjectFinalVerts):

+	pushl	%ebp				// preserve caller's stack frame

+	pushl	%edi

+	pushl	%esi				// preserve register variables

+//	int			i, temp;

+//	float		lightcos, *plightnormal, zi;

+//	trivertx_t	*pverts;

+//	pverts = r_apverts;

+	movl	C(r_apverts),%esi

+//	for (i=0 ; i<r_anumverts ; i++, fv++, pverts++, pstverts++)

+//	{

+	movl	pstverts(%esp),%ebp	// ebp = pstverts

+	movl	fv(%esp),%edi		// edi = fv

+	movl	C(r_anumverts),%ecx	// NOTE: count is only tested at the loop bottom, so it must be nonzero

+	subl	%edx,%edx			// clear edx; only dl is written inside the loop

+Lloop:

+//	// transform and project

+//		zi = 1.0 / (DotProduct(pverts->v, aliastransform[2]) +

+//				aliastransform[2][3]);

+	movb	(%esi),%dl			// each byte coordinate is widened through a zeroed long in Lcoords

+	movb	%dl,Lcoords

+	fildl	Lcoords				// v[0]

+	movb	1(%esi),%dl

+	movb	%dl,Lcoords+4

+	fildl	Lcoords+4			// v[1] | v[0]

+	movb	2(%esi),%dl

+	movb	%dl,Lcoords+8

+	fildl	Lcoords+8			// v[2] | v[1] | v[0]

+	fld		%st(2)				// v[0] | v[2] | v[1] | v[0]

+	fmuls	C(aliastransform)+32 // accum | v[2] | v[1] | v[0]

+	fld		%st(2)				// v[1] | accum | v[2] | v[1] | v[0]

+	fmuls	C(aliastransform)+36 // accum2 | accum | v[2] | v[1] | v[0]

+	fxch	%st(1)				// accum | accum2 | v[2] | v[1] | v[0]

+	fadds	C(aliastransform)+44 // accum | accum2 | v[2] | v[1] | v[0]

+	fld		%st(2)				// v[2] | accum | accum2 | v[2] | v[1] | v[0]

+	fmuls	C(aliastransform)+40 // accum3 | accum | accum2 | v[2] | v[1] |

+								 //  v[0]

+	fxch	%st(1)				// accum | accum3 | accum2 | v[2] | v[1] | v[0]

+	faddp	%st(0),%st(2)		// accum3 | accum | v[2] | v[1] | v[0]

+	movb	tv_lightnormalindex(%esi),%dl	// edx = light normal index (upper bits still zero)

+	movl	stv_s(%ebp),%eax

+	movl	%eax,fv_v+8(%edi)	// fv->v[2] = pstverts->s

+	faddp	%st(0),%st(1)		// z | v[2] | v[1] | v[0]

+	movl	stv_t(%ebp),%eax

+	movl	%eax,fv_v+12(%edi)	// fv->v[3] = pstverts->t

+//	// lighting

+//		plightnormal = r_avertexnormals[pverts->lightnormalindex];

+	fdivrs	Lfloat_1			// zi | v[2] | v[1] | v[0]

+//		fv->v[2] = pstverts->s;

+//		fv->v[3] = pstverts->t;

+//		fv->flags = pstverts->onseam;

+	movl	stv_onseam(%ebp),%eax

+	movl	%eax,fv_flags(%edi)

+	movl	fv_size(%edi),%eax	// result discarded (eax is overwritten by the leal below);

+	movl	stv_size(%ebp),%eax	//  presumably these three loads only touch the next

+	movl	4(%esi),%eax		//  elements to warm the cache - TODO confirm

+	leal	(%edx,%edx,2),%eax	// index*3

+	fxch	%st(3)				// v[0] | v[2] | v[1] | zi

+//		lightcos = DotProduct (plightnormal, r_plightvec);

+	flds	C(r_avertexnormals)(,%eax,4)	// normals are 3 floats each, hence index*3 scaled by 4

+	fmuls	C(r_plightvec)

+	flds	C(r_avertexnormals)+4(,%eax,4)

+	fmuls	C(r_plightvec)+4

+	flds	C(r_avertexnormals)+8(,%eax,4)

+	fmuls	C(r_plightvec)+8

+	fxch	%st(1)

+	faddp	%st(0),%st(2)		// fold the second product into the first

+	fld		%st(2)				 // v[0] | laccum | laccum2 | v[0] | v[2] |

+								 //  v[1] | zi

+	fmuls	C(aliastransform)+0  // xaccum | laccum | laccum2 | v[0] | v[2] |

+								 //  v[1] | zi

+	fxch	%st(2)				 // laccum2 | laccum | xaccum | v[0] | v[2] |

+								 //  v[1] | zi

+	faddp	%st(0),%st(1)		 // laccum | xaccum | v[0] | v[2] | v[1] | zi

+//		temp = r_ambientlight;

+//		if (lightcos < 0)

+//		{

+	fsts	Ltemp				// store lightcos without popping, to test its sign bit as an integer

+	movl	C(r_ambientlight),%eax	// eax = temp

+	movb	Ltemp+3,%dl			// top byte of the float holds the sign bit

+	testb	$0x80,%dl

+	jz		Lsavelight	// no need to clamp if only ambient lit, because

+						//  r_ambientlight is preclamped

+//			temp += (int)(r_shadelight * lightcos);

+	fmuls	C(r_shadelight)

+// FIXME: fast float->int conversion?

+	fistpl	Ltemp				// pops lightcos; stack is xaccum | v[0] | v[2] | v[1] | zi

+	addl	Ltemp,%eax

+//		// clamp; because we limited the minimum ambient and shading light, we

+//		// don't have to clamp low light, just bright

+//			if (temp < 0)

+//				temp = 0;

+	jns		Lp1					// sign flag comes from the addl above

+	subl	%eax,%eax			// temp = 0

+//		}

+Lp1:

+//		fv->v[4] = temp;

+//

+//	// x, y, and z are scaled down by 1/2**31 in the transform, so 1/z is

+//	// scaled up by 2**31, and the scaling cancels out for x and y in the

+//	// projection

+//		fv->v[0] = ((DotProduct(pverts->v, aliastransform[0]) +

+//				aliastransform[0][3]) * zi) + aliasxcenter;

+//		fv->v[1] = ((DotProduct(pverts->v, aliastransform[1]) +

+//				aliastransform[1][3]) * zi) + aliasycenter;

+//		fv->v[5] = zi;

+	fxch	%st(1)				 // v[0] | xaccum | v[2] | v[1] | zi

+	fmuls	C(aliastransform)+16 // yaccum | xaccum | v[2] | v[1] | zi

+	fxch	%st(3)				 // v[1] | xaccum | v[2] | yaccum | zi

+	fld		%st(0)				 // v[1] | v[1] | xaccum | v[2] | yaccum | zi

+	fmuls	C(aliastransform)+4	 // xaccum2 | v[1] | xaccum | v[2] | yaccum |zi

+	fxch	%st(1)				 // v[1] | xaccum2 | xaccum | v[2] | yaccum |zi

+	movl	%eax,fv_v+16(%edi)	// fv->v[4] = temp

+	fmuls	C(aliastransform)+20 // yaccum2 | xaccum2 | xaccum | v[2] | yaccum|

+								 //  zi

+	fxch	%st(2)				 // xaccum | xaccum2 | yaccum2 | v[2] | yaccum|

+								 //  zi

+	fadds	C(aliastransform)+12 // xaccum | xaccum2 | yaccum2 | v[2] | yaccum|

+								 //  zi

+	fxch	%st(4)				 // yaccum | xaccum2 | yaccum2 | v[2] | xaccum|

+								 //  zi

+	fadds	C(aliastransform)+28 // yaccum | xaccum2 | yaccum2 | v[2] | xaccum|

+								 //  zi

+	fxch	%st(3)				 // v[2] | xaccum2 | yaccum2 | yaccum | xaccum|

+								 //  zi

+	fld		%st(0)				 // v[2] | v[2] | xaccum2 | yaccum2 | yaccum |

+								 //  xaccum | zi

+	fmuls	C(aliastransform)+8	 // xaccum3 | v[2] | xaccum2 | yaccum2 |yaccum|

+								 //  xaccum | zi

+	fxch	%st(1)				 // v[2] | xaccum3 | xaccum2 | yaccum2 |yaccum|

+								 //  xaccum | zi

+	fmuls	C(aliastransform)+24 // yaccum3 | xaccum3 | xaccum2 | yaccum2 |

+								 // yaccum | xaccum | zi

+	fxch	%st(5)				 // xaccum | xaccum3 | xaccum2 | yaccum2 |

+								 // yaccum | yaccum3 | zi

+	faddp	%st(0),%st(2)		 // xaccum3 | xaccum | yaccum2 | yaccum |

+								 //  yaccum3 | zi

+	fxch	%st(3)				 // yaccum | xaccum | yaccum2 | xaccum3 |

+								 //  yaccum3 | zi

+	faddp	%st(0),%st(2)		 // xaccum | yaccum | xaccum3 | yaccum3 | zi

+	addl	$(tv_size),%esi		// pverts++

+	faddp	%st(0),%st(2)		 // yaccum | x | yaccum3 | zi

+	faddp	%st(0),%st(2)		 // x | y | zi

+	addl	$(stv_size),%ebp	// pstverts++

+	fmul	%st(2),%st(0)		 // x/z | y | zi

+	fxch	%st(1)				 // y | x/z | zi

+	fmul	%st(2),%st(0)		 // y/z | x/z | zi

+	fxch	%st(1)				 // x/z | y/z | zi

+	fadds	C(aliasxcenter)		 // u | y/z | zi

+	fxch	%st(1)				 // y/z | u | zi

+	fadds	C(aliasycenter)		 // v | u | zi

+	fxch	%st(2)				 // zi | u | v

+// FIXME: fast float->int conversion?

+	fistpl	fv_v+20(%edi)		 // u | v

+	fistpl	fv_v+0(%edi)		 // v

+	fistpl	fv_v+4(%edi)		// FPU stack is empty again

+//	}

+	addl	$(fv_size),%edi		// fv++

+	decl	%ecx

+	jnz		Lloop

+	popl	%esi				// restore register variables

+	popl	%edi

+	popl	%ebp				// restore the caller's stack frame

+	ret

+Lsavelight:

+	fstp	%st(0)				// lightcos sign bit clear: drop it, eax keeps r_ambientlight

+	jmp		Lp1

+#endif	// id386

--- /dev/null

+++ b/u/r_drawa.s

@@ -1,0 +1,819 @@

+//

+// r_drawa.s

+// x86 assembly-language edge clipping and emission code

+//

+#include "asm_i386.h"

+#include "quakeasm.h"

+#include "asm_draw.h"

+#include "d_ifacea.h"

+#ifdef	id386

+// !!! if these are changed, they must be changed in r_draw.c too !!!

+#define FULLY_CLIPPED_CACHED	0x80000000

+#define FRAMECOUNT_MASK			0x7FFFFFFF

+	.data

+Ld0:			.single		0.0		// d0: pv0's signed distance to the clip plane

+Ld1:			.single		0.0		// d1: pv1's signed distance to the clip plane

+Lstack:			.long		0		// %esp saved at R_ClipEdge entry

+Lfp_near_clip:	.single		NEAR_CLIP	// lower bound applied to z before 1/z

+Lceilv0:		.long		0		// ceilv0 (integer)

+Lv:				.long		0		// integer v, staged here for fildl

+Lu0:			.long		0		// u0 (float bits, stored with fstps)

+Lv0:			.long		0		// v0 (float bits)

+Lzi0:			.long		0		// lzi0 (float bits)

+	.text

+//----------------------------------------------------------------------

+// edge clipping code: R_ClipEdge (pv0, pv1, clip), then edge emission

+//----------------------------------------------------------------------

+#define pv0		4+12		// argument offsets from %esp after the three

+#define pv1		8+12		// register pushes at entry (3 * 4 = 12)

+#define clip	12+12

+	.align 4

+.globl C(R_ClipEdge)

+C(R_ClipEdge):

+	pushl	%esi				// preserve register variables

+	pushl	%edi

+	pushl	%ebx

+	movl	%esp,Lstack			// for clearing the stack later

+//	float		d0, d1, f;

+//	mvertex_t	clipvert;

+	movl	clip(%esp),%ebx		// %ebx = clip (current clip plane)

+	movl	pv0(%esp),%esi		// %esi = pv0

+	movl	pv1(%esp),%edx		// %edx = pv1

+//	if (clip)

+//	{

+	testl	%ebx,%ebx

+	jz		Lemit				// clip == NULL: nothing to clip against

+//		do

+//		{

+Lcliploop:

+//			d0 = DotProduct (pv0->position, clip->normal) - clip->dist;

+//			d1 = DotProduct (pv1->position, clip->normal) - clip->dist;

+	flds	mv_position+0(%esi)

+	fmuls	cp_normal+0(%ebx)

+	flds	mv_position+4(%esi)

+	fmuls	cp_normal+4(%ebx)

+	flds	mv_position+8(%esi)

+	fmuls	cp_normal+8(%ebx)

+	fxch	%st(1)

+	faddp	%st(0),%st(2)		// d0mul2 | d0add0

+	flds	mv_position+0(%edx)

+	fmuls	cp_normal+0(%ebx)

+	flds	mv_position+4(%edx)

+	fmuls	cp_normal+4(%ebx)

+	flds	mv_position+8(%edx)

+	fmuls	cp_normal+8(%ebx)

+	fxch	%st(1)

+	faddp	%st(0),%st(2)		// d1mul2 | d1add0 | d0mul2 | d0add0

+	fxch	%st(3)				// d0add0 | d1add0 | d0mul2 | d1mul2

+	faddp	%st(0),%st(2)		// d1add0 | dot0 | d1mul2

+	faddp	%st(0),%st(2)		// dot0 | dot1

+	fsubs	cp_dist(%ebx)		// d0 | dot1

+	fxch	%st(1)				// dot1 | d0

+	fsubs	cp_dist(%ebx)		// d1 | d0

+	fxch	%st(1)				// d0 | d1

+	fstps	Ld0

+	fstps	Ld1

+//			if (d0 >= 0)  (done on the raw float bits: sign bit set == clipped)

+//			{

+	movl	Ld0,%eax			// %eax = bits of d0, still needed at Lp2

+	movl	Ld1,%ecx

+	orl		%eax,%ecx			// sign bit set if either float has its sign bit set

+	js		Lp2

+// both points are unclipped

+Lcontinue:

+// (the clip paths jump back here with %esi or %edx replaced by the clip point,

+//				R_ClipEdge (&clipvert, pv1, clip->next);

+//				return;

+//			}  so the recursion of the C version above becomes this loop)

+//		} while ((clip = clip->next) != NULL);

+	movl	cp_next(%ebx),%ebx

+	testl	%ebx,%ebx

+	jnz		Lcliploop

+//	}

+//// add the edge

+//	R_EmitEdge (pv0, pv1);

+Lemit:							// %esi = pv0, %edx = pv1 (either may be a clip point)

+//

+// set integer rounding to ceil mode, set to single precision

+//

+// FIXME: do away with by manually extracting integers from floats?

+// FIXME: set less often

+	fldcw	ceil_cw

+//	edge_t	*edge, *pcheck;

+//	int		u_check;

+//	float	u, u_step;

+//	vec3_t	local, transformed;

+//	float	*world;

+//	int		v, v2, ceilv0;

+//	float	scale, lzi0, u0, v0;

+//	int		side;

+//	if (r_lastvertvalid)

+//	{

+	cmpl	$0,C(r_lastvertvalid)

+	jz		LCalcFirst

+//		u0 = r_u1;

+//		v0 = r_v1;

+//		lzi0 = r_lzi1;

+//		ceilv0 = r_ceilv1;

+	movl	C(r_lzi1),%eax			// copied as raw 32-bit words

+	movl	C(r_u1),%ecx

+	movl	%eax,Lzi0

+	movl	%ecx,Lu0

+	movl	C(r_v1),%ecx

+	movl	C(r_ceilv1),%eax

+	movl	%ecx,Lv0

+	movl	%eax,Lceilv0

+	jmp		LCalcSecond

+//	}

+LCalcFirst:

+//	else

+//	{

+//		world = &pv0->position[0];

+	call	LTransformAndProject	// v0 | lzi0 | u0

+	fsts	Lv0

+	fxch	%st(2)					// u0 | lzi0 | v0

+	fstps	Lu0						// lzi0 | v0

+	fstps	Lzi0					// v0

+//		ceilv0 = (int)(v0 - 2000) + 2000; // ceil(v0);

+	fistpl	Lceilv0					// rounds up: ceil_cw was loaded at Lemit

+//	}

+LCalcSecond:

+//	world = &pv1->position[0];

+	movl	%edx,%esi				// %esi = pv1 for LTransformAndProject

+	call	LTransformAndProject	// v1 | lzi1 | u1

+	flds	Lu0						// u0 | v1 | lzi1 | u1

+	fxch	%st(3)					// u1 | v1 | lzi1 | u0

+	flds	Lzi0					// lzi0 | u1 | v1 | lzi1 | u0

+	fxch	%st(3)					// lzi1 | u1 | v1 | lzi0 | u0

+	flds	Lv0						// v0 | lzi1 | u1 | v1 | lzi0 | u0

+	fxch	%st(3)					// v1 | lzi1 | u1 | v0 | lzi0 | u0

+//	r_ceilv1 = (int)(r_v1 - 2000) + 2000; // ceil(r_v1);

+	fistl	C(r_ceilv1)				// rounds up: ceil_cw is still loaded

+	fldcw	single_cw				// put back normal floating-point state

+	fsts	C(r_v1)

+	fxch	%st(4)					// lzi0 | lzi1 | u1 | v0 | v1 | u0

+//	if (r_lzi1 > lzi0)

+//		lzi0 = r_lzi1;

+	fcom	%st(1)

+	fnstsw	%ax

+	testb	$1,%ah					// C0 set => lzi0 < lzi1

+	jz		LP0

+	fstp	%st(0)					// lzi1 | u1 | v0 | v1 | u0

+	fld		%st(0)					// lzi1 | lzi1 | u1 | v0 | v1 | u0

+LP0:

+	fxch	%st(1)					// lzi1 | lzi0 | u1 | v0 | v1 | u0

+	fstps	C(r_lzi1)				// lzi0 | u1 | v0 | v1 | u0

+	fxch	%st(1)					// u1 | lzi0 | v0 | v1 | u0

+	fsts	C(r_u1)

+	fxch	%st(1)					// lzi0 | u1 | v0 | v1 | u0

+//	if (lzi0 > r_nearzi)	// for mipmap finding

+//		r_nearzi = lzi0;

+	fcoms	C(r_nearzi)

+	fnstsw	%ax

+	testb	$0x45,%ah				// C3, C2, C0 all clear => lzi0 > r_nearzi

+	jnz		LP1

+	fsts	C(r_nearzi)

+LP1:

+// // for right edges, all we want is the effect on 1/z

+//	if (r_nearzionly)

+//		return;

+	movl	C(r_nearzionly),%eax

+	testl	%eax,%eax

+	jz		LP2

+LPop5AndDone:						// no edge is produced; five values left on the FP stack

+	movl	C(cacheoffset),%eax

+	movl	C(r_framecount),%edx

+	cmpl	$0x7FFFFFFF,%eax		// 0x7FFFFFFF is what the partial-clip paths store:

+	jz		LDoPop					//  leave cacheoffset alone in that case

+	andl	$(FRAMECOUNT_MASK),%edx

+	orl		$(FULLY_CLIPPED_CACHED),%edx

+	movl	%edx,C(cacheoffset)		// = FULLY_CLIPPED_CACHED | (r_framecount & FRAMECOUNT_MASK)

+LDoPop:

+	fstp	%st(0)			// u1 | v0 | v1 | u0

+	fstp	%st(0)			// v0 | v1 | u0

+	fstp	%st(0)			// v1 | u0

+	fstp	%st(0)			// u0

+	fstp	%st(0)

+	jmp		Ldone

+LP2:

+// // create the edge

+//	if (ceilv0 == r_ceilv1)

+//		return;		// horizontal edge

+	movl	Lceilv0,%ebx			// %ebx = ceilv0

+	movl	C(edge_p),%edi			// %edi = edge being built

+	movl	C(r_ceilv1),%ecx		// %ecx = r_ceilv1

+	movl	%edi,%edx

+	movl	C(r_pedge),%esi			// NOTE(review): %esi is overwritten before it is read

+	addl	$(et_size),%edx			// %edx = edge_p + 1

+	cmpl	%ecx,%ebx				// ZF: horizontal test; CF: side test at jc below

+	jz		LPop5AndDone

+	movl	C(r_pedge),%eax

+	movl	%eax,et_owner(%edi)

+//	side = ceilv0 > r_ceilv1;

+//

+//	edge->nearzi = lzi0;

+	fstps	et_nearzi(%edi)		// u1 | v0 | v1 | u0

+//	if (side == 1)

+//	{

+	jc		LSide0				// CF from the cmpl above (movl/fstps keep flags): ceilv0 < r_ceilv1, unsigned

+LSide1:

+//	// leading edge (go from p2 to p1)

+//		u_step = ((u0 - r_u1) / (v0 - r_v1));

+	fsubrp	%st(0),%st(3)		// v0 | v1 | u0-u1

+	fsub	%st(1),%st(0)		// v0-v1 | v1 | u0-u1

+	fdivrp	%st(0),%st(2)		// v1 | ustep

+//	r_emitted = 1;

+	movl	$1,C(r_emitted)

+//	edge = edge_p++;

+	movl	%edx,C(edge_p)

+// pretouch next edge (the loaded value is discarded)

+	movl	(%edx),%eax

+//		v2 = ceilv0 - 1;

+//		v = r_ceilv1;

+	movl	%ecx,%eax

+	leal	-1(%ebx),%ecx		// %ecx = v2

+	movl	%eax,%ebx			// %ebx = v

+//		edge->surfs[0] = 0;

+//		edge->surfs[1] = surface_p - surfaces;

+	movl	C(surface_p),%eax

+	movl	C(surfaces),%esi

+	subl	%edx,%edx

+	subl	%esi,%eax			// byte offset of surface_p from surfaces

+	shrl	$(SURF_T_SHIFT),%eax	// byte offset -> surface index

+	movl	%edx,et_surfs(%edi)		// zeroes surfs[0] and surfs[1]

+	movl	%eax,et_surfs+2(%edi)	// dword store: spills into et_nextremove, rewritten at LSetRemove

+	subl	%esi,%esi			// %esi = 0: no u_check bias (added at LP5)

+//		u = r_u1 + ((float)v - r_v1) * u_step;

+	movl	%ebx,Lv

+	fildl	Lv					// v | v1 | ustep

+	fsubp	%st(0),%st(1)		// v-v1 | ustep

+	fmul	%st(1),%st(0)		// (v-v1)*ustep | ustep

+	fadds	C(r_u1)				// u | ustep

+	jmp		LSideDone

+//	}

+LSide0:

+//	else

+//	{

+//	// trailing edge (go from p1 to p2)

+//		u_step = ((r_u1 - u0) / (r_v1 - v0));

+	fsub	%st(3),%st(0)		// u1-u0 | v0 | v1 | u0

+	fxch	%st(2)				// v1 | v0 | u1-u0 | u0

+	fsub	%st(1),%st(0)		// v1-v0 | v0 | u1-u0 | u0

+	fdivrp	%st(0),%st(2)		// v0 | ustep | u0

+//	r_emitted = 1;

+	movl	$1,C(r_emitted)

+//	edge = edge_p++;

+	movl	%edx,C(edge_p)

+// pretouch next edge (the loaded value is discarded)

+	movl	(%edx),%eax

+//		v = ceilv0;

+//		v2 = r_ceilv1 - 1;

+	decl	%ecx				// %ecx = v2; %ebx already holds v = ceilv0

+//		edge->surfs[0] = surface_p - surfaces;

+//		edge->surfs[1] = 0;

+	movl	C(surface_p),%eax

+	movl	C(surfaces),%esi

+	subl	%edx,%edx

+	subl	%esi,%eax			// byte offset of surface_p from surfaces

+	shrl	$(SURF_T_SHIFT),%eax	// byte offset -> surface index

+	movl	%edx,et_surfs+2(%edi)	// zero first (also spills into et_nextremove)...

+	movl	%eax,et_surfs(%edi)		// ...then a dword store: surfs[1] stays 0 only if the index fits 16 bits

+	movl	$1,%esi				// %esi = 1: u_check bias for trailers (added at LP5)

+//		u = u0 + ((float)v - v0) * u_step;

+	movl	%ebx,Lv

+	fildl	Lv					// v | v0 | ustep | u0

+	fsubp	%st(0),%st(1)		// v-v0 | ustep | u0

+	fmul	%st(1),%st(0)		// (v-v0)*ustep | ustep | u0

+	faddp	%st(0),%st(2)		// ustep | u

+	fxch	%st(1)				// u | ustep

+//	}

+LSideDone:

+//	edge->u_step = u_step*0x100000;

+//	edge->u = u*0x100000 + 0xFFFFF;

+	fmuls	fp_1m				// u*0x100000 | ustep

+	fxch	%st(1)				// ustep | u*0x100000

+	fmuls	fp_1m				// ustep*0x100000 | u*0x100000

+	fxch	%st(1)				// u*0x100000 | ustep*0x100000

+	fadds	fp_1m_minus_1		// u*0x100000 + 0xFFFFF | ustep*0x100000

+	fxch	%st(1)				// ustep*0x100000 | u*0x100000 + 0xFFFFF

+	fistpl	et_u_step(%edi)		// u*0x100000 + 0xFFFFF

+	fistpl	et_u(%edi)

+// // we need to do this to avoid stepping off the edges if a very nearly

+// // horizontal edge is less than epsilon above a scan, and numeric error

+// // causes it to incorrectly extend to the scan, and the extension of the

+// // line goes off the edge of the screen

+// // FIXME: is this actually needed?

+//	if (edge->u < r_refdef.vrect_x_adj_shift20)

+//		edge->u = r_refdef.vrect_x_adj_shift20;

+//	if (edge->u > r_refdef.vrectright_adj_shift20)

+//		edge->u = r_refdef.vrectright_adj_shift20;

+	movl	et_u(%edi),%eax

+	movl	C(r_refdef)+rd_vrect_x_adj_shift20,%edx

+	cmpl	%edx,%eax

+	jl		LP4					// below the left limit: clamp to %edx

+	movl	C(r_refdef)+rd_vrectright_adj_shift20,%edx

+	cmpl	%edx,%eax

+	jng		LP5					// not above the right limit: keep u

+LP4:

+	movl	%edx,et_u(%edi)		// clamp to whichever limit %edx holds

+	movl	%edx,%eax

+LP5:

+// // sort the edge in normally

+//	u_check = edge->u;

+//

+//	if (edge->surfs[0])

+//		u_check++;	// sort trailers after leaders

+	addl	%esi,%eax			// %esi is 1 from LSide0, 0 from LSide1

+//	if (!newedges[v] || newedges[v]->u >= u_check)

+//	{

+	movl	C(newedges)(,%ebx,4),%esi	// %esi = newedges[v]

+	testl	%esi,%esi

+	jz		LDoFirst

+	cmpl	%eax,et_u(%esi)

+	jl		LNotFirst

+LDoFirst:

+//		edge->next = newedges[v];

+//		newedges[v] = edge;

+	movl	%esi,et_next(%edi)

+	movl	%edi,C(newedges)(,%ebx,4)

+	jmp		LSetRemove

+//	}

+LNotFirst:

+//	else

+//	{

+//		pcheck = newedges[v];

+//

+//		while (pcheck->next && pcheck->next->u < u_check)

+//			pcheck = pcheck->next;

+LFindInsertLoop:

+	movl	%esi,%edx			// %edx = pcheck

+	movl	et_next(%esi),%esi	// %esi = pcheck->next

+	testl	%esi,%esi

+	jz		LInsertFound

+	cmpl	%eax,et_u(%esi)

+	jl		LFindInsertLoop

+LInsertFound:

+//		edge->next = pcheck->next;

+//		pcheck->next = edge;

+	movl	%esi,et_next(%edi)

+	movl	%edi,et_next(%edx)

+//	}

+LSetRemove:

+//	edge->nextremove = removeedges[v2];

+//	removeedges[v2] = edge;

+	movl	C(removeedges)(,%ecx,4),%eax

+	movl	%edi,C(removeedges)(,%ecx,4)

+	movl	%eax,et_nextremove(%edi)

+Ldone:

+	movl	Lstack,%esp			// clear temporary variables from stack

+	popl	%ebx				// restore register variables

+	popl	%edi

+	popl	%esi

+	ret

+// at least one point is clipped

+Lp2:

+	testl	%eax,%eax			// %eax still holds the bits of d0

+	jns		Lp1					// d0's sign bit is clear, so it was d1's that was set

+//			else

+//			{

+//			// point 0 is clipped

+//				if (d1 < 0)

+//				{

+	movl	Ld1,%eax

+	testl	%eax,%eax

+	jns		Lp3

+//				// both points are clipped

+//				// we do cache fully clipped edges

+//					if (!r_leftclipped)

+	movl	C(r_leftclipped),%eax

+	movl	C(r_pedge),%ecx		// NOTE(review): %ecx is not read on this path

+	testl	%eax,%eax

+	jnz		Ldone

+//						cacheoffset = FULLY_CLIPPED_CACHED | (r_framecount & FRAMECOUNT_MASK);

+	movl	C(r_framecount),%eax

+	andl	$(FRAMECOUNT_MASK),%eax

+	orl		$(FULLY_CLIPPED_CACHED),%eax

+	movl	%eax,C(cacheoffset)

+//					return;

+	jmp		Ldone

+//				}

+Lp1:

+//			// point 0 is unclipped

+//				if (d1 >= 0)

+//				{

+//				// both points are unclipped

+//					continue;

+//			// only point 1 is clipped

+//				f = d0 / (d0 - d1);

+	flds	Ld0						// d0

+	flds	Ld1						// d1 | d0

+	fsubr	%st(1),%st(0)			// d0-d1 | d0

+//			// we don't cache partially clipped edges

+	movl	$0x7FFFFFFF,C(cacheoffset)

+	fdivrp	%st(0),%st(1)			// f

+	subl	$(mv_size),%esp			// allocate space for clipvert

+//				clipvert.position[0] = pv0->position[0] +

+//						f * (pv1->position[0] - pv0->position[0]);

+//				clipvert.position[1] = pv0->position[1] +

+//						f * (pv1->position[1] - pv0->position[1]);

+//				clipvert.position[2] = pv0->position[2] +

+//						f * (pv1->position[2] - pv0->position[2]);

+	flds	mv_position+8(%edx)

+	fsubs	mv_position+8(%esi)

+	flds	mv_position+4(%edx)

+	fsubs	mv_position+4(%esi)

+	flds	mv_position+0(%edx)

+	fsubs	mv_position+0(%esi)		// 0 | 1 | 2

+// replace pv1 with the clip point

+	movl	%esp,%edx

+	movl	cp_leftedge(%ebx),%eax	// %al = leftedge; %ah is tested as rightedge at Ltestright

+	testb	%al,%al					// consumed by jz Ltestright; the FP ops in between keep EFLAGS

+	fmul	%st(3),%st(0)

+	fxch	%st(1)					// 1 | 0 | 2

+	fmul	%st(3),%st(0)

+	fxch	%st(2)					// 2 | 0 | 1

+	fmulp	%st(0),%st(3)			// 0 | 1 | 2

+	fadds	mv_position+0(%esi)

+	fxch	%st(1)					// 1 | 0 | 2

+	fadds	mv_position+4(%esi)

+	fxch	%st(2)					// 2 | 0 | 1

+	fadds	mv_position+8(%esi)

+	fxch	%st(1)					// 0 | 2 | 1

+	fstps	mv_position+0(%esp)		// 2 | 1

+	fstps	mv_position+8(%esp)		// 1

+	fstps	mv_position+4(%esp)

+//				if (clip->leftedge)

+//				{

+	jz		Ltestright				// ZF from the testb %al,%al above

+//					r_leftclipped = true;

+//					r_leftexit = clipvert;

+	movl	$1,C(r_leftclipped)

+	movl	mv_position+0(%esp),%eax

+	movl	%eax,C(r_leftexit)+mv_position+0

+	movl	mv_position+4(%esp),%eax

+	movl	%eax,C(r_leftexit)+mv_position+4

+	movl	mv_position+8(%esp),%eax

+	movl	%eax,C(r_leftexit)+mv_position+8

+	jmp		Lcontinue

+//				}

+Ltestright:

+//				else if (clip->rightedge)

+//				{

+	testb	%ah,%ah					// second byte of the dword loaded from cp_leftedge

+	jz		Lcontinue

+//					r_rightclipped = true;

+//					r_rightexit = clipvert;

+	movl	$1,C(r_rightclipped)

+	movl	mv_position+0(%esp),%eax

+	movl	%eax,C(r_rightexit)+mv_position+0

+	movl	mv_position+4(%esp),%eax

+	movl	%eax,C(r_rightexit)+mv_position+4

+	movl	mv_position+8(%esp),%eax

+	movl	%eax,C(r_rightexit)+mv_position+8

+//				}

+//

+//				R_ClipEdge (pv0, &clipvert, clip->next);

+//				return;

+//			}

+	jmp		Lcontinue				// loop on with %edx = clipvert instead of recursing

+//			}

+Lp3:

+//			// only point 0 is clipped

+//				r_lastvertvalid = false;

+	movl	$0,C(r_lastvertvalid)

+//				f = d0 / (d0 - d1);

+	flds	Ld0						// d0

+	flds	Ld1						// d1 | d0

+	fsubr	%st(1),%st(0)			// d0-d1 | d0

+//			// we don't cache partially clipped edges

+	movl	$0x7FFFFFFF,C(cacheoffset)

+	fdivrp	%st(0),%st(1)			// f

+	subl	$(mv_size),%esp			// allocate space for clipvert

+//				clipvert.position[0] = pv0->position[0] +

+//						f * (pv1->position[0] - pv0->position[0]);

+//				clipvert.position[1] = pv0->position[1] +

+//						f * (pv1->position[1] - pv0->position[1]);

+//				clipvert.position[2] = pv0->position[2] +

+//						f * (pv1->position[2] - pv0->position[2]);

+	flds	mv_position+8(%edx)

+	fsubs	mv_position+8(%esi)

+	flds	mv_position+4(%edx)

+	fsubs	mv_position+4(%esi)

+	flds	mv_position+0(%edx)

+	fsubs	mv_position+0(%esi)		// 0 | 1 | 2

+	movl	cp_leftedge(%ebx),%eax	// %al = leftedge; %ah is tested as rightedge at Ltestright2

+	testb	%al,%al					// consumed by jz Ltestright2; FP ops and movl keep EFLAGS

+	fmul	%st(3),%st(0)

+	fxch	%st(1)					// 1 | 0 | 2

+	fmul	%st(3),%st(0)

+	fxch	%st(2)					// 2 | 0 | 1

+	fmulp	%st(0),%st(3)			// 0 | 1 | 2

+	fadds	mv_position+0(%esi)

+	fxch	%st(1)					// 1 | 0 | 2

+	fadds	mv_position+4(%esi)

+	fxch	%st(2)					// 2 | 0 | 1

+	fadds	mv_position+8(%esi)

+	fxch	%st(1)					// 0 | 2 | 1

+	fstps	mv_position+0(%esp)		// 2 | 1

+	fstps	mv_position+8(%esp)		// 1

+	fstps	mv_position+4(%esp)

+// replace pv0 with the clip point

+	movl	%esp,%esi

+//				if (clip->leftedge)

+//				{

+	jz		Ltestright2				// ZF from the testb %al,%al above

+//					r_leftclipped = true;

+//					r_leftenter = clipvert;

+	movl	$1,C(r_leftclipped)

+	movl	mv_position+0(%esp),%eax

+	movl	%eax,C(r_leftenter)+mv_position+0

+	movl	mv_position+4(%esp),%eax

+	movl	%eax,C(r_leftenter)+mv_position+4

+	movl	mv_position+8(%esp),%eax

+	movl	%eax,C(r_leftenter)+mv_position+8

+	jmp		Lcontinue

+//				}

+Ltestright2:

+//				else if (clip->rightedge)

+//				{

+	testb	%ah,%ah					// second byte of the dword loaded from cp_leftedge

+	jz		Lcontinue

+//					r_rightclipped = true;

+//					r_rightenter = clipvert;

+	movl	$1,C(r_rightclipped)

+	movl	mv_position+0(%esp),%eax

+	movl	%eax,C(r_rightenter)+mv_position+0

+	movl	mv_position+4(%esp),%eax

+	movl	%eax,C(r_rightenter)+mv_position+4

+	movl	mv_position+8(%esp),%eax

+	movl	%eax,C(r_rightenter)+mv_position+8

+//				}

+	jmp		Lcontinue				// loop on with %esi = clipvert

+// %esi = vertex to transform and project (read through mv_position)

+// %edx preserved; %eax clobbered by fnstsw; returns v | lzi | u on the FP stack

+LTransformAndProject:

+//	// transform and project

+//		VectorSubtract (world, modelorg, local);

+	flds	mv_position+0(%esi)

+	fsubs	C(modelorg)+0

+	flds	mv_position+4(%esi)

+	fsubs	C(modelorg)+4

+	flds	mv_position+8(%esi)

+	fsubs	C(modelorg)+8

+	fxch	%st(2)				// local[0] | local[1] | local[2]

+//		TransformVector (local, transformed);

+//

+//		if (transformed[2] < NEAR_CLIP)

+//			transformed[2] = NEAR_CLIP;

+//

+//		lzi0 = 1.0 / transformed[2];

+	fld		%st(0)				// local[0] | local[0] | local[1] | local[2]

+	fmuls	C(vpn)+0			// zm0 | local[0] | local[1] | local[2]

+	fld		%st(1)				// local[0] | zm0 | local[0] | local[1] |

+								//  local[2]

+	fmuls	C(vright)+0			// xm0 | zm0 | local[0] | local[1] | local[2]

+	fxch	%st(2)				// local[0] | zm0 | xm0 | local[1] | local[2]

+	fmuls	C(vup)+0			// ym0 |  zm0 | xm0 | local[1] | local[2]

+	fld		%st(3)				// local[1] | ym0 |  zm0 | xm0 | local[1] |

+								//  local[2]

+	fmuls	C(vpn)+4			// zm1 | ym0 | zm0 | xm0 | local[1] |

+								//  local[2]

+	fld		%st(4)				// local[1] | zm1 | ym0 | zm0 | xm0 |

+								//  local[1] | local[2]

+	fmuls	C(vright)+4			// xm1 | zm1 | ym0 |  zm0 | xm0 |

+								//  local[1] | local[2]

+	fxch	%st(5)				// local[1] | zm1 | ym0 | zm0 | xm0 |

+								//  xm1 | local[2]

+	fmuls	C(vup)+4			// ym1 | zm1 | ym0 | zm0 | xm0 |

+								//  xm1 | local[2]

+	fxch	%st(1)				// zm1 | ym1 | ym0 | zm0 | xm0 |

+								//  xm1 | local[2]

+	faddp	%st(0),%st(3)		// ym1 | ym0 | zm2 | xm0 | xm1 | local[2]

+	fxch	%st(3)				// xm0 | ym0 | zm2 | ym1 | xm1 | local[2]

+	faddp	%st(0),%st(4)		// ym0 | zm2 | ym1 | xm2 | local[2]

+	faddp	%st(0),%st(2)		// zm2 | ym2 | xm2 | local[2]

+	fld		%st(3)				// local[2] | zm2 | ym2 | xm2 | local[2]

+	fmuls	C(vpn)+8			// zm3 | zm2 | ym2 | xm2 | local[2]

+	fld		%st(4)				// local[2] | zm3 | zm2 | ym2 | xm2 | local[2]

+	fmuls	C(vright)+8			// xm3 | zm3 | zm2 | ym2 | xm2 | local[2]

+	fxch	%st(5)				// local[2] | zm3 | zm2 | ym2 | xm2 | xm3

+	fmuls	C(vup)+8			// ym3 | zm3 | zm2 | ym2 | xm2 | xm3

+	fxch	%st(1)				// zm3 | ym3 | zm2 | ym2 | xm2 | xm3

+	faddp	%st(0),%st(2)		// ym3 | zm4 | ym2 | xm2 | xm3

+	fxch	%st(4)				// xm3 | zm4 | ym2 | xm2 | ym3

+	faddp	%st(0),%st(3)		// zm4 | ym2 | xm4 | ym3

+	fxch	%st(1)				// ym2 | zm4 | xm4 | ym3

+	faddp	%st(0),%st(3)		// zm4 | xm4 | ym4

+	fcoms	Lfp_near_clip

+	fnstsw	%ax

+	testb	$1,%ah				// C0 set => z < NEAR_CLIP

+	jz		LNoClip

+	fstp	%st(0)				// x | y

+	flds	Lfp_near_clip		// NEAR_CLIP | x | y

+LNoClip:

+	fdivrs	float_1				// lzi0 | x | y

+	fxch	%st(1)				// x | lzi0 | y

+//	// FIXME: build x/yscale into transform?

+//		scale = xscale * lzi0;

+//		u0 = (xcenter + scale*transformed[0]);

+	flds	C(xscale)			// xscale | x | lzi0 | y

+	fmul	%st(2),%st(0)		// scale | x | lzi0 | y

+	fmulp	%st(0),%st(1)		// scale*x | lzi0 | y

+	fadds	C(xcenter)			// u0 | lzi0 | y

+//		if (u0 < r_refdef.fvrectx_adj)

+//			u0 = r_refdef.fvrectx_adj;

+//		if (u0 > r_refdef.fvrectright_adj)

+//			u0 = r_refdef.fvrectright_adj;

+// FIXME: use integer compares of floats?

+	fcoms	C(r_refdef)+rd_fvrectx_adj

+	fnstsw	%ax

+	testb	$1,%ah				// C0 set => u0 < fvrectx_adj

+	jz		LClampP0

+	fstp	%st(0)

+	flds	C(r_refdef)+rd_fvrectx_adj

+LClampP0:

+	fcoms	C(r_refdef)+rd_fvrectright_adj

+	fnstsw	%ax

+	testb	$0x45,%ah			// C3, C2, C0 all clear => u0 > fvrectright_adj

+	jnz		LClampP1

+	fstp	%st(0)

+	flds	C(r_refdef)+rd_fvrectright_adj

+LClampP1:

+	fld		%st(1)				// lzi0 | u0 | lzi0 | y

+//		scale = yscale * lzi0;

+//		v0 = (ycenter - scale*transformed[1]);

+	fmuls	C(yscale)			// scale | u0 | lzi0 | y

+	fmulp	%st(0),%st(3)		// u0 | lzi0 | scale*y

+	fxch	%st(2)				// scale*y | lzi0 | u0

+	fsubrs	C(ycenter)			// v0 | lzi0 | u0

+//		if (v0 < r_refdef.fvrecty_adj)

+//			v0 = r_refdef.fvrecty_adj;

+//		if (v0 > r_refdef.fvrectbottom_adj)

+//			v0 = r_refdef.fvrectbottom_adj;

+// FIXME: use integer compares of floats?

+	fcoms	C(r_refdef)+rd_fvrecty_adj

+	fnstsw	%ax

+	testb	$1,%ah				// C0 set => v0 < fvrecty_adj

+	jz		LClampP2

+	fstp	%st(0)

+	flds	C(r_refdef)+rd_fvrecty_adj

+LClampP2:

+	fcoms	C(r_refdef)+rd_fvrectbottom_adj

+	fnstsw	%ax

+	testb	$0x45,%ah			// C3, C2, C0 all clear => v0 > fvrectbottom_adj

+	jnz		LClampP3

+	fstp	%st(0)

+	flds	C(r_refdef)+rd_fvrectbottom_adj

+LClampP3:

+	ret							// v0 | lzi0 | u0 left on the FP stack for the caller

+#endif	// id386

--- /dev/null

+++ b/u/r_edgea.s

@@ -1,0 +1,731 @@

+//

+// r_edgea.s

+// x86 assembly-language edge-processing code.

+//

+#include "asm_i386.h"

+#include "quakeasm.h"

+#include "asm_draw.h"

+#ifdef	id386

+	.data

+Ltemp:					.long	0			// integer scratch, read back with fildl

+float_1_div_0100000h:	.long	0x35800000	// 1.0/(float)0x100000

+float_point_999:		.single	0.999		// newzi * 0.999 = newzibottom

+float_1_point_001:		.single	1.001		// newzi * 1.001 = newzitop

+	.text

+//--------------------------------------------------------------------

+#define edgestoadd	4+8		// note odd stack offsets because of interleaving

+#define edgelist	8+12	// with pushes

+.globl C(R_EdgeCodeStart)

+C(R_EdgeCodeStart):				// NOTE(review): presumably marks the start of this file's code for a consumer elsewhere - confirm

+.globl C(R_InsertNewEdges)

+C(R_InsertNewEdges):			// inserts each edge of the edgestoadd chain into the doubly linked edgelist, ordered by u

+	pushl	%edi

+	pushl	%esi				// preserve register variables

+	movl	edgestoadd(%esp),%edx	// two pushes so far, hence 4+8

+	pushl	%ebx

+	movl	edgelist(%esp),%ecx		// three pushes now, hence 8+12

+LDoNextEdge:

+	movl	et_u(%edx),%eax		// %eax = u of the edge being inserted

+	movl	%edx,%edi			// %edi = edge being inserted

+LContinueSearch:				// unrolled 4x; the list cursor alternates between %ecx and %esi

+	movl	et_u(%ecx),%ebx

+	movl	et_next(%ecx),%esi

+	cmpl	%ebx,%eax

+	jle		LAddedge			// new u <= cursor u: insert before %ecx

+	movl	et_u(%esi),%ebx

+	movl	et_next(%esi),%ecx

+	cmpl	%ebx,%eax

+	jle		LAddedge2			// new u <= cursor u: insert before %esi

+	movl	et_u(%ecx),%ebx

+	movl	et_next(%ecx),%esi

+	cmpl	%ebx,%eax

+	jle		LAddedge

+	movl	et_u(%esi),%ebx

+	movl	et_next(%esi),%ecx

+	cmpl	%ebx,%eax

+	jg		LContinueSearch		// no end-of-list test: assumes a sentinel with a large enough u - TODO confirm

+LAddedge2:						// link %edi in front of %esi

+	movl	et_next(%edx),%edx	// %edx = next edge to add (read before et_next is overwritten)

+	movl	et_prev(%esi),%ebx

+	movl	%esi,et_next(%edi)

+	movl	%ebx,et_prev(%edi)

+	movl	%edi,et_next(%ebx)

+	movl	%edi,et_prev(%esi)

+	movl	%esi,%ecx			// next search resumes at the node just inserted before

+	cmpl	$0,%edx

+	jnz		LDoNextEdge

+	jmp		LDone

+	.align 4

+LAddedge:						// link %edi in front of %ecx; %ecx stays the resume point

+	movl	et_next(%edx),%edx	// %edx = next edge to add (read before et_next is overwritten)

+	movl	et_prev(%ecx),%ebx

+	movl	%ecx,et_next(%edi)

+	movl	%ebx,et_prev(%edi)

+	movl	%edi,et_next(%ebx)

+	movl	%edi,et_prev(%ecx)

+	cmpl	$0,%edx

+	jnz		LDoNextEdge

+LDone:

+	popl	%ebx				// restore register variables

+	popl	%esi

+	popl	%edi

+	ret

+//--------------------------------------------------------------------

+#define predge	4+4			// argument offset after the single push of %ebx

+.globl C(R_RemoveEdges)

+C(R_RemoveEdges):				// unlinks every edge on the predge->nextremove chain from its prev/next list

+	pushl	%ebx

+	movl	predge(%esp),%eax

+Lre_loop:						// unrolled 2x: first half handles %eax, second half %ebx

+	movl	et_next(%eax),%ecx

+	movl	et_nextremove(%eax),%ebx

+	movl	et_prev(%eax),%edx

+	testl	%ebx,%ebx			// flags survive the movl below and feed jz

+	movl	%edx,et_prev(%ecx)	// pedge->next->prev = pedge->prev

+	jz		Lre_done

+	movl	%ecx,et_next(%edx)	// pedge->prev->next = pedge->next

+	movl	et_next(%ebx),%ecx

+	movl	et_prev(%ebx),%edx

+	movl	et_nextremove(%ebx),%eax

+	movl	%edx,et_prev(%ecx)	// same unlink for the second edge

+	testl	%eax,%eax			// flags survive the movl below and feed jnz

+	movl	%ecx,et_next(%edx)

+	jnz		Lre_loop

+	popl	%ebx

+	ret

+Lre_done:

+	movl	%ecx,et_next(%edx)	// finish the unlink that the jz above skipped

+	popl	%ebx

+	ret

+//--------------------------------------------------------------------

+#define pedgelist	4+4		// note odd stack offset because of interleaving

+							// with pushes

+.globl C(R_StepActiveU)

+C(R_StepActiveU):				// adds u_step to each edge's u and moves edges back to keep the list ordered by u

+	pushl	%edi

+	movl	pedgelist(%esp),%edx	// %edx = current edge (one push so far, hence 4+4)

+	pushl	%esi				// preserve register variables

+	pushl	%ebx

+	movl	et_prev(%edx),%esi

+LNewEdge:

+	movl	et_u(%esi),%edi		// %edi = u of the edge before the current one

+LNextEdge:						// unrolled 4x; current edge alternates %edx/%esi, its u alternates %eax/%edi

+	movl	et_u(%edx),%eax

+	movl	et_u_step(%edx),%ebx

+	addl	%ebx,%eax			// u += u_step

+	movl	et_next(%edx),%esi

+	movl	%eax,et_u(%edx)

+	cmpl	%edi,%eax

+	jl		LPushBack			// stepped u < previous edge's u: out of order

+	movl	et_u(%esi),%edi

+	movl	et_u_step(%esi),%ebx

+	addl	%ebx,%edi

+	movl	et_next(%esi),%edx

+	movl	%edi,et_u(%esi)

+	cmpl	%eax,%edi

+	jl		LPushBack2

+	movl	et_u(%edx),%eax

+	movl	et_u_step(%edx),%ebx

+	addl	%ebx,%eax

+	movl	et_next(%edx),%esi

+	movl	%eax,et_u(%edx)

+	cmpl	%edi,%eax

+	jl		LPushBack

+	movl	et_u(%esi),%edi

+	movl	et_u_step(%esi),%ebx

+	addl	%ebx,%edi

+	movl	et_next(%esi),%edx

+	movl	%edi,et_u(%esi)

+	cmpl	%eax,%edi

+	jnl		LNextEdge

+LPushBack2:						// swap roles so that, as at LPushBack, %edx = edge, %eax = its u, %esi = its next

+	movl	%edx,%ebx

+	movl	%edi,%eax

+	movl	%esi,%edx

+	movl	%ebx,%esi

+LPushBack:

+// push it back to keep it sorted

+	movl	et_prev(%edx),%ecx

+	movl	et_next(%edx),%ebx

+// done if the -1 in edge_aftertail triggered this

+	cmpl	$(C(edge_aftertail)),%edx

+	jz		LUDone

+// pull the edge out of the edge list

+	movl	et_prev(%ecx),%edi	// %edi = edge->prev->prev, where the backward search starts

+	movl	%ecx,et_prev(%esi)	// edge->next->prev = edge->prev

+	movl	%ebx,et_next(%ecx)	// edge->prev->next = edge->next

+// find out where the edge goes in the edge list

+LPushBackLoop:					// unrolled 2x; candidate alternates %edi/%ecx

+	movl	et_prev(%edi),%ecx

+	movl	et_u(%edi),%ebx

+	cmpl	%ebx,%eax

+	jnl		LPushBackFound		// edge u >= candidate u: insert after %edi

+	movl	et_prev(%ecx),%edi

+	movl	et_u(%ecx),%ebx

+	cmpl	%ebx,%eax

+	jl		LPushBackLoop

+	movl	%ecx,%edi			// found on the second half: make %edi the insertion point

+// put the edge back into the edge list

+LPushBackFound:

+	movl	et_next(%edi),%ebx

+	movl	%edi,et_prev(%edx)

+	movl	%ebx,et_next(%edx)

+	movl	%edx,et_next(%edi)

+	movl	%edx,et_prev(%ebx)

+	movl	%esi,%edx			// continue with the edge that followed the moved one

+	movl	et_prev(%esi),%esi

+	cmpl	$(C(edge_tail)),%edx

+	jnz		LNewEdge

+LUDone:

+	popl	%ebx				// restore register variables

+	popl	%esi

+	popl	%edi

+	ret

+//--------------------------------------------------------------------

+#define surf	4		// note this is loaded before any pushes (TrailingEdge itself reads no stack arguments)

+	.align 4

+TrailingEdge:					// in: %esi = surf, %ebx = edge, %ebp = span_p (set up by R_GenerateSpans)

+	movl	st_spanstate(%esi),%eax	// check for edge inversion

+	decl	%eax

+	jnz		LInverted			// count did not reach 0: only store it back

+	movl	%eax,st_spanstate(%esi)	// spanstate = 0

+	movl	st_insubmodel(%esi),%ecx

+	movl	0x12345678,%edx		// surfaces[1].st_next

+LPatch0:						// NOTE(review): 0x12345678 looks like a placeholder, presumably patched at run time via this label - confirm

+	movl	C(r_bmodelactive),%eax

+	subl	%ecx,%eax			// r_bmodelactive -= surf->insubmodel

+	cmpl	%esi,%edx			// is surf the one at surfaces[1].next? (flags survive the movl below)

+	movl	%eax,C(r_bmodelactive)

+	jnz		LNoEmit				// surface isn't on top, just remove

+// emit a span (current top going away)

+	movl	et_u(%ebx),%eax

+	shrl	$20,%eax				// iu = integral pixel u

+	movl	st_last_u(%esi),%edx

+	movl	st_next(%esi),%ecx

+	cmpl	%edx,%eax

+	jle		LNoEmit2				// iu <= surf->last_u, so nothing to emit

+	movl	%eax,st_last_u(%ecx)	// surf->next->last_u = iu;

+	subl	%edx,%eax

+	movl	%edx,espan_t_u(%ebp)		// span->u = surf->last_u;

+	movl	%eax,espan_t_count(%ebp)	// span->count = iu - span->u;

+	movl	C(current_iv),%eax

+	movl	%eax,espan_t_v(%ebp)		// span->v = current_iv;

+	movl	st_spans(%esi),%eax

+	movl	%eax,espan_t_pnext(%ebp)	// span->pnext = surf->spans;

+	movl	%ebp,st_spans(%esi)			// surf->spans = span;

+	addl	$(espan_t_size),%ebp		// span_p++

+	movl	st_next(%esi),%edx		// remove the surface from the surface

+	movl	st_prev(%esi),%esi		// stack

+	movl	%edx,st_next(%esi)

+	movl	%esi,st_prev(%edx)

+	ret

+LNoEmit2:

+	movl	%eax,st_last_u(%ecx)	// surf->next->last_u = iu;

+	movl	st_next(%esi),%edx		// remove the surface from the surface

+	movl	st_prev(%esi),%esi		// stack

+	movl	%edx,st_next(%esi)

+	movl	%esi,st_prev(%edx)

+	ret

+LNoEmit:

+	movl	st_next(%esi),%edx		// remove the surface from the surface

+	movl	st_prev(%esi),%esi		// stack

+	movl	%edx,st_next(%esi)

+	movl	%esi,st_prev(%edx)

+	ret

+LInverted:

+	movl	%eax,st_spanstate(%esi)	// store the decremented spanstate; the surface stays in the stack

+	ret

+//--------------------------------------------------------------------

+// trailing edge only

+Lgs_trailing:

+	pushl	$Lgs_nextedge

+	jmp		TrailingEdge

+.globl C(R_GenerateSpans)

+C(R_GenerateSpans):

+	pushl	%ebp				// preserve caller's stack frame

+	pushl	%edi

+	pushl	%esi				// preserve register variables

+	pushl	%ebx

+// clear active surfaces to just the background surface

+	movl	C(surfaces),%eax

+	movl	C(edge_head_u_shift20),%edx

+	addl	$(st_size),%eax

+// %ebp = span_p throughout

+	movl	C(span_p),%ebp

+	movl	$0,C(r_bmodelactive)

+	movl	%eax,st_next(%eax)

+	movl	%eax,st_prev(%eax)

+	movl	%edx,st_last_u(%eax)

+	movl	C(edge_head)+et_next,%ebx		// edge=edge_head.next

+// generate spans

+	cmpl	$(C(edge_tail)),%ebx		// done if empty list

+	jz		Lgs_lastspan

+Lgs_edgeloop:

+	movl	et_surfs(%ebx),%edi

+	movl	C(surfaces),%eax

+	movl	%edi,%esi

+	andl	$0xFFFF0000,%edi

+	andl	$0xFFFF,%esi

+	jz		Lgs_leading		// not a trailing edge

+// it has a left surface, so a surface is going away for this span

+	shll	$(SURF_T_SHIFT),%esi

+	addl	%eax,%esi

+	testl	%edi,%edi

+	jz		Lgs_trailing

+// both leading and trailing

+	call	TrailingEdge

+	movl	C(surfaces),%eax

+// ---------------------------------------------------------------

+// handle a leading edge

+// ---------------------------------------------------------------

+Lgs_leading:

+	shrl	$16-SURF_T_SHIFT,%edi

+	movl	C(surfaces),%eax

+	addl	%eax,%edi

+	movl	0x12345678,%esi		// surf2 = surfaces[1].next;

+LPatch2:

+	movl	st_spanstate(%edi),%edx

+	movl	st_insubmodel(%edi),%eax

+	testl	%eax,%eax

+	jnz		Lbmodel_leading

+// handle a leading non-bmodel edge

+// don't start a span if this is an inverted span, with the end edge preceding

+// the start edge (that is, we've already seen the end edge)

+	testl	%edx,%edx

+	jnz		Lxl_done

+// if (surf->key < surf2->key)

+//		goto newtop;

+	incl	%edx

+	movl	st_key(%edi),%eax

+	movl	%edx,st_spanstate(%edi)

+	movl	st_key(%esi),%ecx

+	cmpl	%ecx,%eax

+	jl		Lnewtop

+// main sorting loop to search through surface stack until insertion point

+// found. Always terminates because background surface is sentinel

+// do

+// {

+// 		surf2 = surf2->next;

+// } while (surf->key >= surf2->key);

+Lsortloopnb:

+	movl	st_next(%esi),%esi

+	movl	st_key(%esi),%ecx

+	cmpl	%ecx,%eax

+	jge		Lsortloopnb

+	jmp		LInsertAndExit

+// handle a leading bmodel edge

+	.align	4

+Lbmodel_leading:

+// don't start a span if this is an inverted span, with the end edge preceding

+// the start edge (that is, we've already seen the end edge)

+	testl	%edx,%edx

+	jnz		Lxl_done

+	movl	C(r_bmodelactive),%ecx

+	incl	%edx

+	incl	%ecx

+	movl	%edx,st_spanstate(%edi)

+	movl	%ecx,C(r_bmodelactive)

+// if (surf->key < surf2->key)

+//		goto newtop;

+	movl	st_key(%edi),%eax

+	movl	st_key(%esi),%ecx

+	cmpl	%ecx,%eax

+	jl		Lnewtop

+// if ((surf->key == surf2->key) && surf->insubmodel)

+// {

+	jz		Lzcheck_for_newtop

+// main sorting loop to search through surface stack until insertion point

+// found. Always terminates because background surface is sentinel

+// do

+// {

+// 		surf2 = surf2->next;

+// } while (surf->key > surf2->key);

+Lsortloop:

+	movl	st_next(%esi),%esi

+	movl	st_key(%esi),%ecx

+	cmpl	%ecx,%eax

+	jg		Lsortloop

+	jne		LInsertAndExit

+// Do 1/z sorting to see if we've arrived in the right position

+	movl	et_u(%ebx),%eax

+	subl	$0xFFFFF,%eax

+	movl	%eax,Ltemp

+	fildl	Ltemp

+	fmuls	float_1_div_0100000h // fu = (float)(edge->u - 0xFFFFF) *

+								//      (1.0 / 0x100000);

+	fld		%st(0)				// fu | fu

+	fmuls	st_d_zistepu(%edi)	// fu*surf->d_zistepu | fu

+	flds	C(fv)					// fv | fu*surf->d_zistepu | fu

+	fmuls	st_d_zistepv(%edi)	// fv*surf->d_zistepv | fu*surf->d_zistepu | fu

+	fxch	%st(1)				// fu*surf->d_zistepu | fv*surf->d_zistepv | fu

+	fadds	st_d_ziorigin(%edi)	// fu*surf->d_zistepu + surf->d_ziorigin |

+								//  fv*surf->d_zistepv | fu

+	flds	st_d_zistepu(%esi)	// surf2->d_zistepu |

+								//  fu*surf->d_zistepu + surf->d_ziorigin |

+								//  fv*surf->d_zistepv | fu

+	fmul	%st(3),%st(0)		// fu*surf2->d_zistepu |

+								//  fu*surf->d_zistepu + surf->d_ziorigin |

+								//  fv*surf->d_zistepv | fu

+	fxch	%st(1)				// fu*surf->d_zistepu + surf->d_ziorigin |

+								//  fu*surf2->d_zistepu |

+								//  fv*surf->d_zistepv | fu

+	faddp	%st(0),%st(2)		// fu*surf2->d_zistepu | newzi | fu

+	flds	C(fv)					// fv | fu*surf2->d_zistepu | newzi | fu

+	fmuls	st_d_zistepv(%esi)	// fv*surf2->d_zistepv |

+								//  fu*surf2->d_zistepu | newzi | fu

+	fld		%st(2)				// newzi | fv*surf2->d_zistepv |

+								//  fu*surf2->d_zistepu | newzi | fu

+	fmuls	float_point_999		// newzibottom | fv*surf2->d_zistepv |

+								//  fu*surf2->d_zistepu | newzi | fu

+	fxch	%st(2)				// fu*surf2->d_zistepu | fv*surf2->d_zistepv |

+								//  newzibottom | newzi | fu

+	fadds	st_d_ziorigin(%esi)	// fu*surf2->d_zistepu + surf2->d_ziorigin |

+								//  fv*surf2->d_zistepv | newzibottom | newzi |

+								//  fu

+	faddp	%st(0),%st(1)		// testzi | newzibottom | newzi | fu

+	fxch	%st(1)				// newzibottom | testzi | newzi | fu

+// if (newzibottom >= testzi)

+//     goto Lgotposition;

+	fcomp	%st(1)				// testzi | newzi | fu

+	fxch	%st(1)				// newzi | testzi | fu

+	fmuls	float_1_point_001	// newzitop | testzi | fu

+	fxch	%st(1)				// testzi | newzitop | fu

+	fnstsw	%ax

+	testb	$0x01,%ah

+	jz		Lgotposition_fpop3

+// if (newzitop >= testzi)

+// {

+	fcomp	%st(1)				// newzitop | fu

+	fnstsw	%ax

+	testb	$0x45,%ah

+	jz		Lsortloop_fpop2

+// if (surf->d_zistepu >= surf2->d_zistepu)

+//     goto newtop;

+	flds	st_d_zistepu(%edi)	// surf->d_zistepu | newzitop| fu

+	fcomps	st_d_zistepu(%esi)	// newzitop | fu

+	fnstsw	%ax

+	testb	$0x01,%ah

+	jz		Lgotposition_fpop2

+	fstp	%st(0)				// clear the FPstack

+	fstp	%st(0)

+	movl	st_key(%edi),%eax

+	jmp		Lsortloop

+Lgotposition_fpop3:

+	fstp	%st(0)

+Lgotposition_fpop2:

+	fstp	%st(0)

+	fstp	%st(0)

+	jmp		LInsertAndExit

+// emit a span (obscures current top)

+Lnewtop_fpop3:

+	fstp	%st(0)

+Lnewtop_fpop2:

+	fstp	%st(0)

+	fstp	%st(0)

+	movl	st_key(%edi),%eax		// reload the sorting key

+Lnewtop:

+	movl	et_u(%ebx),%eax

+	movl	st_last_u(%esi),%edx

+	shrl	$20,%eax				// iu = integral pixel u

+	movl	%eax,st_last_u(%edi)	// surf->last_u = iu;

+	cmpl	%edx,%eax

+	jle		LInsertAndExit			// iu <= surf->last_u, so nothing to emit

+	subl	%edx,%eax

+	movl	%edx,espan_t_u(%ebp)		// span->u = surf->last_u;

+	movl	%eax,espan_t_count(%ebp)	// span->count = iu - span->u;

+	movl	C(current_iv),%eax

+	movl	%eax,espan_t_v(%ebp)		// span->v = current_iv;

+	movl	st_spans(%esi),%eax

+	movl	%eax,espan_t_pnext(%ebp)	// span->pnext = surf->spans;

+	movl	%ebp,st_spans(%esi)			// surf->spans = span;

+	addl	$(espan_t_size),%ebp

+LInsertAndExit:

+// insert before surf2

+	movl	%esi,st_next(%edi)		// surf->next = surf2;

+	movl	st_prev(%esi),%eax

+	movl	%eax,st_prev(%edi)		// surf->prev = surf2->prev;

+	movl	%edi,st_prev(%esi)		// surf2->prev = surf;

+	movl	%edi,st_next(%eax)		// surf2->prev->next = surf;

+// ---------------------------------------------------------------

+// leading edge done

+// ---------------------------------------------------------------

+// ---------------------------------------------------------------

+// see if there are any more edges

+// ---------------------------------------------------------------

+Lgs_nextedge:

+	movl	et_next(%ebx),%ebx

+	cmpl	$(C(edge_tail)),%ebx

+	jnz		Lgs_edgeloop

+// clean up at the right edge

+Lgs_lastspan:

+// now that we've reached the right edge of the screen, we're done with any

+// unfinished surfaces, so emit a span for whatever's on top

+	movl	0x12345678,%esi		// surfaces[1].st_next

+LPatch3:

+	movl	C(edge_tail_u_shift20),%eax

+	xorl	%ecx,%ecx

+	movl	st_last_u(%esi),%edx

+	subl	%edx,%eax

+	jle		Lgs_resetspanstate

+	movl	%edx,espan_t_u(%ebp)

+	movl	%eax,espan_t_count(%ebp)

+	movl	C(current_iv),%eax

+	movl	%eax,espan_t_v(%ebp)

+	movl	st_spans(%esi),%eax

+	movl	%eax,espan_t_pnext(%ebp)

+	movl	%ebp,st_spans(%esi)

+	addl	$(espan_t_size),%ebp

+// reset spanstate for all surfaces in the surface stack

+Lgs_resetspanstate:

+	movl	%ecx,st_spanstate(%esi)

+	movl	st_next(%esi),%esi

+	cmpl	$0x12345678,%esi		// &surfaces[1]

+LPatch4:

+	jnz		Lgs_resetspanstate

+// store the final span_p

+	movl	%ebp,C(span_p)

+	popl	%ebx				// restore register variables

+	popl	%esi

+	popl	%edi

+	popl	%ebp				// restore the caller's stack frame

+	ret

+// ---------------------------------------------------------------

+// 1/z sorting for bmodels in the same leaf

+// ---------------------------------------------------------------

+	.align	4

+Lxl_done:

+	incl	%edx

+	movl	%edx,st_spanstate(%edi)

+	jmp		Lgs_nextedge

+	.align	4

+Lzcheck_for_newtop:

+	movl	et_u(%ebx),%eax

+	subl	$0xFFFFF,%eax

+	movl	%eax,Ltemp

+	fildl	Ltemp

+	fmuls	float_1_div_0100000h // fu = (float)(edge->u - 0xFFFFF) *

+								//      (1.0 / 0x100000);

+	fld		%st(0)				// fu | fu

+	fmuls	st_d_zistepu(%edi)	// fu*surf->d_zistepu | fu

+	flds	C(fv)				// fv | fu*surf->d_zistepu | fu

+	fmuls	st_d_zistepv(%edi)	// fv*surf->d_zistepv | fu*surf->d_zistepu | fu

+	fxch	%st(1)				// fu*surf->d_zistepu | fv*surf->d_zistepv | fu

+	fadds	st_d_ziorigin(%edi)	// fu*surf->d_zistepu + surf->d_ziorigin |

+								//  fv*surf->d_zistepv | fu

+	flds	st_d_zistepu(%esi)	// surf2->d_zistepu |

+								//  fu*surf->d_zistepu + surf->d_ziorigin |

+								//  fv*surf->d_zistepv | fu

+	fmul	%st(3),%st(0)		// fu*surf2->d_zistepu |

+								//  fu*surf->d_zistepu + surf->d_ziorigin |

+								//  fv*surf->d_zistepv | fu

+	fxch	%st(1)				// fu*surf->d_zistepu + surf->d_ziorigin |

+								//  fu*surf2->d_zistepu |

+								//  fv*surf->d_zistepv | fu

+	faddp	%st(0),%st(2)		// fu*surf2->d_zistepu | newzi | fu

+	flds	C(fv)				// fv | fu*surf2->d_zistepu | newzi | fu

+	fmuls	st_d_zistepv(%esi)	// fv*surf2->d_zistepv |

+								//  fu*surf2->d_zistepu | newzi | fu

+	fld		%st(2)				// newzi | fv*surf2->d_zistepv |

+								//  fu*surf2->d_zistepu | newzi | fu

+	fmuls	float_point_999		// newzibottom | fv*surf2->d_zistepv |

+								//  fu*surf2->d_zistepu | newzi | fu

+	fxch	%st(2)				// fu*surf2->d_zistepu | fv*surf2->d_zistepv |

+								//  newzibottom | newzi | fu

+	fadds	st_d_ziorigin(%esi)	// fu*surf2->d_zistepu + surf2->d_ziorigin |

+								//  fv*surf2->d_zistepv | newzibottom | newzi |

+								//  fu

+	faddp	%st(0),%st(1)		// testzi | newzibottom | newzi | fu

+	fxch	%st(1)				// newzibottom | testzi | newzi | fu

+// if (newzibottom >= testzi)

+//     goto newtop;

+	fcomp	%st(1)				// testzi | newzi | fu

+	fxch	%st(1)				// newzi | testzi | fu

+	fmuls	float_1_point_001	// newzitop | testzi | fu

+	fxch	%st(1)				// testzi | newzitop | fu

+	fnstsw	%ax

+	testb	$0x01,%ah

+	jz		Lnewtop_fpop3

+// if (newzitop >= testzi)

+// {

+	fcomp	%st(1)				// newzitop | fu

+	fnstsw	%ax

+	testb	$0x45,%ah

+	jz		Lsortloop_fpop2

+// if (surf->d_zistepu >= surf2->d_zistepu)

+//     goto newtop;

+	flds	st_d_zistepu(%edi)	// surf->d_zistepu | newzitop | fu

+	fcomps	st_d_zistepu(%esi)	// newzitop | fu

+	fnstsw	%ax

+	testb	$0x01,%ah

+	jz		Lnewtop_fpop2

+Lsortloop_fpop2:

+	fstp	%st(0)				// clear the FP stack

+	fstp	%st(0)

+	movl	st_key(%edi),%eax

+	jmp		Lsortloop

+.globl C(R_EdgeCodeEnd)

+C(R_EdgeCodeEnd):

+//----------------------------------------------------------------------

+// Surface array address code patching routine

+//----------------------------------------------------------------------

+	.align 4

+.globl C(R_SurfacePatch)

+C(R_SurfacePatch):	// rewrites the 0x12345678 placeholder operands with addresses derived from surfaces

+	movl	C(surfaces),%eax	// eax = current value of the surfaces pointer

+	addl	$(st_size),%eax	// eax = &surfaces[1]

+	movl	%eax,LPatch4-4	// the 4 bytes ending at LPatch4 are the immediate of the preceding cmpl

+	addl	$(st_next),%eax	// eax = &surfaces[1].st_next

+	movl	%eax,LPatch0-4	// same address is patched into the instruction ending at each of these labels

+	movl	%eax,LPatch2-4

+	movl	%eax,LPatch3-4	// absolute-address operand of the movl that precedes LPatch3

+	ret

+#endif	// id386

--- /dev/null

+++ b/u/r_varsa.s

@@ -1,0 +1,45 @@

+//

+// r_varsa.s

+//

+#include "asm_i386.h"

+#include "quakeasm.h"

+#include "asm_draw.h"

+#include "d_ifacea.h"

+#ifdef id386

+	.data

+//-------------------------------------------------------

+// ASM-only variables

+//-------------------------------------------------------

+.globl	float_1, float_particle_z_clip, float_point5

+.globl	float_minus_1, float_0

+float_0:		.single	0.0

+float_1:		.single	1.0

+float_minus_1:	.single	-1.0

+float_particle_z_clip:	.single	PARTICLE_Z_CLIP	// macro is not defined in this file; presumably from an included header

+float_point5:	.single	0.5

+.globl	fp_16, fp_64k, fp_1m, fp_64kx64k

+.globl	fp_1m_minus_1

+.globl	fp_8

+fp_1m:			.single	1048576.0	// 2^20

+fp_1m_minus_1:	.single	1048575.0	// 2^20 - 1

+fp_64k:			.single	65536.0	// 2^16

+fp_8:			.single	8.0

+fp_16:			.single	16.0

+fp_64kx64k:		.long	0x4f000000	// (float)0x8000*0x10000, stored as a raw IEEE-754 single bit pattern (2^31)

+.globl	FloatZero, Float2ToThe31nd, FloatMinus2ToThe31nd

+FloatZero:				.long	0	// bit pattern of 0.0 as a single

+Float2ToThe31nd:		.long	0x4f000000	// bit pattern of 2^31 as a single

+FloatMinus2ToThe31nd:	.long	0xcf000000	// bit pattern of -(2^31) as a single

+.globl	C(r_bmodelactive)

+C(r_bmodelactive):	.long	0

+#endif	// id386

--- /dev/null

+++ b/u/snd_mixa.s

@@ -1,0 +1,199 @@

+//

+// snd_mixa.s

+// x86 assembly-language sound code

+//

+#include "asm_i386.h"

+#include "quakeasm.h"

+#ifdef	id386

+	.text

+//----------------------------------------------------------------------

+// 8-bit sound-mixing code

+//----------------------------------------------------------------------

+#define ch		4+16	// stack offset of arg 1 after the four pushes below (4 bytes each)

+#define sc		8+16	// stack offset of arg 2 after the four pushes below

+#define count	12+16	// stack offset of arg 3 after the four pushes below

+.globl C(SND_PaintChannelFrom8)

+C(SND_PaintChannelFrom8):	// mixes count 8-bit samples into paintbuffer, walking from the last sample to the first

+	pushl	%esi				// preserve register variables

+	pushl	%edi

+	pushl	%ebx

+	pushl	%ebp

+//	int 	data;

+//	short	*lscale, *rscale;

+//	unsigned char *sfx;

+//	int		i;

+	movl	ch(%esp),%ebx	// ebx = ch

+	movl	sc(%esp),%esi	// esi = sc

+//	if (ch->leftvol > 255)

+//		ch->leftvol = 255;

+//	if (ch->rightvol > 255)

+//		ch->rightvol = 255;

+	movl	ch_leftvol(%ebx),%eax

+	movl	ch_rightvol(%ebx),%edx

+	cmpl	$255,%eax

+	jna		LLeftSet	// unsigned compare; the clamped value stays in %eax and is not stored back to ch

+	movl	$255,%eax

+LLeftSet:

+	cmpl	$255,%edx

+	jna		LRightSet	// unsigned compare, as for the left volume

+	movl	$255,%edx

+LRightSet:

+//	lscale = snd_scaletable[ch->leftvol >> 3];

+//	rscale = snd_scaletable[ch->rightvol >> 3];

+//	sfx = (signed char *)sc->data + ch->pos;

+//	ch->pos += count;

+	andl	$0xF8,%eax	// clear the low 3 bits; with the shll $7 below this gives (vol >> 3) * 1024

+	addl	$(sfxc_data),%esi	// esi = address of the data member inside sc

+	andl	$0xF8,%edx

+	movl	ch_pos(%ebx),%edi

+	movl	count(%esp),%ecx	// ecx = count, used as the descending sample index

+	addl	%edi,%esi	// esi = sfx = sc->data + ch->pos

+	shll	$7,%eax	// byte offset of the left scale-table row (1024 bytes per row)

+	addl	%ecx,%edi

+	shll	$7,%edx	// byte offset of the right scale-table row

+	movl	%edi,ch_pos(%ebx)	// ch->pos += count

+	addl	$(C(snd_scaletable)),%eax	// eax = lscale

+	addl	$(C(snd_scaletable)),%edx	// edx = rscale

+	subl	%ebx,%ebx	// zero ebx so that %bl alone can be used as a table index

+	movb	-1(%esi,%ecx,1),%bl	// bl = sfx[count-1], the last sample

+	testl	$1,%ecx

+	jz		LMix8Loop	// even count: go straight to the two-samples-per-pass loop

+	movl	(%eax,%ebx,4),%edi	// odd count: mix the last sample on its own first

+	movl	(%edx,%ebx,4),%ebp

+	addl	C(paintbuffer)+psp_left-psp_size(,%ecx,psp_size),%edi

+	addl	C(paintbuffer)+psp_right-psp_size(,%ecx,psp_size),%ebp

+	movl	%edi,C(paintbuffer)+psp_left-psp_size(,%ecx,psp_size)

+	movl	%ebp,C(paintbuffer)+psp_right-psp_size(,%ecx,psp_size)

+	movb	-2(%esi,%ecx,1),%bl	// preload the next sample for the loop

+	decl	%ecx

+	jz		LDone	// count was 1: nothing left to mix

+//	for (i=0 ; i<count ; i++)

+//	{

+LMix8Loop:

+//		data = sfx[i];

+//		paintbuffer[i].left += lscale[data];

+//		paintbuffer[i].right += rscale[data];

+	movl	(%eax,%ebx,4),%edi	// scaled left value for the sample already in %bl

+	movl	(%edx,%ebx,4),%ebp	// scaled right value for the same sample

+	addl	C(paintbuffer)+psp_left-psp_size(,%ecx,psp_size),%edi	// paintbuffer[ecx-1].left

+	addl	C(paintbuffer)+psp_right-psp_size(,%ecx,psp_size),%ebp	// paintbuffer[ecx-1].right

+	movb	-2(%esi,%ecx,1),%bl	// bl = sfx[ecx-2]

+	movl	%edi,C(paintbuffer)+psp_left-psp_size(,%ecx,psp_size)

+	movl	%ebp,C(paintbuffer)+psp_right-psp_size(,%ecx,psp_size)

+	movl	(%eax,%ebx,4),%edi

+	movl	(%edx,%ebx,4),%ebp

+	movb	-3(%esi,%ecx,1),%bl	// preload sfx[ecx-3] for the next pass; on the last pass this reads one byte before sfx

+	addl	C(paintbuffer)+psp_left-psp_size*2(,%ecx,psp_size),%edi	// paintbuffer[ecx-2].left

+	addl	C(paintbuffer)+psp_right-psp_size*2(,%ecx,psp_size),%ebp	// paintbuffer[ecx-2].right

+	movl	%edi,C(paintbuffer)+psp_left-psp_size*2(,%ecx,psp_size)

+	movl	%ebp,C(paintbuffer)+psp_right-psp_size*2(,%ecx,psp_size)

+//	}

+	subl	$2,%ecx	// two samples consumed per pass

+	jnz		LMix8Loop

+LDone:

+	popl	%ebp	// restore in reverse push order

+	popl	%ebx

+	popl	%edi

+	popl	%esi

+	ret

+//----------------------------------------------------------------------

+// Transfer of stereo buffer to 16-bit DMA buffer code

+//----------------------------------------------------------------------

+.globl C(Snd_WriteLinearBlastStereo16)

+C(Snd_WriteLinearBlastStereo16):	// scales, clamps and packs pairs of 32-bit samples from snd_p into 16-bit pairs at snd_out

+	pushl	%esi				// preserve register variables

+	pushl	%edi

+	pushl	%ebx

+//	int		i;

+//	int		val;

+	movl	C(snd_linear_count),%ecx	// ecx = descending sample index, stepped by 2

+	movl	C(snd_p),%ebx	// ebx = source pointer (32-bit samples)

+	movl	C(snd_vol),%esi	// esi = volume multiplier

+	movl	C(snd_out),%edi	// edi = destination pointer (16-bit samples)

+//	for (i=0 ; i<snd_linear_count ; i+=2)

+//	{

+LWLBLoopTop:	// the body runs before the count is tested, and from the end of the buffers toward the start

+//		val = (snd_p[i]*snd_vol)>>8;

+//		if (val > 0x7fff)

+//			snd_out[i] = 0x7fff;

+//		else if (val < (short)0x8000)

+//			snd_out[i] = (short)0x8000;

+//		else

+//			snd_out[i] = val;

+	movl	-8(%ebx,%ecx,4),%eax	// eax = snd_p[ecx-2]

+	imull	%esi,%eax

+	sarl	$8,%eax

+	cmpl	$0x7FFF,%eax

+	jg		LClampHigh

+	cmpl	$0xFFFF8000,%eax	// 0xFFFF8000 is -32768 as a signed 32-bit value

+	jnl		LClampDone

+	movl	$0xFFFF8000,%eax

+	jmp		LClampDone

+LClampHigh:

+	movl	$0x7FFF,%eax

+LClampDone:

+//		val = (snd_p[i+1]*snd_vol)>>8;

+//		if (val > 0x7fff)

+//			snd_out[i+1] = 0x7fff;

+//		else if (val < (short)0x8000)

+//			snd_out[i+1] = (short)0x8000;

+//		else

+//			snd_out[i+1] = val;

+	movl	-4(%ebx,%ecx,4),%edx	// edx = snd_p[ecx-1]

+	imull	%esi,%edx

+	sarl	$8,%edx

+	cmpl	$0x7FFF,%edx

+	jg		LClampHigh2

+	cmpl	$0xFFFF8000,%edx

+	jnl		LClampDone2

+	movl	$0xFFFF8000,%edx

+	jmp		LClampDone2

+LClampHigh2:

+	movl	$0x7FFF,%edx

+LClampDone2:

+	shll	$16,%edx	// second sample goes in the high 16 bits

+	andl	$0xFFFF,%eax	// first sample goes in the low 16 bits

+	orl		%eax,%edx

+	movl	%edx,-4(%edi,%ecx,2)	// one 32-bit store writes snd_out[ecx-2] and snd_out[ecx-1]

+//	}

+	subl	$2,%ecx

+	jnz		LWLBLoopTop

+//	snd_p += snd_linear_count;

+	popl	%ebx	// restore in reverse push order

+	popl	%edi

+	popl	%esi

+	ret

+#endif	// id386

--- /dev/null

+++ b/u/surf16.s

@@ -1,0 +1,153 @@

+//

+// surf16.s

+// x86 assembly-language 16 bpp surface block drawing code.

+//

+#include "asm_i386.h"

+#include "quakeasm.h"

+#include "asm_draw.h"

+#ifdef id386

+//----------------------------------------------------------------------

+// Surface block drawer

+//----------------------------------------------------------------------

+	.data

+k:			.long	0	// remaining-pass counter for R_DrawSurfaceBlock16 (initialised from blocksize)

+loopentry:	.long	0	// entry address selected from blockjumptable16, target of the indirect jmp

+	.align	4

+blockjumptable16:	// read at byte offset blocksize*2-4: sizes 2, 4, 8, 16 select slots 0, 1, 3, 7

+	.long	LEnter2_16

+	.long	LEnter4_16

+	.long	0, LEnter8_16

+	.long	0, 0, 0, LEnter16_16

+	.text

+	.align 4

+.globl C(R_Surf16Start)

+C(R_Surf16Start):

+	.align 4

+.globl C(R_DrawSurfaceBlock16)

+C(R_DrawSurfaceBlock16):	// takes no stack arguments; all inputs come from the C globals loaded below

+	pushl	%ebp				// preserve caller's stack frame

+	pushl	%edi

+	pushl	%esi				// preserve register variables

+	pushl	%ebx

+	movl	C(blocksize),%eax

+	movl	C(prowdestbase),%edi

+	movl	C(pbasesource),%esi

+	movl	C(sourcesstep),%ebx

+	movl	blockjumptable16-4(,%eax,2),%ecx	// table entry at byte offset blocksize*2-4

+	movl	%eax,k	// k = number of passes still to run

+	movl	%ecx,loopentry

+	movl	C(lightleft),%edx

+	movl	C(lightright),%ebp

+Lblockloop16:

+	subl	%edx,%ebp	// ebp = lightright - lightleft

+	movb	C(blockdivshift),%cl

+	sarl	%cl,%ebp	// arithmetic shift right by blockdivshift

+	jns		Lp1_16

+	testl	C(blockdivmask),%ebp	// NOTE(review): looks like a rounding adjustment for negative steps; blockdivmask is set elsewhere

+	jz		Lp1_16

+	incl	%ebp

+Lp1_16:

+	subl	%eax,%eax

+	subl	%ecx,%ecx	// high words must be 0 in loop for addressing

+	jmp		*loopentry	// entry labels are not defined in this file; presumably inside block16.h

+	.align	4

+#include "block16.h"

+	movl	C(pbasesource),%esi	// per-pass update: reload the globals and advance each by its step

+	movl	C(lightleft),%edx

+	movl	C(lightright),%ebp

+	movl	C(sourcetstep),%eax

+	movl	C(lightrightstep),%ecx

+	movl	C(prowdestbase),%edi

+	addl	%eax,%esi	// pbasesource += sourcetstep

+	addl	%ecx,%ebp	// lightright += lightrightstep

+	movl	C(lightleftstep),%eax

+	movl	C(surfrowbytes),%ecx

+	addl	%eax,%edx	// lightleft += lightleftstep

+	addl	%ecx,%edi	// prowdestbase += surfrowbytes

+	movl	%esi,C(pbasesource)

+	movl	%ebp,C(lightright)

+	movl	k,%eax

+	movl	%edx,C(lightleft)

+	decl	%eax	// sets ZF for the jnz below; the movl instructions in between leave flags untouched

+	movl	%edi,C(prowdestbase)

+	movl	%eax,k

+	jnz		Lblockloop16

+	popl	%ebx				// restore register variables

+	popl	%esi

+	popl	%edi

+	popl	%ebp				// restore the caller's stack frame

+	ret

+.globl C(R_Surf16End)

+C(R_Surf16End):

+//----------------------------------------------------------------------

+// Code patching routines

+//----------------------------------------------------------------------

+	.data

+	.align 4

+LPatchTable16:	// addresses of the 4-byte operand fields that end at each LBPatch label

+	.long	LBPatch0-4

+	.long	LBPatch1-4

+	.long	LBPatch2-4

+	.long	LBPatch3-4

+	.long	LBPatch4-4

+	.long	LBPatch5-4

+	.long	LBPatch6-4

+	.long	LBPatch7-4

+	.long	LBPatch8-4

+	.long	LBPatch9-4

+	.long	LBPatch10-4

+	.long	LBPatch11-4

+	.long	LBPatch12-4

+	.long	LBPatch13-4

+	.long	LBPatch14-4

+	.long	LBPatch15-4

+	.text

+	.align 4

+.globl C(R_Surf16Patch)

+C(R_Surf16Patch):	// stores the current colormap pointer into every location listed in LPatchTable16

+	pushl	%ebx

+	movl	C(colormap),%eax	// eax = value to patch in

+	movl	$LPatchTable16,%ebx	// ebx walks the table

+	movl	$16,%ecx	// must equal the number of LPatchTable16 entries

+LPatchLoop16:

+	movl	(%ebx),%edx	// edx = address of the next operand field

+	addl	$4,%ebx

+	movl	%eax,(%edx)	// overwrite the operand field in the code

+	decl	%ecx

+	jnz		LPatchLoop16

+	popl	%ebx

+	ret

+#endif	// id386

--- /dev/null

+++ b/u/surf8.s

@@ -1,0 +1,764 @@

+//

+// surf8.s

+// x86 assembly-language 8 bpp surface block drawing code.

+//

+#include "asm_i386.h"

+#include "quakeasm.h"

+#include "asm_draw.h"

+#ifdef	id386

+	.data

+sb_v:		.long	0

+	.text

+	.align 4

+.globl C(R_Surf8Start)

+C(R_Surf8Start):

+//----------------------------------------------------------------------

+// Surface block drawer for mip level 0

+//----------------------------------------------------------------------

+	.align 4

+.globl C(R_DrawSurfaceBlock8_mip0)

+C(R_DrawSurfaceBlock8_mip0):	// draws r_numvblocks blocks, each 16 rows of 16 bytes; inputs come from C globals

+	pushl	%ebp				// preserve caller's stack frame

+	pushl	%edi

+	pushl	%esi				// preserve register variables

+	pushl	%ebx

+//		for (v=0 ; v<numvblocks ; v++)

+//		{

+	movl	C(r_lightptr),%ebx

+	movl	C(r_numvblocks),%eax

+	movl	%eax,sb_v	// sb_v = blocks remaining

+	movl	C(prowdestbase),%edi	// edi = destination row pointer

+	movl	C(pbasesource),%esi	// esi = source texel pointer

+Lv_loop_mip0:

+//			lightleft = lightptr[0];

+//			lightright = lightptr[1];

+//			lightdelta = (lightleft - lightright) & 0xFFFFF;

+	movl	(%ebx),%eax			// lightleft

+	movl	4(%ebx),%edx		// lightright

+	movl	%eax,%ebp

+	movl	C(r_lightwidth),%ecx

+	movl	%edx,C(lightright)

+	subl	%edx,%ebp

+	andl	$0xFFFFF,%ebp

+	leal	(%ebx,%ecx,4),%ebx	// advance by r_lightwidth 4-byte entries

+//			lightptr += lightwidth;

+	movl	%ebx,C(r_lightptr)

+//			lightleftstep = (lightptr[0] - lightleft) >> blockdivshift;

+//			lightrightstep = (lightptr[1] - lightright) >> blockdivshift;

+//			lightdeltastep = ((lightleftstep - lightrightstep) & 0xFFFFF) |

+//					0xF0000000;

+	movl	4(%ebx),%ecx	// lightptr[1]

+	movl	(%ebx),%ebx		// lightptr[0]

+	subl	%eax,%ebx

+	subl	%edx,%ecx

+	sarl	$4,%ecx	// shift of 4 is hard-coded for this mip level

+	orl		$0xF0000000,%ebp	// top nibble 0xF is the row counter: 16 rows before the jc below falls through

+	sarl	$4,%ebx

+	movl	%ecx,C(lightrightstep)

+	subl	%ecx,%ebx

+	andl	$0xFFFFF,%ebx

+	orl		$0xF0000000,%ebx	// adding top nibble 0xF each row counts the lightdelta top nibble down by one

+	subl	%ecx,%ecx	// high word must be 0 in loop for addressing

+	movl	%ebx,C(lightdeltastep)

+	subl	%ebx,%ebx	// high word must be 0 in loop for addressing

+Lblockloop8_mip0:	// one row: texels 15..0 are looked up and stored as four 32-bit words

+	movl	%ebp,C(lightdelta)

+	movb	14(%esi),%cl

+	sarl	$4,%ebp	// per-texel light step for this row

+	movb	%dh,%bh	// index = (light byte from %dh << 8) | texel byte

+	movb	15(%esi),%bl

+	addl	%ebp,%edx

+	movb	%dh,%ch

+	addl	%ebp,%edx

+	movb	0x12345678(%ebx),%ah	// placeholder base address is overwritten by R_Surf8Patch

+LBPatch0:

+	movb	13(%esi),%bl

+	movb	0x12345678(%ecx),%al

+LBPatch1:

+	movb	12(%esi),%cl

+	movb	%dh,%bh

+	addl	%ebp,%edx

+	rorl	$16,%eax	// move the two finished bytes to the high half

+	movb	%dh,%ch

+	addl	%ebp,%edx

+	movb	0x12345678(%ebx),%ah

+LBPatch2:

+	movb	11(%esi),%bl

+	movb	0x12345678(%ecx),%al

+LBPatch3:

+	movb	10(%esi),%cl

+	movl	%eax,12(%edi)	// store bytes 12..15 of the row

+	movb	%dh,%bh

+	addl	%ebp,%edx

+	movb	%dh,%ch

+	addl	%ebp,%edx

+	movb	0x12345678(%ebx),%ah

+LBPatch4:

+	movb	9(%esi),%bl

+	movb	0x12345678(%ecx),%al

+LBPatch5:

+	movb	8(%esi),%cl

+	movb	%dh,%bh

+	addl	%ebp,%edx

+	rorl	$16,%eax

+	movb	%dh,%ch

+	addl	%ebp,%edx

+	movb	0x12345678(%ebx),%ah

+LBPatch6:

+	movb	7(%esi),%bl

+	movb	0x12345678(%ecx),%al

+LBPatch7:

+	movb	6(%esi),%cl

+	movl	%eax,8(%edi)	// store bytes 8..11 of the row

+	movb	%dh,%bh

+	addl	%ebp,%edx

+	movb	%dh,%ch

+	addl	%ebp,%edx

+	movb	0x12345678(%ebx),%ah

+LBPatch8:

+	movb	5(%esi),%bl

+	movb	0x12345678(%ecx),%al

+LBPatch9:

+	movb	4(%esi),%cl

+	movb	%dh,%bh

+	addl	%ebp,%edx

+	rorl	$16,%eax

+	movb	%dh,%ch

+	addl	%ebp,%edx

+	movb	0x12345678(%ebx),%ah

+LBPatch10:

+	movb	3(%esi),%bl

+	movb	0x12345678(%ecx),%al

+LBPatch11:

+	movb	2(%esi),%cl

+	movl	%eax,4(%edi)	// store bytes 4..7 of the row

+	movb	%dh,%bh

+	addl	%ebp,%edx

+	movb	%dh,%ch

+	addl	%ebp,%edx

+	movb	0x12345678(%ebx),%ah

+LBPatch12:

+	movb	1(%esi),%bl

+	movb	0x12345678(%ecx),%al

+LBPatch13:

+	movb	(%esi),%cl

+	movb	%dh,%bh

+	addl	%ebp,%edx

+	rorl	$16,%eax

+	movb	%dh,%ch

+	movb	0x12345678(%ebx),%ah

+LBPatch14:

+	movl	C(lightright),%edx	// reload the row-start light value

+	movb	0x12345678(%ecx),%al

+LBPatch15:

+	movl	C(lightdelta),%ebp

+	movl	%eax,(%edi)	// store bytes 0..3 of the row

+	addl	C(sourcetstep),%esi

+	addl	C(surfrowbytes),%edi

+	addl	C(lightrightstep),%edx

+	addl	C(lightdeltastep),%ebp	// carry out of this add is the loop condition

+	movl	%edx,C(lightright)	// movl leaves the carry flag intact for the jc

+	jc		Lblockloop8_mip0

+//			if (pbasesource >= r_sourcemax)

+//				pbasesource -= stepback;

+	cmpl	C(r_sourcemax),%esi

+	jb		LSkip_mip0	// unsigned compare on the pointer

+	subl	C(r_stepback),%esi

+LSkip_mip0:

+	movl	C(r_lightptr),%ebx

+	decl	sb_v

+	jnz		Lv_loop_mip0

+	popl	%ebx				// restore register variables

+	popl	%esi

+	popl	%edi

+	popl	%ebp				// restore the caller's stack frame

+	ret

+//----------------------------------------------------------------------

+// Surface block drawer for mip level 1

+//----------------------------------------------------------------------

+	.align 4

+.globl C(R_DrawSurfaceBlock8_mip1)

+C(R_DrawSurfaceBlock8_mip1):	// draws r_numvblocks blocks, each 8 rows of 8 bytes; inputs come from C globals

+	pushl	%ebp				// preserve caller's stack frame

+	pushl	%edi

+	pushl	%esi				// preserve register variables

+	pushl	%ebx

+//		for (v=0 ; v<numvblocks ; v++)

+//		{

+	movl	C(r_lightptr),%ebx

+	movl	C(r_numvblocks),%eax

+	movl	%eax,sb_v	// sb_v = blocks remaining

+	movl	C(prowdestbase),%edi	// edi = destination row pointer

+	movl	C(pbasesource),%esi	// esi = source texel pointer

+Lv_loop_mip1:

+//			lightleft = lightptr[0];

+//			lightright = lightptr[1];

+//			lightdelta = (lightleft - lightright) & 0xFFFFF;

+	movl	(%ebx),%eax			// lightleft

+	movl	4(%ebx),%edx		// lightright

+	movl	%eax,%ebp

+	movl	C(r_lightwidth),%ecx

+	movl	%edx,C(lightright)

+	subl	%edx,%ebp

+	andl	$0xFFFFF,%ebp

+	leal	(%ebx,%ecx,4),%ebx	// advance by r_lightwidth 4-byte entries

+//			lightptr += lightwidth;

+	movl	%ebx,C(r_lightptr)

+//			lightleftstep = (lightptr[0] - lightleft) >> blockdivshift;

+//			lightrightstep = (lightptr[1] - lightright) >> blockdivshift;

+//			lightdeltastep = ((lightleftstep - lightrightstep) & 0xFFFFF) |

+//					0xF0000000;

+	movl	4(%ebx),%ecx	// lightptr[1]

+	movl	(%ebx),%ebx		// lightptr[0]

+	subl	%eax,%ebx

+	subl	%edx,%ecx

+	sarl	$3,%ecx	// shift of 3 is hard-coded for this mip level

+	orl		$0x70000000,%ebp	// top nibble 0x7 is the row counter: 8 rows before the jc below falls through

+	sarl	$3,%ebx

+	movl	%ecx,C(lightrightstep)

+	subl	%ecx,%ebx

+	andl	$0xFFFFF,%ebx

+	orl		$0xF0000000,%ebx	// adding top nibble 0xF each row counts the lightdelta top nibble down by one

+	subl	%ecx,%ecx	// high word must be 0 in loop for addressing

+	movl	%ebx,C(lightdeltastep)

+	subl	%ebx,%ebx	// high word must be 0 in loop for addressing

+Lblockloop8_mip1:	// one row: texels 7..0 are looked up and stored as two 32-bit words

+	movl	%ebp,C(lightdelta)

+	movb	6(%esi),%cl

+	sarl	$3,%ebp	// per-texel light step for this row

+	movb	%dh,%bh	// index = (light byte from %dh << 8) | texel byte

+	movb	7(%esi),%bl

+	addl	%ebp,%edx

+	movb	%dh,%ch

+	addl	%ebp,%edx

+	movb	0x12345678(%ebx),%ah	// placeholder base address is overwritten by R_Surf8Patch

+LBPatch22:

+	movb	5(%esi),%bl

+	movb	0x12345678(%ecx),%al

+LBPatch23:

+	movb	4(%esi),%cl

+	movb	%dh,%bh

+	addl	%ebp,%edx

+	rorl	$16,%eax	// move the two finished bytes to the high half

+	movb	%dh,%ch

+	addl	%ebp,%edx

+	movb	0x12345678(%ebx),%ah

+LBPatch24:

+	movb	3(%esi),%bl

+	movb	0x12345678(%ecx),%al

+LBPatch25:

+	movb	2(%esi),%cl

+	movl	%eax,4(%edi)	// store bytes 4..7 of the row

+	movb	%dh,%bh

+	addl	%ebp,%edx

+	movb	%dh,%ch

+	addl	%ebp,%edx

+	movb	0x12345678(%ebx),%ah

+LBPatch26:

+	movb	1(%esi),%bl

+	movb	0x12345678(%ecx),%al

+LBPatch27:

+	movb	(%esi),%cl

+	movb	%dh,%bh

+	addl	%ebp,%edx

+	rorl	$16,%eax

+	movb	%dh,%ch

+	movb	0x12345678(%ebx),%ah

+LBPatch28:

+	movl	C(lightright),%edx	// reload the row-start light value

+	movb	0x12345678(%ecx),%al

+LBPatch29:

+	movl	C(lightdelta),%ebp

+	movl	%eax,(%edi)	// store bytes 0..3 of the row

+	movl	C(sourcetstep),%eax

+	addl	%eax,%esi

+	movl	C(surfrowbytes),%eax

+	addl	%eax,%edi

+	movl	C(lightrightstep),%eax

+	addl	%eax,%edx

+	movl	C(lightdeltastep),%eax

+	addl	%eax,%ebp	// carry out of this add is the loop condition

+	movl	%edx,C(lightright)	// movl leaves the carry flag intact for the jc

+	jc		Lblockloop8_mip1

+//			if (pbasesource >= r_sourcemax)

+//				pbasesource -= stepback;

+	cmpl	C(r_sourcemax),%esi

+	jb		LSkip_mip1	// unsigned compare on the pointer

+	subl	C(r_stepback),%esi

+LSkip_mip1:

+	movl	C(r_lightptr),%ebx

+	decl	sb_v

+	jnz		Lv_loop_mip1

+	popl	%ebx				// restore register variables

+	popl	%esi

+	popl	%edi

+	popl	%ebp				// restore the caller's stack frame

+	ret

+//----------------------------------------------------------------------

+// Surface block drawer for mip level 2

+//----------------------------------------------------------------------

+	.align 4

+.globl C(R_DrawSurfaceBlock8_mip2)

+C(R_DrawSurfaceBlock8_mip2):	// draws r_numvblocks blocks, each 4 rows of 4 bytes; inputs come from C globals

+	pushl	%ebp				// preserve caller's stack frame

+	pushl	%edi

+	pushl	%esi				// preserve register variables

+	pushl	%ebx

+//		for (v=0 ; v<numvblocks ; v++)

+//		{

+	movl	C(r_lightptr),%ebx

+	movl	C(r_numvblocks),%eax

+	movl	%eax,sb_v	// sb_v = blocks remaining

+	movl	C(prowdestbase),%edi	// edi = destination row pointer

+	movl	C(pbasesource),%esi	// esi = source texel pointer

+Lv_loop_mip2:

+//			lightleft = lightptr[0];

+//			lightright = lightptr[1];

+//			lightdelta = (lightleft - lightright) & 0xFFFFF;

+	movl	(%ebx),%eax			// lightleft

+	movl	4(%ebx),%edx		// lightright

+	movl	%eax,%ebp

+	movl	C(r_lightwidth),%ecx

+	movl	%edx,C(lightright)

+	subl	%edx,%ebp

+	andl	$0xFFFFF,%ebp

+	leal	(%ebx,%ecx,4),%ebx	// advance by r_lightwidth 4-byte entries

+//			lightptr += lightwidth;

+	movl	%ebx,C(r_lightptr)

+//			lightleftstep = (lightptr[0] - lightleft) >> blockdivshift;

+//			lightrightstep = (lightptr[1] - lightright) >> blockdivshift;

+//			lightdeltastep = ((lightleftstep - lightrightstep) & 0xFFFFF) |

+//					0xF0000000;

+	movl	4(%ebx),%ecx	// lightptr[1]

+	movl	(%ebx),%ebx		// lightptr[0]

+	subl	%eax,%ebx

+	subl	%edx,%ecx

+	sarl	$2,%ecx	// shift of 2 is hard-coded for this mip level

+	orl		$0x30000000,%ebp	// top nibble 0x3 is the row counter: 4 rows before the jc below falls through

+	sarl	$2,%ebx

+	movl	%ecx,C(lightrightstep)

+	subl	%ecx,%ebx

+	andl	$0xFFFFF,%ebx

+	orl		$0xF0000000,%ebx	// adding top nibble 0xF each row counts the lightdelta top nibble down by one

+	subl	%ecx,%ecx	// high word must be 0 in loop for addressing

+	movl	%ebx,C(lightdeltastep)

+	subl	%ebx,%ebx	// high word must be 0 in loop for addressing

+Lblockloop8_mip2:	// one row: texels 3..0 are looked up and stored as one 32-bit word

+	movl	%ebp,C(lightdelta)

+	movb	2(%esi),%cl

+	sarl	$2,%ebp	// per-texel light step for this row

+	movb	%dh,%bh	// index = (light byte from %dh << 8) | texel byte

+	movb	3(%esi),%bl

+	addl	%ebp,%edx

+	movb	%dh,%ch

+	addl	%ebp,%edx

+	movb	0x12345678(%ebx),%ah	// placeholder base address is overwritten by R_Surf8Patch

+LBPatch18:

+	movb	1(%esi),%bl

+	movb	0x12345678(%ecx),%al

+LBPatch19:

+	movb	(%esi),%cl

+	movb	%dh,%bh

+	addl	%ebp,%edx

+	rorl	$16,%eax	// move the two finished bytes to the high half

+	movb	%dh,%ch

+	movb	0x12345678(%ebx),%ah

+LBPatch20:

+	movl	C(lightright),%edx	// reload the row-start light value

+	movb	0x12345678(%ecx),%al

+LBPatch21:

+	movl	C(lightdelta),%ebp

+	movl	%eax,(%edi)	// store the whole 4-byte row

+	movl	C(sourcetstep),%eax

+	addl	%eax,%esi

+	movl	C(surfrowbytes),%eax

+	addl	%eax,%edi

+	movl	C(lightrightstep),%eax

+	addl	%eax,%edx

+	movl	C(lightdeltastep),%eax

+	addl	%eax,%ebp	// carry out of this add is the loop condition

+	movl	%edx,C(lightright)	// movl leaves the carry flag intact for the jc

+	jc		Lblockloop8_mip2

+//			if (pbasesource >= r_sourcemax)

+//				pbasesource -= stepback;

+	cmpl	C(r_sourcemax),%esi

+	jb		LSkip_mip2	// unsigned compare on the pointer

+	subl	C(r_stepback),%esi

+LSkip_mip2:

+	movl	C(r_lightptr),%ebx

+	decl	sb_v

+	jnz		Lv_loop_mip2

+	popl	%ebx				// restore register variables

+	popl	%esi

+	popl	%edi

+	popl	%ebp				// restore the caller's stack frame

+	ret

+//----------------------------------------------------------------------

+// Surface block drawer for mip level 3

+//----------------------------------------------------------------------

+	.align 4

+.globl C(R_DrawSurfaceBlock8_mip3)

+C(R_DrawSurfaceBlock8_mip3):	// draws r_numvblocks blocks of 2 rows x 2 bytes, fully unrolled (no inner row loop)

+	pushl	%ebp				// preserve caller's stack frame

+	pushl	%edi

+	pushl	%esi				// preserve register variables

+	pushl	%ebx

+//		for (v=0 ; v<numvblocks ; v++)

+//		{

+	movl	C(r_lightptr),%ebx

+	movl	C(r_numvblocks),%eax

+	movl	%eax,sb_v	// sb_v = blocks remaining

+	movl	C(prowdestbase),%edi	// edi = destination row pointer

+	movl	C(pbasesource),%esi	// esi = source texel pointer

+Lv_loop_mip3:

+//			lightleft = lightptr[0];

+//			lightright = lightptr[1];

+//			lightdelta = (lightleft - lightright) & 0xFFFFF;

+	movl	(%ebx),%eax			// lightleft

+	movl	4(%ebx),%edx		// lightright

+	movl	%eax,%ebp

+	movl	C(r_lightwidth),%ecx

+	movl	%edx,C(lightright)

+	subl	%edx,%ebp

+	andl	$0xFFFFF,%ebp

+	leal	(%ebx,%ecx,4),%ebx	// advance by r_lightwidth 4-byte entries

+	movl	%ebp,C(lightdelta)	// stored without a counter nibble: this variant has no jc-driven loop

+//			lightptr += lightwidth;

+	movl	%ebx,C(r_lightptr)

+//			lightleftstep = (lightptr[0] - lightleft) >> blockdivshift;

+//			lightrightstep = (lightptr[1] - lightright) >> blockdivshift;

+//			lightdeltastep = ((lightleftstep - lightrightstep) & 0xFFFFF) |

+//					0xF0000000;

+	movl	4(%ebx),%ecx	// lightptr[1]

+	movl	(%ebx),%ebx		// lightptr[0]

+	subl	%eax,%ebx

+	subl	%edx,%ecx

+	sarl	$1,%ecx	// shift of 1 is hard-coded for this mip level

+	sarl	$1,%ebx

+	movl	%ecx,C(lightrightstep)

+	subl	%ecx,%ebx

+	andl	$0xFFFFF,%ebx

+	sarl	$1,%ebp	// per-texel light step for the first row

+	orl		$0xF0000000,%ebx

+	movl	%ebx,C(lightdeltastep)

+	subl	%ebx,%ebx	// high word must be 0 in loop for addressing

+	movb	1(%esi),%bl

+	subl	%ecx,%ecx	// high word must be 0 in loop for addressing

+	movb	%dh,%bh	// index = (light byte from %dh << 8) | texel byte

+	movb	(%esi),%cl

+	addl	%ebp,%edx

+	movb	%dh,%ch

+	movb	0x12345678(%ebx),%al	// placeholder base address is overwritten by R_Surf8Patch

+LBPatch16:

+	movl	C(lightright),%edx	// reload the row-start light value

+	movb	%al,1(%edi)	// first row, byte 1

+	movb	0x12345678(%ecx),%al

+LBPatch17:

+	movb	%al,(%edi)	// first row, byte 0

+	movl	C(sourcetstep),%eax

+	addl	%eax,%esi

+	movl	C(surfrowbytes),%eax

+	addl	%eax,%edi

+	movl	C(lightdeltastep),%eax

+	movl	C(lightdelta),%ebp

+	movb	(%esi),%cl

+	addl	%eax,%ebp	// lightdelta + lightdeltastep for the second row

+	movl	C(lightrightstep),%eax

+	sarl	$1,%ebp	// per-texel light step for the second row

+	addl	%eax,%edx	// lightright + lightrightstep for the second row

+	movb	%dh,%bh

+	movb	1(%esi),%bl

+	addl	%ebp,%edx

+	movb	%dh,%ch

+	movb	0x12345678(%ebx),%al

+LBPatch30:

+	movl	C(sourcetstep),%edx

+	movb	%al,1(%edi)	// second row, byte 1

+	movb	0x12345678(%ecx),%al

+LBPatch31:

+	movb	%al,(%edi)	// second row, byte 0

+	movl	C(surfrowbytes),%ebp

+	addl	%edx,%esi

+	addl	%ebp,%edi

+//			if (pbasesource >= r_sourcemax)

+//				pbasesource -= stepback;

+	cmpl	C(r_sourcemax),%esi

+	jb		LSkip_mip3	// unsigned compare on the pointer

+	subl	C(r_stepback),%esi

+LSkip_mip3:

+	movl	C(r_lightptr),%ebx

+	decl	sb_v

+	jnz		Lv_loop_mip3

+	popl	%ebx				// restore register variables

+	popl	%esi

+	popl	%edi

+	popl	%ebp				// restore the caller's stack frame

+	ret

+.globl C(R_Surf8End)

+C(R_Surf8End):

+//----------------------------------------------------------------------

+// Code patching routines

+//----------------------------------------------------------------------

+	.data

+	.align 4

+LPatchTable8:	// addresses of the 4-byte operand fields that end at each LBPatch label in this file

+	.long	LBPatch0-4

+	.long	LBPatch1-4

+	.long	LBPatch2-4

+	.long	LBPatch3-4

+	.long	LBPatch4-4

+	.long	LBPatch5-4

+	.long	LBPatch6-4

+	.long	LBPatch7-4

+	.long	LBPatch8-4

+	.long	LBPatch9-4

+	.long	LBPatch10-4

+	.long	LBPatch11-4

+	.long	LBPatch12-4

+	.long	LBPatch13-4

+	.long	LBPatch14-4

+	.long	LBPatch15-4

+	.long	LBPatch16-4

+	.long	LBPatch17-4

+	.long	LBPatch18-4

+	.long	LBPatch19-4

+	.long	LBPatch20-4

+	.long	LBPatch21-4

+	.long	LBPatch22-4

+	.long	LBPatch23-4

+	.long	LBPatch24-4

+	.long	LBPatch25-4

+	.long	LBPatch26-4

+	.long	LBPatch27-4

+	.long	LBPatch28-4

+	.long	LBPatch29-4

+	.long	LBPatch30-4

+	.long	LBPatch31-4

+	.text

+	.align 4

+.globl C(R_Surf8Patch)

+C(R_Surf8Patch):	// stores the current colormap pointer into every location listed in LPatchTable8

+	pushl	%ebx

+	movl	C(colormap),%eax	// eax = value to patch in

+	movl	$LPatchTable8,%ebx	// ebx walks the table

+	movl	$32,%ecx	// must equal the number of LPatchTable8 entries

+LPatchLoop8:

+	movl	(%ebx),%edx	// edx = address of the next operand field

+	addl	$4,%ebx

+	movl	%eax,(%edx)	// overwrite the 0x12345678 placeholder in the code

+	decl	%ecx

+	jnz		LPatchLoop8

+	popl	%ebx

+	ret

+#endif	// id386

--- /dev/null

+++ b/u/sys_dosa.s

@@ -1,0 +1,95 @@

+//

+// sys_dosa.s

+// x86 assembly-language DOS-dependent routines.

+#include "asm_i386.h"

+#include "quakeasm.h"

+	.data

+	.align	4

+fpenv:

+	.long	0, 0, 0, 0, 0, 0, 0, 0

+	.text

+.globl C(MaskExceptions)

+C(MaskExceptions):	// sets the six exception-mask bits in the x87 control word

+	fnstenv	fpenv	// save the FPU environment; its first word is the control word

+	orl		$0x3F,fpenv	// bits 0..5 are the exception masks

+	fldenv	fpenv	// reload the modified environment

+	ret

+/*

+.globl C(unmaskexceptions)

+C(unmaskexceptions):

+	fnstenv	fpenv

+	andl		$0xFFFFFFE0,fpenv

+	fldenv	fpenv

+	ret

+*/

+	.data

+	.align	4

+.globl	ceil_cw, single_cw, full_cw, cw, pushed_cw

+ceil_cw:	.long	0	// written by Sys_SetFPCW

+single_cw:	.long	0	// written by Sys_SetFPCW, loaded by Sys_LowFPPrecision

+full_cw:	.long	0	// written by Sys_SetFPCW, loaded by Sys_HighFPPrecision and Sys_PushFPCW_SetHigh

+cw:			.long	0	// scratch copy of the control word read by Sys_SetFPCW

+pushed_cw:	.long	0	// single save slot used by Sys_PushFPCW_SetHigh / Sys_PopFPCW

+	.text

+.globl C(Sys_LowFPPrecision)

+C(Sys_LowFPPrecision):	// loads single_cw into the FPU control word

+	fldcw	single_cw	// single_cw is only filled in by Sys_SetFPCW

+	ret

+.globl C(Sys_HighFPPrecision)

+C(Sys_HighFPPrecision):	// loads full_cw into the FPU control word

+	fldcw	full_cw	// full_cw is only filled in by Sys_SetFPCW

+	ret

+.globl C(Sys_PushFPCW_SetHigh)

+C(Sys_PushFPCW_SetHigh):	// saves the current control word, then switches to full_cw

+	fnstcw	pushed_cw	// one save slot only: a second call overwrites the first saved value

+	fldcw	full_cw

+	ret

+.globl C(Sys_PopFPCW)

+C(Sys_PopFPCW):	// restores the control word saved by Sys_PushFPCW_SetHigh

+	fldcw	pushed_cw

+	ret

+.globl C(Sys_SetFPCW)

+C(Sys_SetFPCW):	// derives full_cw, single_cw and ceil_cw from the current FPU control word

+	fnstcw	cw	// fnstcw writes 16 bits; the upper half of cw keeps its previous contents

+	movl	cw,%eax	// %ah now holds control-word bits 8..15

+#ifdef	id386

+	andb	$0xF0,%ah	// clear bits 8..11 (precision and rounding control)

+	orb		$0x03,%ah	// round mode, 64-bit precision

+#endif

+	movl	%eax,full_cw

+#ifdef	id386

+	andb	$0xF0,%ah

+	orb		$0x0C,%ah	// chop mode, single precision

+#endif

+	movl	%eax,single_cw

+#ifdef	id386

+	andb	$0xF0,%ah

+	orb		$0x08,%ah	// ceil mode, single precision

+#endif

+	movl	%eax,ceil_cw	// without id386 all three variables receive the unmodified control word

+	ret

--- /dev/null

+++ b/u/worlda.s

@@ -1,0 +1,125 @@

+//

+// worlda.s

+// x86 assembly-language server testing stuff

+//

+#define GLQUAKE	1	// don't include unneeded defs

+#include "asm_i386.h"

+#include "quakeasm.h"

+#include "d_ifacea.h"

+#ifdef id386

+	.data

+Ltemp:	.long	0

+	.text

+//----------------------------------------------------------------------

+// hull-point test

+//----------------------------------------------------------------------

+#define hull	4+8				// because only partially pushed (read after two pushes)

+#define	num		8+4				// because only partially pushed (read after one push)

+#define p		12+12			// because only partially pushed (read after three pushes)

+	.align 4

+.globl C(SV_HullPointContents)

+C(SV_HullPointContents):	// walks the hull's clipnode tree from num until a negative child is reached; returns it in %eax

+	pushl	%edi				// preserve register variables

+	movl	num(%esp),%eax

+	testl	%eax,%eax

+	js		Lhquickout	// num already negative: return it unchanged

+//	float		d;

+//	dclipnode_t	*node;

+//	mplane_t	*plane;

+	pushl	%ebx

+	movl	hull(%esp),%ebx

+	pushl	%ebp

+	movl	p(%esp),%edx

+	movl	hu_clipnodes(%ebx),%edi

+	movl	hu_planes(%ebx),%ebp

+	subl	%ebx,%ebx	// zero ebx so that %bl alone can be used as an index below

+	pushl	%esi

+// %ebx: 0

+// %eax: num

+// %edx: p

+// %edi: hull->clipnodes

+// %ebp: hull->planes

+//	while (num >= 0)

+//	{

+Lhloop:

+//		node = hull->clipnodes + num;

+//		plane = hull->planes + node->planenum;

+// !!! if the size of dclipnode_t changes, the scaling of %eax needs to be

+//     changed !!!

+	movl	nd_planenum(%edi,%eax,8),%ecx

+	movl	nd_children(%edi,%eax,8),%eax	// one 32-bit load fetches both 16-bit children

+	movl	%eax,%esi	// esi keeps the pair; children[1] is extracted from its high half

+	rorl	$16,%eax	// swap halves so children[0] lands in the high half of eax

+	leal	(%ecx,%ecx,4),%ecx	// planenum*5; the *4 scale below makes a 20-byte stride per plane

+//		if (plane->type < 3)

+//			d = p[plane->type] - plane->dist;

+	movb	pl_type(%ebp,%ecx,4),%bl

+	cmpb	$3,%bl

+	jb		Lnodot

+//		else

+//			d = DotProduct (plane->normal, p) - plane->dist;

+	flds	pl_normal(%ebp,%ecx,4)

+	fmuls	0(%edx)

+	flds	pl_normal+4(%ebp,%ecx,4)

+	fmuls	4(%edx)

+	flds	pl_normal+8(%ebp,%ecx,4)

+	fmuls	8(%edx)

+	fxch	%st(1)

+	faddp	%st(0),%st(2)

+	faddp	%st(0),%st(1)

+	fsubs	pl_dist(%ebp,%ecx,4)

+	jmp		Lsub

+Lnodot:

+	flds	pl_dist(%ebp,%ecx,4)

+	fsubrs	(%edx,%ebx,4)	// d = p[type] - dist (reverse subtract)

+Lsub:

+	sarl	$16,%eax	// eax = children[0], sign-extended

+	sarl	$16,%esi	// esi = children[1], sign-extended

+//		if (d < 0)

+//			num = node->children[1];

+//		else

+//			num = node->children[0];

+	fstps	Ltemp	// spill d so that its sign bit can be tested as an integer

+	movl	Ltemp,%ecx

+	sarl	$31,%ecx	// ecx = all ones if the sign bit of d is set, else 0

+	andl	%ecx,%esi	// keep children[1] only when d is negative

+	xorl	$0xFFFFFFFF,%ecx

+	andl	%ecx,%eax	// keep children[0] only when d is not negative

+	orl		%esi,%eax	// branch-free select; SF reflects the chosen child

+	jns		Lhloop

+//	return num;

+Lhdone:

+	popl	%esi

+	popl	%ebp

+	popl	%ebx				// restore register variables

+Lhquickout:

+	popl	%edi

+	ret

+#endif	// id386

--- a/worlda.s

+++ /dev/null

@@ -1,125 +1,0 @@

-//

-// worlda.s

-// x86 assembly-language server testing stuff

-//

-#define GLQUAKE	1	// don't include unneeded defs

-#include "asm_i386.h"

-#include "quakeasm.h"

-#include "d_ifacea.h"

-#ifdef id386

-	.data

-Ltemp:	.long	0

-	.text

-//----------------------------------------------------------------------

-// hull-point test

-//----------------------------------------------------------------------

-#define hull	4+8				// because only partially pushed

-#define	num		8+4				// because only partially pushed

-#define p		12+12			// because only partially pushed

-	.align 4

-.globl C(SV_HullPointContents)

-C(SV_HullPointContents):

-	pushl	%edi				// preserve register variables

-	movl	num(%esp),%eax

-	testl	%eax,%eax

-	js		Lhquickout

-//	float		d;

-//	dclipnode_t	*node;

-//	mplane_t	*plane;

-	pushl	%ebx

-	movl	hull(%esp),%ebx

-	pushl	%ebp

-	movl	p(%esp),%edx

-	movl	hu_clipnodes(%ebx),%edi

-	movl	hu_planes(%ebx),%ebp

-	subl	%ebx,%ebx

-	pushl	%esi

-// %ebx: 0

-// %eax: num

-// %edx: p

-// %edi: hull->clipnodes

-// %ebp: hull->planes

-//	while (num >= 0)

-//	{

-Lhloop:

-//		node = hull->clipnodes + num;

-//		plane = hull->planes + node->planenum;

-// !!! if the size of dclipnode_t changes, the scaling of %eax needs to be

-//     changed !!!

-	movl	nd_planenum(%edi,%eax,8),%ecx

-	movl	nd_children(%edi,%eax,8),%eax

-	movl	%eax,%esi

-	rorl	$16,%eax

-	leal	(%ecx,%ecx,4),%ecx

-//		if (plane->type < 3)

-//			d = p[plane->type] - plane->dist;

-	movb	pl_type(%ebp,%ecx,4),%bl

-	cmpb	$3,%bl

-	jb		Lnodot

-//		else

-//			d = DotProduct (plane->normal, p) - plane->dist;

-	flds	pl_normal(%ebp,%ecx,4)

-	fmuls	0(%edx)

-	flds	pl_normal+4(%ebp,%ecx,4)

-	fmuls	4(%edx)

-	flds	pl_normal+8(%ebp,%ecx,4)

-	fmuls	8(%edx)

-	fxch	%st(1)

-	faddp	%st(0),%st(2)

-	faddp	%st(0),%st(1)

-	fsubs	pl_dist(%ebp,%ecx,4)

-	jmp		Lsub

-Lnodot:

-	flds	pl_dist(%ebp,%ecx,4)

-	fsubrs	(%edx,%ebx,4)

-Lsub:

-	sarl	$16,%eax

-	sarl	$16,%esi

-//		if (d < 0)

-//			num = node->children[1];

-//		else

-//			num = node->children[0];

-	fstps	Ltemp

-	movl	Ltemp,%ecx

-	sarl	$31,%ecx

-	andl	%ecx,%esi

-	xorl	$0xFFFFFFFF,%ecx

-	andl	%ecx,%eax

-	orl		%esi,%eax

-	jns		Lhloop

-//	return num;

-Lhdone:

-	popl	%esi

-	popl	%ebp

-	popl	%ebx				// restore register variables

-Lhquickout:

-	popl	%edi

-	ret

-#endif	// id386

--

⑨