shithub: qk1

Download patch

ref: cff9f164d3ee4e480522f28d440daaa841e8ce32
parent: d8f3a8ba78e93d6c421d0fad28cf3359ebb729da
author: Sigrid Solveig Haflínudóttir <sigrid@ftrv.se>
date: Thu May 11 10:54:33 EDT 2023

D_DrawSpans16 + loop unroll (thanks qbism & others)

--- a/d_init.c
+++ b/d_init.c
@@ -126,7 +126,7 @@
 	for (i=0 ; i<(NUM_MIPS-1) ; i++)
 		d_scalemip[i] = basemip[i] * d_mipscale.value;
 
-	d_drawspans = D_DrawSpans8;
+	d_drawspans = D_DrawSpans16;
 
 	d_aflatcolor = 0;
 }
--- a/d_scan.c
+++ b/d_scan.c
@@ -221,19 +221,18 @@
 	} while ((pspan = pspan->pnext) != nil);
 }
 
-
 /*
 =============
-D_DrawSpans8
+D_DrawSpans16
 =============
 */
-void D_DrawSpans8 (espan_t *pspan)
+void D_DrawSpans16 (espan_t *pspan) //qbism- up it from 8 to 16
 {
 	int				count, spancount;
 	unsigned char	*pbase, *pdest;
 	fixed16_t		s, t, snext, tnext, sstep, tstep;
 	float			sdivz, tdivz, zi, z, du, dv, spancountminus1;
-	float			sdivz8stepu, tdivz8stepu, zi8stepu;
+	float			sdivzstepu, tdivzstepu, zistepu;
 
 	sstep = 0;	// keep compiler happy
 	tstep = 0;	// ditto
@@ -240,9 +239,9 @@
 
 	pbase = (unsigned char *)cacheblock;
 
-	sdivz8stepu = d_sdivzstepu * 8;
-	tdivz8stepu = d_tdivzstepu * 8;
-	zi8stepu = d_zistepu * 8;
+	sdivzstepu = d_sdivzstepu * 16;
+	tdivzstepu = d_tdivzstepu * 16;
+	zistepu = d_zistepu * 16;
 
 	do
 	{
@@ -275,8 +274,8 @@
 		do
 		{
 		// calculate s and t at the far end of the span
-			if (count >= 8)
-				spancount = 8;
+			if (count >= 16)
+				spancount = 16;
 			else
 				spancount = count;
 
@@ -286,16 +285,16 @@
 			{
 			// calculate s/z, t/z, zi->fixed s and t at far end of span,
 			// calculate s and t steps across span by shifting
-				sdivz += sdivz8stepu;
-				tdivz += tdivz8stepu;
-				zi += zi8stepu;
+				sdivz += sdivzstepu;
+				tdivz += tdivzstepu;
+				zi += zistepu;
 				z = (float)0x10000 / zi;	// prescale to 16.16 fixed-point
 
 				snext = (int)(sdivz * z) + sadjust;
 				if (snext > bbextents)
 					snext = bbextents;
-				else if (snext < 8)
-					snext = 8;	// prevent round-off error on <0 steps from
+				else if (snext <= 16)
+					snext = 16;	// prevent round-off error on <0 steps from
 								//  from causing overstepping & running off the
 								//  edge of the texture
 
@@ -302,11 +301,11 @@
 				tnext = (int)(tdivz * z) + tadjust;
 				if (tnext > bbextentt)
 					tnext = bbextentt;
-				else if (tnext < 8)
-					tnext = 8;	// guard against round-off error on <0 steps
+				else if (tnext < 16)
+					tnext = 16;	// guard against round-off error on <0 steps
 
-				sstep = (snext - s) >> 3;
-				tstep = (tnext - t) >> 3;
+				sstep = (snext - s) >> 4;
+				tstep = (tnext - t) >> 4;
 			}
 			else
 			{
@@ -322,8 +321,8 @@
 				snext = (int)(sdivz * z) + sadjust;
 				if (snext > bbextents)
 					snext = bbextents;
-				else if (snext < 8)
-					snext = 8;	// prevent round-off error on <0 steps from
+				else if (snext < 16)
+					snext = 16;	// prevent round-off error on <0 steps from
 								//  from causing overstepping & running off the
 								//  edge of the texture
 
@@ -330,8 +329,8 @@
 				tnext = (int)(tdivz * z) + tadjust;
 				if (tnext > bbextentt)
 					tnext = bbextentt;
-				else if (tnext < 8)
-					tnext = 8;	// guard against round-off error on <0 steps
+				else if (tnext < 16)
+					tnext = 16;	// guard against round-off error on <0 steps
 
 				if (spancount > 1)
 				{
@@ -340,12 +339,27 @@
 				}
 			}
 
-			do
+			pdest += spancount;
+			switch (spancount)
 			{
-				*pdest++ = *(pbase + (s >> 16) + (t >> 16) * cachewidth);
-				s += sstep;
-				t += tstep;
-			} while (--spancount > 0);
+			case 16: pdest[-16] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 15: pdest[-15] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 14: pdest[-14] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 13: pdest[-13] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 12: pdest[-12] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 11: pdest[-11] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 10: pdest[-10] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 9: pdest[-9] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 8: pdest[-8] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 7: pdest[-7] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 6: pdest[-6] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 5: pdest[-5] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 4: pdest[-4] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 3: pdest[-3] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 2: pdest[-2] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 1: pdest[-1] = pbase[(s >> 16) + (t >> 16) * cachewidth];
+			case 0: break;
+			}
 
 			s = snext;
 			t = tnext;
@@ -354,7 +368,6 @@
 
 	} while ((pspan = pspan->pnext) != nil);
 }
-
 
 /*
 =============
--- a/qw/d_init.c
+++ b/qw/d_init.c
@@ -121,7 +121,7 @@
 	for (i=0 ; i<(NUM_MIPS-1) ; i++)
 		d_scalemip[i] = basemip[i] * d_mipscale.value;
 
-	d_drawspans = D_DrawSpans8;	/* no DrawSpans16 for non-asm heathens */
+	d_drawspans = D_DrawSpans16;
 	d_aflatcolor = 0;
 }
 
--- a/qw/d_scan.c
+++ b/qw/d_scan.c
@@ -221,16 +221,16 @@
 
 /*
 =============
-D_DrawSpans8
+D_DrawSpans16
 =============
 */
-void D_DrawSpans8 (espan_t *pspan)
+void D_DrawSpans16 (espan_t *pspan) //qbism- up it from 8 to 16
 {
 	int				count, spancount;
 	unsigned char	*pbase, *pdest;
 	fixed16_t		s, t, snext, tnext, sstep, tstep;
 	float			sdivz, tdivz, zi, z, du, dv, spancountminus1;
-	float			sdivz8stepu, tdivz8stepu, zi8stepu;
+	float			sdivzstepu, tdivzstepu, zistepu;
 
 	sstep = 0;	// keep compiler happy
 	tstep = 0;	// ditto
@@ -237,9 +237,9 @@
 
 	pbase = (unsigned char *)cacheblock;
 
-	sdivz8stepu = d_sdivzstepu * 8;
-	tdivz8stepu = d_tdivzstepu * 8;
-	zi8stepu = d_zistepu * 8;
+	sdivzstepu = d_sdivzstepu * 16;
+	tdivzstepu = d_tdivzstepu * 16;
+	zistepu = d_zistepu * 16;
 
 	do
 	{
@@ -272,8 +272,8 @@
 		do
 		{
 		// calculate s and t at the far end of the span
-			if (count >= 8)
-				spancount = 8;
+			if (count >= 16)
+				spancount = 16;
 			else
 				spancount = count;
 
@@ -283,16 +283,16 @@
 			{
 			// calculate s/z, t/z, zi->fixed s and t at far end of span,
 			// calculate s and t steps across span by shifting
-				sdivz += sdivz8stepu;
-				tdivz += tdivz8stepu;
-				zi += zi8stepu;
+				sdivz += sdivzstepu;
+				tdivz += tdivzstepu;
+				zi += zistepu;
 				z = (float)0x10000 / zi;	// prescale to 16.16 fixed-point
 
 				snext = (int)(sdivz * z) + sadjust;
 				if (snext > bbextents)
 					snext = bbextents;
-				else if (snext < 8)
-					snext = 8;	// prevent round-off error on <0 steps from
+				else if (snext <= 16)
+					snext = 16;	// prevent round-off error on <0 steps from
 								//  from causing overstepping & running off the
 								//  edge of the texture
 
@@ -299,11 +299,11 @@
 				tnext = (int)(tdivz * z) + tadjust;
 				if (tnext > bbextentt)
 					tnext = bbextentt;
-				else if (tnext < 8)
-					tnext = 8;	// guard against round-off error on <0 steps
+				else if (tnext < 16)
+					tnext = 16;	// guard against round-off error on <0 steps
 
-				sstep = (snext - s) >> 3;
-				tstep = (tnext - t) >> 3;
+				sstep = (snext - s) >> 4;
+				tstep = (tnext - t) >> 4;
 			}
 			else
 			{
@@ -319,8 +319,8 @@
 				snext = (int)(sdivz * z) + sadjust;
 				if (snext > bbextents)
 					snext = bbextents;
-				else if (snext < 8)
-					snext = 8;	// prevent round-off error on <0 steps from
+				else if (snext < 16)
+					snext = 16;	// prevent round-off error on <0 steps from
 								//  from causing overstepping & running off the
 								//  edge of the texture
 
@@ -327,8 +327,8 @@
 				tnext = (int)(tdivz * z) + tadjust;
 				if (tnext > bbextentt)
 					tnext = bbextentt;
-				else if (tnext < 8)
-					tnext = 8;	// guard against round-off error on <0 steps
+				else if (tnext < 16)
+					tnext = 16;	// guard against round-off error on <0 steps
 
 				if (spancount > 1)
 				{
@@ -337,12 +337,28 @@
 				}
 			}
 
-			do
+			pdest += spancount;
+
+			switch (spancount)
 			{
-				*pdest++ = *(pbase + (s >> 16) + (t >> 16) * cachewidth);
-				s += sstep;
-				t += tstep;
-			} while (--spancount > 0);
+			case 16: pdest[-16] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 15: pdest[-15] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 14: pdest[-14] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 13: pdest[-13] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 12: pdest[-12] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 11: pdest[-11] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 10: pdest[-10] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 9: pdest[-9] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 8: pdest[-8] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 7: pdest[-7] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 6: pdest[-6] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 5: pdest[-5] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 4: pdest[-4] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 3: pdest[-3] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 2: pdest[-2] = pbase[(s >> 16) + (t >> 16) * cachewidth]; s += sstep; t += tstep;
+			case 1: pdest[-1] = pbase[(s >> 16) + (t >> 16) * cachewidth];
+			case 0: break;
+			}
 
 			s = snext;
 			t = tnext;
--- a/qw/r_surf.c
+++ b/qw/r_surf.c
@@ -307,8 +307,8 @@
 */
 void R_DrawSurfaceBlock8_mip0 (void)
 {
-	int				v, i, b, lightstep, lighttemp, light;
-	unsigned char	pix, *psource, *prowdest;
+	int				v, i, lightstep, lighttemp, light;
+	unsigned char	*psource, *prowdest;
 
 	psource = pbasesource;
 	prowdest = prowdestbase;
@@ -330,14 +330,23 @@
 
 			light = lightright;
 
-			for (b=15; b>=0; b--)
-			{
-				pix = psource[b];
-				prowdest[b] = ((unsigned char *)vid.colormap)
-						[(light & 0xFF00) + pix];
-				light += lightstep;
-			}
-	
+			prowdest[15] = vid.colormap[(light & 0xFF00) + psource[15]]; light += lightstep;	// sample first, then step (as the rolled loop did): texel 15 is lit with lightright itself
+			prowdest[14] = vid.colormap[(light & 0xFF00) + psource[14]]; light += lightstep;
+			prowdest[13] = vid.colormap[(light & 0xFF00) + psource[13]]; light += lightstep;
+			prowdest[12] = vid.colormap[(light & 0xFF00) + psource[12]]; light += lightstep;
+			prowdest[11] = vid.colormap[(light & 0xFF00) + psource[11]]; light += lightstep;
+			prowdest[10] = vid.colormap[(light & 0xFF00) + psource[10]]; light += lightstep;
+			prowdest[9] = vid.colormap[(light & 0xFF00) + psource[9]]; light += lightstep;
+			prowdest[8] = vid.colormap[(light & 0xFF00) + psource[8]]; light += lightstep;
+			prowdest[7] = vid.colormap[(light & 0xFF00) + psource[7]]; light += lightstep;
+			prowdest[6] = vid.colormap[(light & 0xFF00) + psource[6]]; light += lightstep;
+			prowdest[5] = vid.colormap[(light & 0xFF00) + psource[5]]; light += lightstep;
+			prowdest[4] = vid.colormap[(light & 0xFF00) + psource[4]]; light += lightstep;
+			prowdest[3] = vid.colormap[(light & 0xFF00) + psource[3]]; light += lightstep;
+			prowdest[2] = vid.colormap[(light & 0xFF00) + psource[2]]; light += lightstep;
+			prowdest[1] = vid.colormap[(light & 0xFF00) + psource[1]]; light += lightstep;
+			prowdest[0] = vid.colormap[(light & 0xFF00) + psource[0]];	// no trailing step: light is reloaded from lightright before this run
+
 			psource += sourcetstep;
 			lightright += lightrightstep;
 			lightleft += lightleftstep;
--- a/r_surf.c
+++ b/r_surf.c
@@ -294,8 +294,8 @@
 */
 void R_DrawSurfaceBlock8_mip0 (void)
 {
-	int				v, i, b, lightstep, lighttemp, light;
-	unsigned char	pix, *psource, *prowdest;
+	int				v, i, lightstep, lighttemp, light;
+	unsigned char	*psource, *prowdest;
 
 	psource = pbasesource;
 	prowdest = prowdestbase;
@@ -317,14 +317,23 @@
 
 			light = lightright;
 
-			for (b=15; b>=0; b--)
-			{
-				pix = psource[b];
-				prowdest[b] = ((unsigned char *)vid.colormap)
-						[(light & 0xFF00) + pix];
-				light += lightstep;
-			}
-	
+			prowdest[15] = vid.colormap[(light & 0xFF00) + psource[15]]; light += lightstep;	// sample first, then step (as the rolled loop did): texel 15 is lit with lightright itself
+			prowdest[14] = vid.colormap[(light & 0xFF00) + psource[14]]; light += lightstep;
+			prowdest[13] = vid.colormap[(light & 0xFF00) + psource[13]]; light += lightstep;
+			prowdest[12] = vid.colormap[(light & 0xFF00) + psource[12]]; light += lightstep;
+			prowdest[11] = vid.colormap[(light & 0xFF00) + psource[11]]; light += lightstep;
+			prowdest[10] = vid.colormap[(light & 0xFF00) + psource[10]]; light += lightstep;
+			prowdest[9] = vid.colormap[(light & 0xFF00) + psource[9]]; light += lightstep;
+			prowdest[8] = vid.colormap[(light & 0xFF00) + psource[8]]; light += lightstep;
+			prowdest[7] = vid.colormap[(light & 0xFF00) + psource[7]]; light += lightstep;
+			prowdest[6] = vid.colormap[(light & 0xFF00) + psource[6]]; light += lightstep;
+			prowdest[5] = vid.colormap[(light & 0xFF00) + psource[5]]; light += lightstep;
+			prowdest[4] = vid.colormap[(light & 0xFF00) + psource[4]]; light += lightstep;
+			prowdest[3] = vid.colormap[(light & 0xFF00) + psource[3]]; light += lightstep;
+			prowdest[2] = vid.colormap[(light & 0xFF00) + psource[2]]; light += lightstep;
+			prowdest[1] = vid.colormap[(light & 0xFF00) + psource[1]]; light += lightstep;
+			prowdest[0] = vid.colormap[(light & 0xFF00) + psource[0]];	// no trailing step: light is reloaded from lightright before this run
+
 			psource += sourcetstep;
 			lightright += lightrightstep;
 			lightleft += lightleftstep;