shithub: qk1

Download patch

ref: a2f781fe0e786384ed36edb93aceb42b3b1b76ab
parent: a53e14793a11a6af0b83bdbd94772c47ee41e44b
author: Sigrid Solveig Haflínudóttir <sigrid@ftrv.se>
date: Tue Jan 9 18:12:46 EST 2024

first experiment in making drawing more parallel

--- a/d_edge.c
+++ b/d_edge.c
@@ -1,3 +1,9 @@
+#define _GNU_SOURCE
+#include <pthread.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <limits.h>
+#include "fast_barrier.h"
 #include "quakedef.h"
 
 float scale_for_mip;
@@ -23,7 +29,7 @@
 }
 
 static void
-D_DrawSolidSurface(surf_t *surf, pixel_t color)
+D_DrawSolidSurface(surf_t *surf, pixel_t color, int first, int end)
 {
 	espan_t *span;
 	pixel_t *pdest;
@@ -31,6 +37,8 @@
 	int u, u2;
 
 	for(span = surf->spans; span; span=span->pnext){
+		if(span->v < first || span->v >= end)
+			continue;
 		pdest = dvars.fb + span->v*dvars.w;
 		pz = dvars.zb + span->v*dvars.w;
 		memset(pz, 0xfe, span->count*sizeof(*pz));
@@ -80,9 +88,31 @@
 	tv->t.bbextent = ((pface->extents[1] << 16) >> miplevel) - 1;
 }
 
-void
-D_DrawSurfaces(view_t *v0)
+static fast_barrier_t spansgobrr, spansgohome; // waited on at the start / at the end of a threaded frame
+static pthread_spinlock_t spancache; // taken and released through spancachelock()
+
+typedef struct span_thread_t span_thread_t;
+
+struct span_thread_t {
+	pthread_t tid; // filled in by pthread_create; not set for slot 0 (the calling thread)
+	int n; // index of this slot in the threads array
+	int first; // first screen row this slot draws
+	int end; // one past the last screen row this slot draws
+};
+
+static int nthreads = 8; // number of slots, the calling thread included; below 2 nothing is spawned
+static bool spawned = false; // set by D_DrawSurfaces after the thread setup has run
+static view_t *v0; // view of the current frame, stored by D_DrawSurfaces and read by spanthread
+
+static void
+spancachelock(int n) // lock callback handed to D_CacheSurface: nonzero n locks, zero unlocks
 {
+	(n ? pthread_spin_lock : pthread_spin_unlock)(&spancache);
+}
+
+static void
+spannothread(view_t *v0, int first, int end)
+{
 	vec3_t local_modelorg, transformed_modelorg, world_transformed_modelorg;
 	surfcache_t *pcurrentcache;
 	drawsurf_t ds = {0};
@@ -90,23 +120,35 @@
 	int miplevel;
 	entity_t *e;
 	texvars_t t;
-	surf_t *s;
 	byte alpha;
 	bool blend;
+	surf_t *s;
 	view_t v;
+	espan_t *sp;
+	bool yes;
 
+	///uvlong t0 = nanosec();
+
 	memmove(&v, v0, sizeof(v));
 	TransformVector(v.modelorg, transformed_modelorg, &v);
 	VectorCopy(transformed_modelorg, world_transformed_modelorg);
 
 	// TODO: could preset a lot of this at mode set time
-	for(s = &surfaces[1]; s < surface_p; s++){
-		if(!s->spans)
+	for(s = surfaces+1; s < surface_p; s++){
+		e = s->entity;
+		if(!s->spans || ((surfdrawflags(s->flags) | entdrawflags(e)) ^ r_drawflags))
 			continue;
 
-		e = s->entity;
-		if((surfdrawflags(s->flags) | entdrawflags(e)) ^ r_drawflags)
+		yes = false;
+		for(sp = s->spans; sp != nil; sp = sp->pnext){
+			if(sp->v >= first && sp->v < end){
+				yes = true;
+				break;
+			}
+		}
+		if(!yes)
 			continue;
+
 		alpha = 255;
 		if(enthasalpha(e) && e->alpha != 255)
 			alpha = e->alpha;
@@ -127,20 +169,20 @@
 
 		pface = s->data;
 		if(s->flags & SURF_DRAWSKY){
-			D_DrawSkyScans8(s->spans);
+			D_DrawSkyScans8(s->spans, first, end);
 		}else if(s->flags & SURF_DRAWBACKGROUND){
-			D_DrawSolidSurface(s, q1pal[(int)r_clearcolor.value & 0xFF]);
+			D_DrawSolidSurface(s, q1pal[(int)r_clearcolor.value & 0xFF], first, end);
 		}else if(s->flags & SURF_DRAWTURB){
 			t.p = pface->texinfo->texture->pixels;
 			t.w = 64;
 			D_CalcGradients(0, pface, transformed_modelorg, &v, &t);
-			D_DrawSpans(s->spans, &t, alpha, SPAN_TURB);
+			D_DrawSpans(s->spans, &t, alpha, SPAN_TURB, first, end);
 		}else{
 			miplevel = D_MipLevelForScale(s->nearzi * scale_for_mip * pface->texinfo->mipadjust);
 			if(s->flags & SURF_FENCE)
 				miplevel = max(miplevel-1, 0);
 
-			pcurrentcache = D_CacheSurface(s->entity, pface, &ds, miplevel);
+			pcurrentcache = D_CacheSurface(s->entity, pface, &ds, miplevel, spancachelock);
 			t.p = pcurrentcache->pixels;
 			t.w = pcurrentcache->width;
 			D_CalcGradients(miplevel, pface, transformed_modelorg, &v, &t);
@@ -148,7 +190,9 @@
 			D_DrawSpans(s->spans, &t, alpha,
 				(alpha == 255 && (s->flags & SURF_FENCE))
 					? SPAN_FENCE
-					: (blend ? SPAN_BLEND : SPAN_SOLID)
+					: (blend ? SPAN_BLEND : SPAN_SOLID),
+				first,
+				end
 			);
 		}
 
@@ -156,5 +200,182 @@
 			VectorCopy(world_transformed_modelorg, transformed_modelorg);
 			memmove(&v, v0, sizeof(v));
 		}
+	}
+
+	///uvlong t1 = nanosec();
+	///if(first != 0 || end != vid.height)
+	///	fprintf(stderr, "@%d %llu\n", 0, t1-t0);
+}
+
+static void *
+spanthread(void *th_)
+{
+	vec3_t local_modelorg, transformed_modelorg, world_transformed_modelorg;
+	surfcache_t *pcurrentcache;
+	span_thread_t *th = th_; // this worker's slot: index and row band [first, end)
+	drawsurf_t ds = {0};
+	msurface_t *pface;
+	int miplevel, ns;
+	entity_t *e;
+	texvars_t t;
+	byte alpha;
+	bool blend;
+	surf_t *s;
+	espan_t *sp;
+	bool yes;
+	view_t v;
+	// worker body: per frame, draw the surfaces that have spans in rows [th->first, th->end)
+	for(;;){
+		fast_barrier_wait(&spansgobrr); // block until D_DrawSurfaces starts a threaded frame
+
+		// work on a private copy of the shared view; it is restored after each submodel
+		memmove(&v, v0, sizeof(v));
+		TransformVector(v.modelorg, transformed_modelorg, &v);
+		VectorCopy(transformed_modelorg, world_transformed_modelorg);
+		ns = 0; // counts surfaces drawn this frame; nothing in this function reads it
+
+		// TODO: could preset a lot of this at mode set time
+		for(s = surfaces+1; s < surface_p; s++){
+			e = s->entity;
+			if(!s->spans || ((surfdrawflags(s->flags) | entdrawflags(e)) ^ r_drawflags))
+				continue;
+			yes = false; // becomes true once a span inside this thread's rows is found
+			for(sp = s->spans; sp != nil; sp = sp->pnext){
+				if(sp->v >= th->first && sp->v < th->end){
+					yes = true;
+					break;
+				}
+			}
+			if(!yes)
+				continue;
+			ns++;
+			alpha = 255;
+			if(enthasalpha(e) && e->alpha != 255)
+				alpha = e->alpha;
+			else if(s->flags & SURF_TRANS)
+				alpha *= alphafor(s->flags);
+			if(alpha < 1)
+				alpha = 255;
+
+			t.z.stepu = s->d_zistepu;
+			t.z.stepv = s->d_zistepv;
+			t.z.origin = s->d_ziorigin;
+
+			if(insubmodel(s)){
+				VectorSubtract(v.org, e->origin, local_modelorg);
+				TransformVector(local_modelorg, transformed_modelorg, &v);
+				R_RotateBmodel(e, &v);
+			}
+
+			pface = s->data;
+			if(s->flags & SURF_DRAWSKY){
+				D_DrawSkyScans8(s->spans, th->first, th->end);
+			}else if(s->flags & SURF_DRAWBACKGROUND){
+				D_DrawSolidSurface(s, q1pal[(int)r_clearcolor.value & 0xFF], th->first, th->end);
+			}else if(s->flags & SURF_DRAWTURB){
+				t.p = pface->texinfo->texture->pixels;
+				t.w = 64;
+				D_CalcGradients(0, pface, transformed_modelorg, &v, &t);
+				D_DrawSpans(s->spans, &t, alpha, SPAN_TURB, th->first, th->end);
+			}else{
+				miplevel = D_MipLevelForScale(s->nearzi * scale_for_mip * pface->texinfo->mipadjust);
+				if(s->flags & SURF_FENCE)
+					miplevel = max(miplevel-1, 0);
+
+				pcurrentcache = D_CacheSurface(s->entity, pface, &ds, miplevel, spancachelock); // cache access goes through the spinlock callback
+				t.p = pcurrentcache->pixels;
+				t.w = pcurrentcache->width;
+				D_CalcGradients(miplevel, pface, transformed_modelorg, &v, &t);
+				blend = (s->flags & SURF_FENCE) || (r_drawflags & DRAW_BLEND);
+				D_DrawSpans(s->spans, &t, alpha,
+					(alpha == 255 && (s->flags & SURF_FENCE))
+						? SPAN_FENCE
+						: (blend ? SPAN_BLEND : SPAN_SOLID),
+					th->first,
+					th->end
+				);
+			}
+
+			if(insubmodel(s)){ // undo the per-submodel changes to the origin and the view copy
+				VectorCopy(world_transformed_modelorg, transformed_modelorg);
+				memmove(&v, v0, sizeof(v));
+			}
+		}
+
+		// done with this frame: wait on the second barrier, the one
+		// D_DrawSurfaces waits on after drawing its own band, then loop
+		// back and block on the first barrier until the next frame
+		fast_barrier_wait(&spansgohome);
+	}
+
+	return nil; // not reached: the loop above has no exit
+}
+
+static span_thread_t *threads;
+
+// Draw all visible surfaces of view v0_, splitting the screen rows between span threads.
+void
+D_DrawSurfaces(view_t *v0_)
+{
+	static int lastheight = -1;
+	static bool lockready = false;
+	span_thread_t *t;
+	int i, split, dt, n, y;
+
+	// init the spinlock exactly once; lastheight cannot guard this, it stays -1 without workers
+	if(!lockready){
+		pthread_spin_init(&spancache, PTHREAD_PROCESS_PRIVATE);
+		lockready = true;
+	}
+	if(nthreads > 1 && threads == nil){
+		pthread_barrierattr_t battr;
+		cpu_set_t set;
+		// an attributes object must be initialized before it is modified or used
+		pthread_barrierattr_init(&battr);
+		pthread_barrierattr_setpshared(&battr, PTHREAD_PROCESS_PRIVATE);
+		fast_barrier_init(&spansgobrr, &battr, nthreads);
+		fast_barrier_init(&spansgohome, &battr, nthreads);
+		threads = calloc(nthreads, sizeof(*threads));
+		if(threads == nil)
+			nthreads = 1; // no memory for the slots: draw everything on this thread
+		for(t = threads, i = 0; t != nil && i < nthreads; i++, t++){
+			t->n = i;
+			CPU_ZERO(&set);
+			CPU_SET(2*i, &set);
+			if(i == 0){ // slot 0 is the calling thread, it draws its band itself
+				sched_setaffinity(getpid(), sizeof(set), &set);
+			}else{
+				pthread_create(&t->tid, nil, spanthread, t);
+				pthread_setaffinity_np(t->tid, sizeof(set), &set);
+			}
+		}
+		spawned = threads != nil;
+	}
+	if(threads != nil && lastheight != vid.height){
+		// band heights shrink by dt from one slot to the next until zero, then grow again
+		lastheight = vid.height;
+		split = (nthreads+2)*nthreads/8;
+		dt = vid.height/2 / split;
+		n = dt*nthreads/2;
+		y = 0;
+		for(t = threads, i = 0; i < nthreads; i++, t++){
+			t->first = y;
+			t->end = y = y + n;
+			if((n -= dt) == 0){
+				dt = -dt;
+				n = -dt;
+			}
+		}
+		t[-1].end = vid.height; // make sure the last band reaches the bottom row
+	}
+
+	v0 = v0_; // stored for the workers, which read it after the start barrier
+	if(nthreads < 2 || (r_drawflags & DRAW_BLEND) != 0){
+		// overhead (lots of small objects + synchronization) not worth it - run it all in the same thread
+		spannothread(v0, 0, vid.height);
+	}else{
+		fast_barrier_wait(&spansgobrr);
+		spannothread(v0, threads[0].first, threads[0].end);
+		fast_barrier_wait(&spansgohome);
 	}
 }
--- a/d_local.h
+++ b/d_local.h
@@ -3,7 +3,7 @@
 enum {
 	DS_SPAN_LIST_END = -128,
 
-	SURFCACHE_SIZE_AT_320X200 = 600*1024,
+	SURFCACHE_SIZE_AT_320X200 = 8*1024*1024,
 };
 
 typedef struct {
@@ -79,11 +79,11 @@
 	SPAN_TURB,
 };
 
-void D_DrawSpans(espan_t *pspan, texvars_t *t, byte alpha, int spanfunc);
+void D_DrawSpans(espan_t *pspan, texvars_t *t, byte alpha, int spanfunc, int first, int end);
 
-void D_DrawSkyScans8 (espan_t *pspan);
+void D_DrawSkyScans8 (espan_t *pspan, int first, int end);
 
-surfcache_t	*D_CacheSurface(entity_t *e, msurface_t *ms, drawsurf_t *ds, int miplevel);
+surfcache_t	*D_CacheSurface(entity_t *e, msurface_t *ms, drawsurf_t *ds, int miplevel, void (*lock)(int n));
 
 extern int	*d_pscantable;
 extern int	d_scantable[MAXHEIGHT];
--- a/d_scan.c
+++ b/d_scan.c
@@ -69,7 +69,7 @@
 }
 
 void
-D_DrawSpans(espan_t *pspan, texvars_t *tv, byte alpha, int spanfunc)
+D_DrawSpans(espan_t *pspan, texvars_t *tv, byte alpha, int spanfunc, int first, int end)
 {
 	int			count, spancount, izistep, spancountminus1, spanshift, spanmax;
 	pixel_t		*pdest;
@@ -95,6 +95,8 @@
 	fogenabled = isfogged();
 
 	do{
+		if(pspan->v < first || pspan->v >= end)
+			continue;
 		pdest = dvars.fb + pspan->v*dvars.w + pspan->u;
 		pz = dvars.zb + pspan->v*dvars.w + pspan->u;
 		zi = tv->z.origin + pspan->v*tv->z.stepv + pspan->u*tv->z.stepu;
--- a/d_sky.c
+++ b/d_sky.c
@@ -36,7 +36,8 @@
 D_DrawSkyScans8
 =================
 */
-void D_DrawSkyScans8 (espan_t *pspan)
+void
+D_DrawSkyScans8(espan_t *pspan, int first, int end)
 {
 	int count, spancount, u, v, spancountminus1;
 	pixel_t *pdest, pix;
@@ -57,6 +58,8 @@
 
 	do
 	{
+		if(pspan->v < first || pspan->v >= end)
+			continue;
 		pdest = dvars.fb + pspan->v*dvars.w + pspan->u;
 		count = pspan->count;
 		pz = dvars.zb + pspan->v*dvars.w + pspan->u;
--- a/d_surf.c
+++ b/d_surf.c
@@ -181,7 +181,7 @@
 ================
 */
 surfcache_t *
-D_CacheSurface(entity_t *e, msurface_t *ms, drawsurf_t *ds, int miplevel)
+D_CacheSurface(entity_t *e, msurface_t *ms, drawsurf_t *ds, int miplevel, void (*lock)(int))
 {
 	surfcache_t *cache;
 
@@ -192,6 +192,8 @@
 	ds->lightadj[2] = d_lightstylevalue[ms->styles[2]];
 	ds->lightadj[3] = d_lightstylevalue[ms->styles[3]];
 
+	if(lock != nil)
+		lock(1);
 	// see if the cache holds apropriate data
 	cache = ms->cachespots[miplevel];
 
@@ -201,8 +203,11 @@
 			&& cache->lightadj[0] == ds->lightadj[0]
 			&& cache->lightadj[1] == ds->lightadj[1]
 			&& cache->lightadj[2] == ds->lightadj[2]
-			&& cache->lightadj[3] == ds->lightadj[3] )
+			&& cache->lightadj[3] == ds->lightadj[3] ){
+		if(lock != nil)
+			lock(0);
 		return cache;
+	}
 
 	// determine shape of surface
 	surfscale = 1.0 / (1<<miplevel);
@@ -228,6 +233,8 @@
 	cache->lightadj[3] = ds->lightadj[3];
 	ds->m = ms;
 	R_DrawSurface(e, ds);
+	if(lock != nil)
+		lock(0);
 
 	return ms->cachespots[miplevel];
 }
--- a/r_local.h
+++ b/r_local.h
@@ -101,8 +101,6 @@
 void R_DrawSolidClippedSubmodelPolygons (model_t *pmodel, view_t *v);
 void R_DrawSubmodelPolygons (model_t *pmodel, view_t *v, int clipflags);
 
-void R_AddPolygonEdges (emitpoint_t *pverts, int numverts, int miplevel);
-surf_t *R_GetSurf (void);
 void R_AliasDrawModel (alight_t *plighting, view_t *v);
 void R_BeginEdgeFrame (void);
 void R_ScanEdges(view_t *v);
@@ -110,13 +108,6 @@
 void R_InsertNewEdges (edge_t *edgestoadd, edge_t *edgelist);
 void R_StepActiveU (edge_t *pedge);
 void R_RemoveEdges (edge_t *pedge);
-
-extern void R_Surf8Start (void);
-extern void R_Surf8End (void);
-extern void R_Surf16Start (void);
-extern void R_Surf16End (void);
-extern void R_EdgeCodeStart (void);
-extern void R_EdgeCodeEnd (void);
 
 extern void R_RotateBmodel (entity_t *e, view_t *v);
 
--- a/screen.c
+++ b/screen.c
@@ -321,6 +321,7 @@
 		fps = host_framecount - lastcnt;
 		lastcnt = host_framecount;
 		lastframetime = t;
+fprintf(stderr, "%d\n", fps);
 	}
 	n = snprint(s, sizeof(s), "%d", fps);
 	Draw_String(vid.width - n*8, 0, s);