shithub: mc

Download patch

ref: de10733eadd13153e899a30538a4c21f95af4b9c
parent: 0b5b219f27ff55efda4f33df0ca3cd87140be712
parent: 156c048f0239b99b870cf1694c6886e39b85ae09
author: Ori Bernstein <ori@eigenstate.org>
date: Sat Aug 8 21:40:57 EDT 2020

merge

--- a/6/asm.h
+++ b/6/asm.h
@@ -46,6 +46,27 @@
 } Mode;
 
 typedef enum {
+	PassInNoPref,
+	PassInSSE,
+	PassInInt,
+	PassInMemory,
+} PassIn;
+
+typedef enum {
+	ArgVoid,
+	/* Either int or flt, depending on Loc* type */
+	ArgReg,
+	/* Small aggregates packed into registers */
+	ArgAggrI,
+	ArgAggrF,
+	ArgAggrII,
+	ArgAggrFI,
+	ArgAggrIF,
+	ArgAggrFF,
+	ArgBig,
+} ArgType;
+
+typedef enum {
 	Classbad,
 	Classint,
 	Classflt,
@@ -141,6 +162,7 @@
 	Htab *envoff;	/* Loc* -> int envoff map */
 	size_t stksz;	/* stack size */
 	Node *ret;	/* return value */
+	ArgType rettype;	/* how to actually get ret out */
 
 	Cfg  *cfg;	/* flow graph */
 	char isexport;	/* is this exported from the asm? */
@@ -174,6 +196,7 @@
 	Asmbb *curbb;
 
 	Node *ret;          /* we store the return into here */
+	ArgType rettype;    /* how ret actually gets out of the function */
 	Htab *spillslots;   /* reg id  => int stkoff */
 	Htab *reglocs;      /* decl id => Loc *reg */
 	Htab *stkoff;       /* decl id => int stkoff */
@@ -295,6 +318,8 @@
 Loc *coreg(Reg r, Mode m);
 int isfloatmode(Mode m);
 int isintmode(Mode m);
+int issubreg(Loc *, Loc *);
+int dumbmov(Loc *, Loc *);
 
 /* emitting instructions */
 Insn *mkinsn(int op, ...);
@@ -314,6 +339,9 @@
 size_t size(Node *n);
 ssize_t tyoffset(Type *ty, Node *memb);
 ssize_t offset(Node *aggr, Node *memb);
+size_t countargs(Type *t);
+ArgType classify(Type *t);
+int isaggregate(Type *t);
 int stacknode(Node *n);
 int floatnode(Node *n);
 void breakhere();
--- a/6/gengas.c
+++ b/6/gengas.c
@@ -107,12 +107,6 @@
 	}
 }
 
-static int
-issubreg(Loc *a, Loc *b)
-{
-	return rclass(a) == rclass(b) && a->mode != b->mode;
-}
-
 void
 iprintf(FILE *fd, Insn *insn)
 {
@@ -132,29 +126,22 @@
 				insn->args[1] = coreg(insn->args[1]->reg.colour, ModeL);
 			}
 		}
-		/* moving a reg to itself is dumb. */
-		if (insn->args[0]->reg.colour == insn->args[1]->reg.colour)
-                    return;
+		if(dumbmov(insn->args[0], insn->args[1]))
+			return;
 		break;
 	case Imovs:
-		if (insn->args[0]->reg.colour == Rnone || insn->args[1]->reg.colour == Rnone)
-			break;
-		/* moving a reg to itself is dumb. */
-		if (insn->args[0]->reg.colour == insn->args[1]->reg.colour)
+		if(dumbmov(insn->args[0], insn->args[1]))
 			return;
 		break;
 	case Imov:
 		assert(!isfloatmode(insn->args[0]->mode));
-		if (insn->args[0]->type != Locreg || insn->args[1]->type != Locreg)
-			break;
-		if (insn->args[0]->reg.colour == Rnone || insn->args[1]->reg.colour == Rnone)
-			break;
-		/* if one reg is a subreg of another, we can just use the right
-		 * mode to move between them. */
+		/* 
+		 * if one reg is a subreg of another, we can just use the right
+		 * mode to move between them, without any cost.
+		 */
 		if (issubreg(insn->args[0], insn->args[1]))
 			insn->args[0] = coreg(insn->args[0]->reg.colour, insn->args[1]->mode);
-		/* moving a reg to itself is dumb. */
-		if (insn->args[0]->reg.colour == insn->args[1]->reg.colour)
+		if(dumbmov(insn->args[0], insn->args[1]))
 			return;
 		break;
 	default:
@@ -357,6 +344,7 @@
 	is.envoff = fn->envoff;
 	is.globls = globls;
 	is.ret = fn->ret;
+	is.rettype = fn->rettype;
 	is.cfg = fn->cfg;
 	is.cwd = strdup(cwd);
 
--- a/6/genp9.c
+++ b/6/genp9.c
@@ -105,12 +105,6 @@
 	}
 }
 
-static int
-issubreg(Loc *a, Loc *b)
-{
-	return rclass(a) == rclass(b) && a->mode != b->mode;
-}
-
 static void
 iprintf(FILE *fd, Insn *insn)
 {
@@ -130,26 +124,22 @@
 				insn->args[1] = coreg(insn->args[1]->reg.colour, ModeL);
 			}
 		}
+		if(dumbmov(insn->args[0], insn->args[1]))
+			return;
 		break;
 	case Imovs:
-		if (insn->args[0]->reg.colour == Rnone || insn->args[1]->reg.colour == Rnone)
-			break;
-		/* moving a reg to itself is dumb. */
-		if (insn->args[0]->reg.colour == insn->args[1]->reg.colour)
+		if(dumbmov(insn->args[0], insn->args[1]))
 			return;
 		break;
 	case Imov:
 		assert(!isfloatmode(insn->args[0]->mode));
-		if (insn->args[0]->type != Locreg || insn->args[1]->type != Locreg)
-			break;
-		if (insn->args[0]->reg.colour == Rnone || insn->args[1]->reg.colour == Rnone)
-			break;
-		/* if one reg is a subreg of another, we can just use the right
-		 * mode to move between them. */
+		/* 
+		 * if one reg is a subreg of another, we can just use the right
+		 * mode to move between them, without any cost.
+		 */
 		if (issubreg(insn->args[0], insn->args[1]))
 			insn->args[0] = coreg(insn->args[0]->reg.colour, insn->args[1]->mode);
-		/* moving a reg to itself is dumb. */
-		if (insn->args[0]->reg.colour == insn->args[1]->reg.colour)
+		if(dumbmov(insn->args[0], insn->args[1]))
 			return;
 		break;
 	default:
@@ -360,6 +350,7 @@
 	is.envoff = fn->envoff;
 	is.globls = globls;
 	is.ret = fn->ret;
+	is.rettype = fn->rettype;
 	is.cfg = fn->cfg;
 	if (fn->hasenv)
 		is.envp = locreg(ModeQ);
--- a/6/isel.c
+++ b/6/isel.c
@@ -27,6 +27,10 @@
 	Rxmm4d, Rxmm5d, Rxmm6d, Rxmm7d,
 };
 regid intargregs[] = {Rrdi, Rrsi, Rrdx, Rrcx, Rr8, Rr9};
+#define Nfloatregrets 2
+#define Nintregrets 2
+regid fltretregs[] = {Rxmm0d, Rxmm1d};
+regid intretregs[] = {Rrax, Rrdx};
 
 /* used to decide which operator is appropriate
  * for implementing various conditional operators */
@@ -83,7 +87,49 @@
 	return ModeNone;
 }
 
+static Mode tymodepart(Type *t, int is_float, size_t displacement)
+{
+	assert(isstacktype(t));
+	size_t sz = tysize(t);
+
+	if (is_float) {
+		switch(sz - displacement) {
+		case 4: return ModeF; break;
+		default: return ModeD; break;
+		}
+	} else {
+		switch(sz - displacement) {
+		case 1: return ModeB; break;
+		case 2: return ModeW; break;
+		case 4: return ModeL; break;
+		default: return ModeQ; break;
+		}
+	}
+}
+
 static Mode
+forcefltmode(Mode m)
+{
+	assert(m != ModeNone);
+	switch (m) {
+	case ModeQ: return ModeD;
+	case ModeD: return ModeD;
+	default: return ModeF;
+	}
+}
+
+static Mode
+forceintmode(Mode m)
+{
+	assert(m != ModeNone);
+	switch (m) {
+	case ModeD: return ModeQ;
+	case ModeF: return ModeL;
+	default: return m;
+	}
+}
+
+static Mode
 mode(Node *n)
 {
 	if (n->type == Nexpr)
@@ -500,65 +546,186 @@
 	g(s, op, f, NULL);
 }
 
-static size_t
-countargs(Type *t)
+static void
+placearg(Isel *s, Node *argn, Loc *argloc, PassIn p, Loc *rsp, int vararg, size_t *nfloats, size_t *nints, size_t *argoff)
 {
-	size_t nargs;
+	/*
+	   placearg may be called when argn is stored at argloc, but it may also
+	   be called when argloc is a small piece of argn, as in the case when
+	   small structs are being passed. In those circumstances, p is PassInSSE
+	   or PassInInt, and argn is irrelevant. Therefore, argn should not be
+	   relied on when p is PassInSSE or PassInInt.
+	 */
+	Loc *src, *dst;
+	size_t a;
 
-	t = tybase(t);
-	nargs = t->nsub - 1;
-	if (isstacktype(t->sub[0]))
-		nargs++;
-	/* valists are replaced with hidden type parameter,
-	 * which we want on the stack for ease of ABI */
-	if (tybase(t->sub[t->nsub - 1])->type == Tyvalist)
-		nargs--;
-	return nargs;
+	if (p == PassInNoPref) {
+		if (stacknode(argn)) {
+			p = PassInMemory;
+		} else if (!vararg && isfloatmode(argloc->mode) && *nfloats < Nfloatregargs) {
+			p = PassInSSE;
+		} else if (!vararg && isintmode(argloc->mode) && *nints < Nintregargs) {
+			p = PassInInt;
+		} else {
+			p = PassInMemory;
+		}
+	}
+
+	switch (p) {
+	case PassInMemory:
+		if (stacknode(argn)) {
+			src = locreg(ModeQ);
+			g(s, Ilea, argloc, src, NULL);
+			a = tyalign(exprtype(argn));
+			blit(s, rsp, src, *argoff, 0, size(argn), a);
+			*argoff += size(argn);
+		} else {
+			dst = locmem(*argoff, rsp, NULL, argloc->mode);
+			argloc = inri(s, argloc);
+			stor(s, argloc, dst);
+			*argoff += size(argn);
+		}
+		break;
+	case PassInSSE:
+		dst = coreg(floatargregs[*nfloats], forcefltmode(argloc->mode));
+		argloc = inri(s, argloc);
+		if (isfloatmode(argloc->mode)) {
+			g(s, Imovs, argloc, dst, NULL);
+		} else {
+			g(s, Imov, argloc, dst, NULL);
+		}
+		(*nfloats)++;
+		break;
+	case PassInInt:
+		dst = coreg(intargregs[*nints], forceintmode(argloc->mode));
+		argloc = inri(s, argloc);
+		g(s, Imov, argloc, dst, NULL);
+		(*nints)++;
+		break;
+	case PassInNoPref: /* impossible */
+		die("cannot determine how to pass arg");
+		break;
+	}
 }
 
+static int
+sufficientregs(ArgType a, size_t nfloats, size_t nints)
+{
+	/* {int, float} argument registers each ArgType consumes; unlisted ones take none */
+	static const struct { int ireg, freg; } needed[] = {
+	[ArgAggrI]   = {1, 0},
+	[ArgAggrF]   = {0, 1},
+	[ArgAggrII]  = {2, 0},
+	[ArgAggrFI]  = {1, 1},
+	[ArgAggrIF]  = {1, 1},
+	[ArgAggrFF]  = {0, 2},
+	[ArgBig]     = {0, 0},	/* highest ArgType: sizes the table so no index is out of range */
+	};
+	/* nonzero iff the registers still unused cover what passing `a` requires */
+	return (needed[a].freg + nfloats <= Nfloatregargs) && (needed[a].ireg + nints <= Nintregargs);
+}
+
 static Loc *
+plus8(Isel *s, Loc *base)
+{
+	Loc *forcedreg = locreg(ModeQ);
+	if (base->type == Loclbl || (base->type == Locmeml && !base->mem.base)) {
+		forcedreg = loclitl(base->lbl);
+	} else {
+		g(s, Ilea, base, forcedreg, NULL);
+	}
+	return locmem(8, forcedreg, NULL, ModeQ);
+}
+
+static void
 gencall(Isel *s, Node *n)
 {
-	Loc *src, *dst, *arg;	/* values we reduced */
-	size_t argsz, argoff, nargs, vasplit;
+	Loc *arg;	/* values we reduced */
+	size_t argsz, argoff, nargs, falseargs, vasplit;
 	size_t nfloats, nints;
-	Loc *retloc, *rsp, *ret;	/* hard-coded registers */
+	Loc *retloc1, *retloc2, *rsp;	/* hard-coded registers */
+	Loc *ret;
+	size_t ri, rf;
 	Loc *stkbump;	/* calculated stack offset */
 	Type *t, *fn;
 	Node **args;
-	size_t i, a;
+	Node *retnode;
+	ArgType rettype;
+	size_t i;
 	int vararg;
 
 	rsp = locphysreg(Rrsp);
+
 	t = exprtype(n);
-	if (tybase(t)->type == Tyvoid || isstacktype(t)) {
-		retloc = NULL;
-		ret = NULL;
-	} else if (istyfloat(t)) {
-		retloc = coreg(Rxmm0d, mode(n));
-		ret = locreg(mode(n));
-	} else {
-		retloc = coreg(Rrax, mode(n));
-		ret = locreg(mode(n));
+	ri = 0;
+	rf = 0;
+	retloc1 = NULL;
+	retloc2 = NULL;
+	rettype = classify(t);
+
+	switch (rettype) {
+	case ArgVoid:
+	case ArgBig:
+		break;
+	case ArgReg:
+		retloc1 = coreg((istyfloat(t)) ?  Rxmm0d : Rrax, mode(n));
+		break;
+	case ArgAggrI:
+		retloc1 = coreg(intretregs[ri++], tymodepart(t, 0, 0));
+		break;
+	case ArgAggrF:
+		retloc1 = coreg(fltretregs[rf++], tymodepart(t, 1, 0));
+		break;
+	case ArgAggrII:
+		retloc1 = coreg(intretregs[ri++], tymodepart(t, 0, 0));
+		retloc2 = coreg(intretregs[ri++], tymodepart(t, 0, 8));
+		break;
+	case ArgAggrIF:
+		retloc1 = coreg(intretregs[ri++], tymodepart(t, 0, 0));
+		retloc2 = coreg(fltretregs[rf++], tymodepart(t, 1, 8));
+		break;
+	case ArgAggrFI:
+		retloc1 = coreg(fltretregs[rf++], tymodepart(t, 1, 0));
+		retloc2 = coreg(intretregs[ri++], tymodepart(t, 0, 8));
+		break;
+	case ArgAggrFF:
+		retloc1 = coreg(fltretregs[rf++], tymodepart(t, 1, 0));
+		retloc2 = coreg(fltretregs[rf++], tymodepart(t, 1, 8));
+		break;
 	}
+
 	fn = tybase(exprtype(n->expr.args[0]));
 	/* calculate the number of args we expect to see, adjust
 	 * for a hidden return argument. */
 	vasplit = countargs(fn);
 	argsz = 0;
-	if (exprop(n) == Ocall) {
-		args = &n->expr.args[1];
-		nargs = n->expr.nargs - 1;
-	} else {
-		args = &n->expr.args[2];
-		nargs = n->expr.nargs - 2;
+
+	/*
+	 * { the function itself, [optional environment], [optional return information], real arg 1, ... }
+	 */
+	falseargs = 1;
+	if (exprop(n) == Ocallind) {
+		falseargs++;
 	}
+	if (rettype != ArgVoid) {
+		retnode = n->expr.args[falseargs];
+		if (rettype != ArgBig) {
+			falseargs++;
+		}
+	}
+	args = &n->expr.args[falseargs];
+	nargs = n->expr.nargs - falseargs;
 	/* Have to calculate the amount to bump the stack
 	 * pointer by in one pass first, otherwise if we push
 	 * one at a time, we evaluate the args in reverse order.
 	 * Not good.
 	 *
-	 * Skip the first operand, since it's the function itself */
+	 * Skip the first operand, since it's the function itself
+	 *
+	 * Strictly speaking, we might waste a little space here,
+	 * since some of these args might actually get passed in
+	 * registers.
+	 */
 	for (i = 0; i < nargs; i++) {
 		argsz = align(argsz, min(size(args[i]), Ptrsz));
 		argsz += size(args[i]);
@@ -576,44 +743,98 @@
 	vararg = 0;
 	for (i = 0; i < nargs; i++) {
 		arg = selexpr(s, args[i]);
-		argoff = alignto(argoff, exprtype(args[i]));
+		t = exprtype(args[i]);
+		argoff = alignto(argoff, t);
+		ArgType a = ArgBig;
 		if (i >= vasplit)
 			vararg = 1;
 		else
 			argoff = align(argoff, 8);
-		if (stacknode(args[i])) {
-			src = locreg(ModeQ);
-			g(s, Ilea, arg, src, NULL);
-			a = tyalign(exprtype(args[i]));
-			blit(s, rsp, src, argoff, 0, size(args[i]), a);
-			argoff += size(args[i]);
-		} else if (!vararg && isfloatmode(arg->mode) && nfloats < Nfloatregargs) {
-			dst = coreg(floatargregs[nfloats], arg->mode);
-			arg = inri(s, arg);
-			g(s, Imovs, arg, dst, NULL);
-			nfloats++;
-		} else if (!vararg && isintmode(arg->mode) && nints < Nintregargs) {
-			dst = coreg(intargregs[nints], arg->mode);
-			arg = inri(s, arg);
-			g(s, Imov, arg, dst, NULL);
-			nints++;
-		} else {
-			dst = locmem(argoff, rsp, NULL, arg->mode);
-			arg = inri(s, arg);
-			stor(s, arg, dst);
-			argoff += size(args[i]);
+
+		if (!vararg) {
+			a = classify(t);
 		}
+
+		if (!sufficientregs(a, nfloats, nints)) {
+			a = ArgBig;
+		}
+
+		switch(a) {
+		case ArgVoid:
+			break;
+		case ArgReg:
+		case ArgBig:
+			/* placearg can figure this out */
+			placearg(s, args[i], arg, PassInNoPref, rsp, vararg, &nfloats, &nints, &argoff);
+			break;
+		case ArgAggrI:
+			placearg(s, args[i], arg, PassInInt, rsp, vararg, &nfloats, &nints, &argoff);
+			break;
+		case ArgAggrF:
+			placearg(s, args[i], arg, PassInSSE, rsp, vararg, &nfloats, &nints, &argoff);
+			break;
+		case ArgAggrII:
+			placearg(s, args[i],          arg , PassInInt, rsp, vararg, &nfloats, &nints, &argoff);
+			placearg(s, args[i], plus8(s, arg), PassInInt, rsp, vararg, &nfloats, &nints, &argoff);
+			break;
+		case ArgAggrIF:
+			placearg(s, args[i],          arg , PassInInt, rsp, vararg, &nfloats, &nints, &argoff);
+			placearg(s, args[i], plus8(s, arg), PassInSSE, rsp, vararg, &nfloats, &nints, &argoff);
+			break;
+		case ArgAggrFI:
+			placearg(s, args[i],          arg , PassInSSE, rsp, vararg, &nfloats, &nints, &argoff);
+			placearg(s, args[i], plus8(s, arg), PassInInt, rsp, vararg, &nfloats, &nints, &argoff);
+			break;
+		case ArgAggrFF:
+			placearg(s, args[i],          arg , PassInSSE, rsp, vararg, &nfloats, &nints, &argoff);
+			placearg(s, args[i], plus8(s, arg), PassInSSE, rsp, vararg, &nfloats, &nints, &argoff);
+			break;
+		}
+
 	}
 	call(s, n);
 	if (argsz)
 		g(s, Iadd, stkbump, rsp, NULL);
-	if (retloc) {
-		if (isfloatmode(retloc->mode))
-			g(s, Imovs, retloc, ret, NULL);
+
+	switch (rettype) {
+	case ArgVoid:
+	case ArgBig:
+		/*
+		 * No need to do anything. The return location, if any, was taken care of
+		 * as the hidden argument.
+		 */
+		break;
+	case ArgReg:
+		/* retnode is the actual thing we're storing in */
+		ret = varloc(s, retnode);
+		if (isfloatmode(retloc1->mode))
+			g(s, Imovs, retloc1, ret, NULL);
 		else
-			g(s, Imov, retloc, ret, NULL);
+			g(s, Imov, retloc1, ret, NULL);
+		break;
+	case ArgAggrI:
+		g(s, Imov, retloc1, locmem(0, inri(s, selexpr(s, retnode)), NULL, ModeQ), NULL);
+		break;
+	case ArgAggrF:
+		g(s, Imovs, retloc1, locmem(0, inri(s, selexpr(s, retnode)), NULL, ModeD), NULL);
+		break;
+	case ArgAggrII:
+		g(s, Imov, retloc1, locmem(0, inri(s, selexpr(s, retnode)), NULL, ModeQ), NULL);
+		g(s, Imov, retloc2, locmem(8, inri(s, selexpr(s, retnode)), NULL, ModeQ), NULL);
+		break;
+	case ArgAggrIF:
+		g(s, Imov,  retloc1, locmem(0, inri(s, selexpr(s, retnode)), NULL, ModeQ), NULL);
+		g(s, Imovs, retloc2, locmem(8, inri(s, selexpr(s, retnode)), NULL, ModeD), NULL);
+		break;
+	case ArgAggrFI:
+		g(s, Imovs, retloc1, locmem(0, inri(s, selexpr(s, retnode)), NULL, ModeD), NULL);
+		g(s, Imov,  retloc2, locmem(8, inri(s, selexpr(s, retnode)), NULL, ModeQ), NULL);
+		break;
+	case ArgAggrFF:
+		g(s, Imovs, retloc1, locmem(0, inri(s, selexpr(s, retnode)), NULL, ModeD), NULL);
+		g(s, Imovs, retloc2, locmem(8, inri(s, selexpr(s, retnode)), NULL, ModeD), NULL);
+		break;
 	}
-	return ret;
 }
 
 static Loc*
@@ -753,7 +974,7 @@
 		if (mode(args[0]) == ModeF) {
 			a = locreg(ModeF);
 			b = loclit(1LL << (31), ModeF);
-			g(s, Imovs, r, a);
+			g(s, Imovs, r, a, NULL);
 		} else if (mode(args[0]) == ModeD) {
 			a = locreg(ModeQ);
 			b = loclit(1LL << 63, ModeQ);
@@ -851,7 +1072,7 @@
 		break;
 	case Ocall:
 	case Ocallind:
-		r = gencall(s, n);
+		gencall(s, n);
 		break;
 	case Oret:
 		a = locstrlbl(s->cfg->end->lbls[0]);
@@ -982,6 +1203,51 @@
 	Rnone
 };
 
+static void
+movearg(Isel *s, Loc *dst, PassIn p, Mode m, size_t *nfloats, size_t *nints, size_t *argoff)
+{
+	Loc *a;
+	assert(m != ModeNone);
+
+	switch(p) {
+	case PassInInt:
+		a = coreg(intargregs[*nints], forceintmode(m));
+		g(s, Imov, a, dst, NULL);
+		(*nints)++;
+		break;
+	case PassInSSE:
+		a = coreg(floatargregs[*nfloats], forcefltmode(m));
+		g(s, Imovs, a, dst, NULL);
+		(*nfloats)++;
+		break;
+	default: /* no need to move if on stack */
+		break;
+	}
+}
+
+static void
+retrievearg(Isel *s, Node *argn, int vararg, size_t *nfloats, size_t *nints, size_t *argoff)
+{
+	Loc *l;
+
+	if (stacknode(argn)) {
+		htput(s->stkoff, argn, itop(-(*argoff + 2*Ptrsz)));
+		*argoff += size(argn);
+	} else if (!vararg && isfloatmode(mode(argn)) && *nfloats < Nfloatregargs) {
+		l = loc(s, argn);
+		movearg(s, l, PassInSSE, forcefltmode(mode(argn)), nfloats, nints, argoff);
+		htput(s->reglocs, argn, l);
+	} else if (!vararg && isintmode(mode(argn)) && *nints < Nintregargs) {
+		l = loc(s, argn);
+		movearg(s, l, PassInInt, forceintmode(mode(argn)), nfloats, nints, argoff);
+		htput(s->reglocs, argn, l);
+	} else if (tybase(decltype(argn))->type != Tyvoid) {
+		/* varargs go on the stack */
+		htput(s->stkoff, argn, itop(-(*argoff + 2*Ptrsz)));
+		*argoff += size(argn);
+	}
+}
+
 void
 addarglocs(Isel *s, Func *fn)
 {
@@ -989,7 +1255,7 @@
 	size_t argoff;
 	int vararg;
 	Node *arg;
-	Loc *a, *l;
+	Type *t;
 
 	argoff = 0;
 	nfloats = 0;
@@ -998,30 +1264,59 @@
 	nargs = countargs(fn->type);
 	for (i = 0; i < fn->nargs; i++) {
 		arg = fn->args[i];
-		argoff = alignto(argoff, decltype(arg));
+		t = decltype(arg);
+		argoff = alignto(argoff, t);
+		ArgType a = ArgBig;
+		Loc *l = NULL;
 		if (i >= nargs)
 			vararg = 1;
 		else
 			argoff = align(argoff, 8);
-		if (stacknode(arg)) {
-			htput(s->stkoff, arg, itop(-(argoff + 2*Ptrsz)));
-			argoff += size(arg);
-		} else if (!vararg && isfloatmode(mode(arg)) && nfloats < Nfloatregargs) {
-			a = coreg(floatargregs[nfloats], mode(arg));
+
+		if (!vararg) {
+			a = classify(t);
+		}
+
+		if (!sufficientregs(a, nfloats, nints)) {
+			a = ArgBig;
+		}
+
+		switch(a) {
+		case ArgVoid:
+			break;
+		case ArgReg:
+		case ArgBig:
+			/* retrievearg can figure this out */
+			retrievearg(s, arg, vararg, &nfloats, &nints, &argoff);
+			break;
+		case ArgAggrI:
 			l = loc(s, arg);
-			g(s, Imovs, a, l, NULL);
-			htput(s->reglocs, arg, l);
-			nfloats++;
-		} else if (!vararg && isintmode(mode(arg)) && nints < Nintregargs) {
-			a = coreg(intargregs[nints], mode(arg));
+			movearg(s, l, PassInInt, ModeQ, &nfloats, &nints, &argoff);
+			break;
+		case ArgAggrF:
 			l = loc(s, arg);
-			g(s, Imov, a, l, NULL);
-			htput(s->reglocs, arg, l);
-			nints++;
-		} else if (tybase(decltype(arg))->type != Tyvoid) {
-			/* varargs go on the stack */
-			htput(s->stkoff, arg, itop(-(argoff + 2*Ptrsz)));
-			argoff += size(arg);
+			movearg(s, l, PassInSSE, ModeD, &nfloats, &nints, &argoff);
+			break;
+		case ArgAggrII:
+			l = loc(s, arg);
+			movearg(s,          l , PassInInt, ModeQ, &nfloats, &nints, &argoff);
+			movearg(s, plus8(s, l), PassInInt, ModeQ, &nfloats, &nints, &argoff);
+			break;
+		case ArgAggrIF:
+			l = loc(s, arg);
+			movearg(s,          l , PassInInt, ModeQ, &nfloats, &nints, &argoff);
+			movearg(s, plus8(s, l), PassInSSE, ModeD, &nfloats, &nints, &argoff);
+			break;
+		case ArgAggrFI:
+			l = loc(s, arg);
+			movearg(s,          l , PassInSSE, ModeD, &nfloats, &nints, &argoff);
+			movearg(s, plus8(s, l), PassInInt, ModeQ, &nfloats, &nints, &argoff);
+			break;
+		case ArgAggrFF:
+			l = loc(s, arg);
+			movearg(s,          l , PassInSSE, ModeD, &nfloats, &nints, &argoff);
+			movearg(s, plus8(s, l), PassInSSE, ModeD, &nfloats, &nints, &argoff);
+			break;
 		}
 	}
 }
@@ -1065,24 +1360,63 @@
 	Loc *rsp, *rbp;
 	Loc *ret;
 	size_t i;
+	size_t ri = 0, rf = 0;
 
 	rsp = locphysreg(Rrsp);
 	rbp = locphysreg(Rrbp);
-	if (s->ret) {
+	switch (s->rettype) {
+	case ArgVoid:
+		break;
+	case ArgReg:
+		/* s->ret is a value, and will be returned that way */
 		ret = loc(s, s->ret);
 		if (istyfloat(exprtype(s->ret)))
 			g(s, Imovs, ret, coreg(Rxmm0d, ret->mode), NULL);
 		else
 			g(s, Imov, ret, coreg(Rax, ret->mode), NULL);
+		break;
+	case ArgBig:
+		/* s->ret is an address, and will be returned that way */
+		ret = loc(s, s->ret);
+		g(s, Imov, ret, coreg(Rax, ret->mode), NULL);
+		break;
+	case ArgAggrI:
+		/* s->ret is an address, and will be returned as values */
+		ret = loc(s, s->ret);
+		load(s, locmem(0, ret, NULL, ModeQ), coreg(intretregs[ri++], ModeQ));
+		break;
+	case ArgAggrF:
+		ret = loc(s, s->ret);
+		load(s, locmem(0, ret, NULL, ModeD), coreg(fltretregs[rf++], ModeD));
+		break;
+	case ArgAggrII:
+		ret = loc(s, s->ret);
+		load(s, locmem(0, ret, NULL, ModeQ), coreg(intretregs[ri++], ModeQ));
+		load(s, locmem(8, ret, NULL, ModeQ), coreg(intretregs[ri++], ModeQ));
+		break;
+	case ArgAggrIF:
+		ret = loc(s, s->ret);
+		load(s, locmem(0, ret, NULL, ModeQ), coreg(intretregs[ri++], ModeQ));
+		load(s, locmem(8, ret, NULL, ModeD), coreg(fltretregs[rf++], ModeD));
+		break;
+	case ArgAggrFI:
+		ret = loc(s, s->ret);
+		load(s, locmem(0, ret, NULL, ModeD), coreg(fltretregs[rf++], ModeD));
+		load(s, locmem(8, ret, NULL, ModeQ), coreg(intretregs[ri++], ModeQ));
+		break;
+	case ArgAggrFF:
+		ret = loc(s, s->ret);
+		load(s, locmem(0, ret, NULL, ModeD), coreg(fltretregs[rf++], ModeD));
+		load(s, locmem(8, ret, NULL, ModeD), coreg(fltretregs[rf++], ModeD));
+		break;
 	}
+
 	/* restore registers */
-	for (i = 0; savedregs[i] != Rnone; i++) {
-		if (isfloatmode(s->calleesave[i]->mode)) {
+	for (i = 0; savedregs[i] != Rnone; i++)
+		if (isfloatmode(s->calleesave[i]->mode))
 			g(s, Imovs, s->calleesave[i], locphysreg(savedregs[i]), NULL);
-		} else {
+		else
 			g(s, Imov, s->calleesave[i], locphysreg(savedregs[i]), NULL);
-		}
-	}
 	/* leave function */
 	g(s, Imov, rbp, rsp, NULL);
 	g(s, Ipop, rbp, NULL);
@@ -1105,6 +1439,56 @@
 	return as;
 }
 
+static void
+handlesmallstructargs(Isel *is, Func *fn)
+{
+	/*
+	 * Perform a last-minute adjustment to fn->stksz to handle small structs
+	 * that will be passed in registers. We do this inside selfunc so that
+	 * generics will be specialized.
+	 */
+	size_t vasplit = countargs(fn->type);
+	size_t i = 0;
+	Type *t;
+	Node *arg;
+
+	for (i = 0; i < fn->nargs; i++) {
+		arg = fn->args[i];
+		t = decltype(arg);
+		int vararg = 0;
+		ArgType a = ArgBig;
+
+		if (i >= vasplit)
+			vararg = 1;
+
+		if (!vararg) {
+			a = classify(t);
+		}
+
+		switch(a) {
+		case ArgVoid:
+		case ArgReg:
+		case ArgBig:
+			/* No need for any extra space for this arg */
+			break;
+		case ArgAggrI:
+		case ArgAggrF:
+			fn->stksz += 8;
+			fn->stksz = align(fn->stksz, min(8, Ptrsz));
+			htput(fn->stkoff, fn->args[i], itop(fn->stksz));
+			break;
+		case ArgAggrII:
+		case ArgAggrIF:
+		case ArgAggrFI:
+		case ArgAggrFF:
+			fn->stksz += 16;
+			fn->stksz = align(fn->stksz, min(16, Ptrsz));
+			htput(fn->stkoff, fn->args[i], itop(fn->stksz));
+			break;
+		}
+	}
+}
+
 void
 selfunc(Isel *is, Func *fn, Htab *globls, Htab *strtab)
 {
@@ -1130,6 +1514,7 @@
 		g(is, Iloc, locstrlbl(buf), NULL);
 	}
 
+	handlesmallstructargs(is, fn);
 	prologue(is, fn, fn->stksz);
 	lastline = -1;
 	for (j = 0; j < fn->cfg->nbb - 1; j++) {
--- a/6/locs.c
+++ b/6/locs.c
@@ -176,6 +176,8 @@
 Loc *
 coreg(Reg r, Mode m)
 {
+	assert(m != ModeNone);
+
 	Reg crtab[][Nmode + 1] = {
 		[Ral]  = {Rnone, Ral,  Rax,  Reax, Rrax},
 		[Rcl]  = {Rnone, Rcl,  Rcx,  Recx, Rrcx},
--- a/6/ra.c
+++ b/6/ra.c
@@ -156,6 +156,31 @@
 	return Classbad;
 }
 
+int
+issubreg(Loc *a, Loc *b)
+{
+	if(a->type != Locreg || b->type != Locreg)
+		return 0;
+	if(a->reg.colour == Rnone || b->reg.colour == Rnone)
+		return 0;
+	return rclass(a) == rclass(b) && a->mode != b->mode;
+}
+
+int
+dumbmov(Loc *a, Loc *b)
+{
+	/*
+	 * moving a reg to itself is dumb,
+	 * but we generate a lot of these as part
+	 * of register coalescing.
+	 */
+	if (a->type != Locreg || b->type != Locreg)
+		return 0;
+	if (a->reg.colour == Rnone || b->reg.colour == Rnone)
+		return 0;
+	return a->reg.colour == b->reg.colour;
+}
+
 /* %esp, %ebp are not in the allocatable pool */
 static int
 isfixreg(Loc *l)
--- a/6/simp.c
+++ b/6/simp.c
@@ -35,8 +35,8 @@
 	/* return handling */
 	Node *endlbl;
 	Node *ret;
+	ArgType rettype;
 	int hasenv;
-	int isbigret;
 
 	/* location handling */
 	Node **blobs;
@@ -1065,6 +1065,7 @@
 	size_t i, nargs;
 	Node **args;
 	Type *ft;
+	ArgType rettype;
 	Op op;
 
 	/* NB: If we called rval() on a const function, we would end up with
@@ -1089,8 +1090,23 @@
 		lappend(&args, &nargs, getenvptr(s, fn));
 	}
 
-	if (exprtype(n)->type != Tyvoid && isstacktype(exprtype(n)))
+	rettype = classify(exprtype(n));
+	switch (rettype) {
+	case ArgVoid:
+		break;
+	case ArgBig:
+	case ArgAggrI:
+	case ArgAggrF:
+	case ArgAggrII:
+	case ArgAggrIF:
+	case ArgAggrFI:
+	case ArgAggrFF:
 		lappend(&args, &nargs, addr(s, r, exprtype(n)));
+		break;
+	case ArgReg:
+		lappend(&args, &nargs, r);
+		break;
+	}
 
 	for (i = 1; i < n->expr.nargs; i++) {
 		if (i < ft->nsub && tybase(ft->sub[i])->type == Tyvalist)
@@ -1110,11 +1126,7 @@
 
 	call = mkexprl(n->loc, op, args, nargs);
 	call->expr.type = exprtype(n);
-	if (r && !isstacktype(exprtype(n))) {
-		append(s, set(r, call));
-	} else {
-		append(s, call);
-	}
+	append(s, call);
 	return r;
 }
 
@@ -1237,20 +1249,40 @@
 		fatal(n, "'_' may not be an rvalue");
 		break;
 	case Oret:
-		if (s->isbigret) {
+		/*
+		 * Compute and put the correct value into s->ret. In the case of ArgBig
+		 * and ArgReg, exfiltrate the value from the function. In the case of
+		 * ArgAggr_XYZ, put a pointer to the value where the function
+		 * epilogue can access it.
+		 */
+		switch (s->rettype) {
+		case ArgAggrI:
+		case ArgAggrF:
+		case ArgAggrII:
+		case ArgAggrIF:
+		case ArgAggrFI:
+		case ArgAggrFF:
+			t = s->ret;
+			u = rval(s, args[0], NULL);
+			u = addr(s, u, exprtype(args[0]));
+			append(s, set(t, u));
+			break;	/* the ArgBig arm below would lower args[0] a second time */
+		case ArgBig:
 			t = rval(s, args[0], NULL);
 			t = addr(s, t, exprtype(args[0]));
 			u = disp(n->loc, size(args[0]));
 			v = mkexpr(n->loc, Oblit, s->ret, t, u, NULL);
 			append(s, v);
-		} else {
+			break;
+		case ArgVoid:
+			rval(s, args[0], NULL);
+			break;
+		case ArgReg:
 			t = s->ret;
 			u = rval(s, args[0], NULL);
-			/* void calls return nothing */
-			if (t) {
-				t = set(t, u);
-				append(s, t);
-			}
+			t = set(t, u);
+			append(s, t);
+			break;
 		}
 		append(s, mkexpr(n->loc, Oret, NULL));
 		break;
@@ -1371,17 +1403,30 @@
 	s->nstmts = 0;
 	s->stmts = NULL;
 	s->endlbl = genlbl(f->loc);
-	s->ret = NULL;
+	s->rettype = ArgVoid;
 
 	/* make a temp for the return type */
 	ty = f->func.type->sub[0];
-	if (isstacktype(ty)) {
-		s->isbigret = 1;
+	s->rettype = classify(ty);
+
+	switch(s->rettype) {
+	case ArgVoid:
+		break;
+	case ArgAggrI:
+	case ArgAggrF:
+	case ArgAggrII:
+	case ArgAggrIF:
+	case ArgAggrFI:
+	case ArgAggrFF:
 		s->ret = gentemp(f->loc, mktyptr(f->loc, ty), &dcl);
+		break;
+	case ArgBig:
+		s->ret = gentemp(f->loc, mktyptr(f->loc, ty), &dcl);
 		declarearg(s, dcl);
-	} else if (tybase(ty)->type != Tyvoid) {
-		s->isbigret = 0;
-		s->ret = gentemp(f->loc, ty, &dcl);
+		break;
+	case ArgReg:
+		s->ret = gentemp(f->loc, ty, NULL);
+		break;
 	}
 
 	for (i = 0; i < f->func.nargs; i++) {
@@ -1486,6 +1531,7 @@
 	fn->stkoff = s->stkoff;
 	fn->envoff = s->envoff;
 	fn->ret = s->ret;
+	fn->rettype = s->rettype;
 	fn->args = s->args;
 	fn->nargs = s->nargs;
 	fn->cfg = cfg;
--- a/6/typeinfo.c
+++ b/6/typeinfo.c
@@ -331,7 +331,7 @@
 		break;
 	case Tytuple:
 		for (i = 0; i < ty->nsub; i++)
-			align = max(align, tyalign(ty->sub[0]));
+			align = max(align, tyalign(ty->sub[i]));
 		break;
 	case Tyunion:
 		align = 4;
@@ -408,3 +408,188 @@
 	return tyoffset(exprtype(aggr), memb);
 }
 
+size_t
+countargs(Type *t)
+{
+	size_t nargs;
+
+	t = tybase(t);
+	nargs = t->nsub - 1;
+	if (classify(t->sub[0]) == ArgBig)
+		nargs++;
+	/* valists are replaced with hidden type parameter,
+	 * which we want on the stack for ease of ABI */
+	if (tybase(t->sub[t->nsub - 1])->type == Tyvalist)
+		nargs--;
+	return nargs;
+}
+
+static void join_classification(PassIn *current, PassIn new)
+{
+	/* merge one more field into its eightbyte: unset < SSE < Int < Memory */
+	if (*current == PassInNoPref)
+		*current = new;
+	else if (*current == PassInMemory || new == PassInMemory)
+		*current = PassInMemory;	/* checked before Int so memory is never downgraded */
+	else if (*current == PassInInt || new == PassInInt)
+		*current = PassInInt;
+}
+
+/* Walks t, merging into p[0]/p[1] how each 8-byte half of the value is passed;
+ * *total_offset is the running byte offset. Reaching past byte 16 marks both halves memory. */
+static void
+classify_recursive(Type *t, PassIn *p, size_t *total_offset)
+{
+	size_t i = 0, sz;
+	size_t cur_offset = *total_offset;
+	PassIn *cur = 0;
+
+	if (!t)
+		die("cannot pass empty type.");
+	sz = tysize(t);	/* deferred until t is known to be non-NULL */
+	if (cur_offset + sz > 16) {
+		p[0] = PassInMemory;
+		p[1] = PassInMemory;
+		return;
+	}
+	cur = &p[cur_offset / 8];
+	switch(t->type) {
+	case Tyvoid: break;
+	case Tybool:
+	case Tybyte:
+	case Tychar:
+	case Tyint:
+	case Tyint16:
+	case Tyint32:
+	case Tyint64:
+	case Tyint8:
+	case Typtr:
+	case Tyuint:
+	case Tyuint16:
+	case Tyuint32:
+	case Tyuint64:
+	case Tyuint8:
+		join_classification(cur, PassInInt);
+		break;
+	case Tyslice:
+		/* Slices are too myrddin-specific, they go on the stack. */
+		join_classification(&p[0], PassInMemory);
+		join_classification(&p[1], PassInMemory);
+		break;
+	case Tyflt32:
+	case Tyflt64:
+		join_classification(cur, PassInSSE);
+		break;
+	case Tyname:
+		classify_recursive(t->sub[0], p, total_offset);
+		break;
+	case Tybad:
+	case Tycode:
+	case Tyfunc:
+	case Tygeneric:
+	case Typaram:
+	case Tyunres:
+	case Tyvalist:
+	case Tyvar:
+	case Ntypes:
+		/* We shouldn't even be in this function */
+		join_classification(cur, PassInMemory);
+		break;
+	case Tytuple:
+		for (i = 0; i < t->nsub; ++i) {
+			*total_offset = alignto(*total_offset, t->sub[i]);
+			classify_recursive(t->sub[i], p, total_offset);
+		}
+		*total_offset = alignto(*total_offset, t);
+		break;
+	case Tystruct:
+		for (i = 0; i < t->nmemb; ++i) {
+			Type *fieldt = decltype(t->sdecls[i]);
+			*total_offset = alignto(*total_offset, fieldt);
+			classify_recursive(fieldt, p, total_offset);
+		}
+		*total_offset = alignto(*total_offset, t);
+		break;
+	case Tyunion:
+		/* only enums are simple enough for C interop; every other union is PassInMemory */
+		if (isenum(t))
+			join_classification(cur, PassInInt);
+		else
+			join_classification(cur, PassInMemory);
+		break;
+	case Tyarray:
+		if (t->asize) {
+			t->asize = fold(t->asize, 1);
+			assert(exprop(t->asize) == Olit);
+			for (i = 0; i < t->asize->expr.args[0]->lit.intval; ++i) {
+				classify_recursive(t->sub[0], p, total_offset);
+			}
+		}
+		break;
+	}
+
+	*total_offset = align(cur_offset + sz, tyalign(t));
+}
+
+int
+isaggregate(Type *t)
+{
+	t = tybase(t);
+	return (t->type == Tystruct || t->type == Tyarray || t->type == Tytuple ||
+		(t->type == Tyunion && !isenum(t)));
+}
+
+ArgType
+classify(Type *t)
+{
+	size_t sz = tysize(t);
+	size_t total_offset = 0;
+
+	/* p must be of length exactly 2 */
+	PassIn pi[2] = { PassInNoPref, PassInNoPref };
+
+	if (tybase(t)->type == Tyvoid) {
+		return ArgVoid;
+	} else if (isstacktype(t)) {
+		if (isaggregate(t) && sz <= 16) {
+			classify_recursive(t, pi, &total_offset);
+			if (pi[0] == PassInMemory || pi[1] == PassInMemory) {
+				return ArgBig;
+			}
+
+			switch(pi[0]) {
+			case PassInInt:
+				if (sz <= 8) {
+					return ArgAggrI;
+				}
+				switch(pi[1]) {
+				case PassInInt: return ArgAggrII;
+				case PassInSSE: return ArgAggrIF;
+				default:
+					die("Impossible return from classify_recursive");
+					break;
+				}
+				break;
+			case PassInSSE:
+				if (sz <= 8) {
+					return ArgAggrF;
+				}
+				switch(pi[1]) {
+				case PassInInt: return ArgAggrFI;
+				case PassInSSE: return ArgAggrFF;
+				default:
+					die("Impossible return from classify_recursive");
+					break;
+				}
+				break;
+			default:
+				die("Impossible return from classify_recursive");
+				break;
+			}
+		}
+
+		return ArgBig;
+	}
+
+	return ArgReg;
+}
--- a/configure
+++ b/configure
@@ -71,7 +71,8 @@
         echo export INST_MAN=$prefix/man/man >> config.mk
         echo 'const Sys = "FreeBSD"' >> mbld/config.myr
         echo 'const Linkcmd = ["ld", "--gc-sections"]' >> mbld/config.myr
-        echo 'const Dlflags : byte[:][:] = [][:]' >> mbld/config.myr
+        echo 'const Dlflags : byte[:][:] = ["-L/usr/lib", '\
+		'"-dynamic-linker", "/libexec/ld-elf.so.1"][:]' >> mbld/config.myr
         echo "const Manpath = \"man/man\"" >> mbld/config.myr
         ;;
     *NetBSD*)
@@ -87,8 +88,8 @@
         echo export INST_MAN=$prefix/man/man >> config.mk
         echo 'const Sys = "OpenBSD"' >> mbld/config.myr
         echo 'const Linkcmd = ["ld", "-nopie", "--gc-sections"]' >> mbld/config.myr
-        echo 'const Dlflags = ["-dynamic-linker",' \
-            '"/usr/libexec/ld.so"]' >> mbld/config.myr
+	echo 'const Dlflags = ["-L/usr/lib",'\
+		'"-dynamic-linker", "/usr/libexec/ld.so"]' >> mbld/config.myr
         echo "const Manpath = \"man/man\"" >> mbld/config.myr
         ;;
     *)
--- a/lib/std/varargs.myr
+++ b/lib/std/varargs.myr
@@ -58,7 +58,7 @@
 	ty = vatype(ap)
 	match typedesc(ty)
 	| `Tyslice enc:	-> [.args=sliceptr(ap.args), .tc=[.nelt=slicelen(ap.args), .rem=enc, .isiter=false]]
-	| `Tytuple tc:	-> [.args=ap.args, .tc=tc]
+	| `Tytuple tc:	-> [.args=cursoralign(ap.args, ty), .tc=tc]
 	| `Tystruct tc:	-> [.args=cursoralign(ap.args, ty), .tc=tc]
 	| `Tyarray (sz, enc):	-> [.args=ap.args, .tc=[.nelt=sz, .rem=enc, .isiter=false]]
 	| `Tyname (name, enc):	-> [.args=ap.args, .tc=typeenccursor(enc)]
--- a/mbld/libs.myr
+++ b/mbld/libs.myr
@@ -23,7 +23,7 @@
 ;;
 
 /* Keep in sync with parse/parse.h */
-const Abiversion = 22
+const Abiversion = 23
 
 const builtlib = {b, mt, dep, dyndep
 	var ldep, l, u
--- a/parse/parse.h
+++ b/parse/parse.h
@@ -1,4 +1,4 @@
-#define Abiversion 22
+#define Abiversion 23
 
 typedef struct Srcloc Srcloc;
 typedef struct Tysubst Tysubst;
--- a/rt/start-freebsd.s
+++ b/rt/start-freebsd.s
@@ -1,9 +1,15 @@
 .data
 /* sys.__cenvp : byte## */
+.globl environ
 .globl sys$__cenvp
+environ:
 sys$__cenvp:
-    .quad 0
+	.quad 0
 
+.globl __progname
+__progname:
+	.quad 0
+
 .globl thread$__tls
 thread$__tls:
     .fill 88 /* sizeof(tlshdr) + (8 * sizeof(void#)) = 24 + 64 */
@@ -25,6 +31,8 @@
 	/* load argc, argv, envp from stack */
 	movq	(%rdi),%rax		/* argc */
 	leaq	8(%rdi),%rbx		/* argv */
+	movq	(%rbx),%rcx		/* save progname */
+	movq	%rcx,__progname
 	leaq	16(%rdi,%rax,8),%rcx	/* envp = argv + 8*argc + 8 */
 
 	/* store envp for some syscalls to use without converting */
--- a/test/runtest.rc
+++ b/test/runtest.rc
@@ -1,11 +1,22 @@
 #!/bin/rc
 
 rfork e
-MYR_MC=../6/6.out
-MYR_MUSE=../muse/6.out
+MYR_MC=`{cd .. ; pwd}^/6/6.out
+MYR_MUSE=`{cd .. ; pwd}^/muse/6.out
 fn build {
-	rm -f $1 $1^.6 $1^.use
-	../obj/mbld/mbld -Bnone -o 'out' -b $1 -I../obj/lib/std -I../obj/lib/sys -I../obj/lib/regex -r../rt/_myrrt.6 $1^.myr
+	dir=`{basename -d $1}	# directory part of the test name; compared against '.' below
+	if(~ $dir '.') {
+		rm -f $1 $1^.6 $1^.use
+		../obj/mbld/mbld -Bnone -o 'out' -b $1 -I../obj/lib/std -I../obj/lib/sys -I../obj/lib/regex -r../rt/_myrrt.6 $1^.myr
+	}
+	if not {
+		target=`{basename $1}	# last path component, used as the mbld target
+		top=`{pwd}
+		mkdir -p out/$dir
+		cd $dir	# callers record pwd before build and cd back afterwards
+		$top/../obj/mbld/mbld -Bnone -o $top/out/$dir -I$top/../obj/lib/std -I$top/../obj/lib/sys -I$top/../obj/lib/regex -r$top/../rt/_myrrt.6 clean
+		$top/../obj/mbld/mbld -Bnone -o $top/out/$dir -I$top/../obj/lib/std -I$top/../obj/lib/sys -I$top/../obj/lib/regex -r$top/../rt/_myrrt.6 :$target
+	}
 }
 
 fn pass {
@@ -62,7 +73,9 @@
 	res=$1; shift
 
 	echo 'test' $test '<<{!'
+	here=`{pwd}
 	build $test
+	cd $here
 	switch($type) {
 	case E
 		expectstatus $test $res
@@ -77,16 +90,19 @@
 
 fn F {
 	echo 'test ' ^ $1 '<<{!'
+	here=`{pwd}
 	@{ build $1 } >[2=1]
 	if (~ $status '')
 		fail $1
 	if not
 		pass $1
+	cd $here
 }
 
+fn posixonly {
+	status=''	# the wrapped test line is not run here; only an empty status is set
+}
+
 echo 'MTEST ' `{grep '^[BF]' tests | wc -l}
 
 . tests
-
-
-
--- a/test/runtest.sh
+++ b/test/runtest.sh
@@ -1,15 +1,25 @@
 #!/bin/sh
-export PATH=.:$PATH
-export MYR_MC=../6/6m
-export MYR_MUSE=../muse/muse
+export PATH=$(pwd):$PATH
+export MYR_MC=$(cd ..; pwd)/6/6m
+export MYR_MUSE=$(cd ..; pwd)/muse/muse
 ARGS=$*
 NFAILURES=0
 NPASSES=0
 
 build() {
-	rm -f out/$1 out/$1.o out/$1.s out/$1.use
-	mkdir -p out
-	../obj/mbld/mbld -Bnone -o 'out' -b $1 -I../obj/lib/std -I../obj/lib/sys -I../obj/lib/regex -r../rt/_myrrt.o $1.myr
+	dir=$(echo $1 | egrep -o '.*/')	# everything up to and including the last '/', empty if none
+	if [ -z "$dir" ]; then	# quoted so the empty case is a real two-argument test
+		rm -f out/$1 out/$1.o out/$1.s out/$1.use
+		mkdir -p out
+		../obj/mbld/mbld -Bnone -o 'out' -b $1 -I../obj/lib/std -I../obj/lib/sys -I../obj/lib/regex -r../rt/_myrrt.o $1.myr
+	else
+		target=$(echo $1 | egrep -o '[^/]*$')	# last path component, used as the mbld target
+		top=$(pwd)
+		mkdir -p out/$dir
+		cd "$dir"	# callers record pwd before build and cd back afterwards
+		$top/../obj/mbld/mbld -Bnone -o $top/out/$dir -I$top/../obj/lib/std -I$top/../obj/lib/sys -I$top/../obj/lib/regex -r$top/../rt/_myrrt.o clean
+		$top/../obj/mbld/mbld -Bnone -o $top/out/$dir -I$top/../obj/lib/std -I$top/../obj/lib/sys -I$top/../obj/lib/regex -r$top/../rt/_myrrt.o :$target
+	fi
 }
 
 pass() {
@@ -95,7 +105,9 @@
 		args="$1"; shift
 	fi
 	echo "test $test <<{!"
+	here=$(pwd)
 	build $test
+	cd $here
 	case $type in
 	"E")  expectstatus "$test" "$res";;
 	"P")  expectprint "$test" "$res";;
@@ -111,12 +123,18 @@
 	fi
 
 	echo "test $1 <<{!"
-	(build $1) > /dev/null 2>1
+	here=$(pwd)
+	(build $1) > /dev/null 2>&1
 	if [ $? -eq '1' ]; then
 		pass $1
 	else
 		fail $1
 	fi
+	cd $here
+}
+
+posixonly() {
+	"$@"	# run the wrapped test line as given, without re-splitting or globbing its words
 }
 
 echo "MTEST $(egrep '^[BF]' tests | wc -l)"
--- a/test/tests
+++ b/test/tests
@@ -4,9 +4,12 @@
 #	B: Expect that this test will build.
 #	F: Expect that this test will not build.
 #    testname: Test case
-#	The test that will run. We will try to
-#	compile 'testname.myr' to 'testname',
-#	and then execute it, verifying the result
+#	The test that will run. If testname contains
+#	no '/', we will try to compile 'testname.myr'
+#	to 'testname', and then execute it, verifying the
+#	result. If the testname is of form a/b/c/.../y/z,
+#	we will try to mbld target z located in subdir
+#	a/b/c/.../y and execute, again verifying.
 #    [E|P|C]: Result type
 #	E tells us that the result is an exit status
 #	P tells us that the result is on stdout,
@@ -176,5 +179,12 @@
 B nestedgoto	E	0
 B initializer	E	0
 B fmtalign	E	0
+B fmtnest	P	_.f2_2,_.f3__3,_4.4__
 B implexpr	P	12,z,hello
 B implexpr-concrete	P	zigzag
+posixonly B abi/001-return-tuple	E	0
+posixonly B abi/002-arg-alignment	E	0
+posixonly B abi/003-ret-alignment	E	0
+posixonly B abi/004-torture-1	E	0
+posixonly B abi/005-torture-2	E	0
+posixonly B abi/006-torture-3	E	0