shithub: riscv

Download patch

ref: 1e9bb75854a336de2a63f9a4fb4d67dd75d682f1
parent: fade7acea6378225a2c6e3c66b6f00f99063411a
author: cinap_lenrek <cinap_lenrek@felloff.net>
date: Sun Jun 18 12:14:10 EDT 2023

7c: more registers, improve constant propagation, implement load pipelining

use all 15 registers (R9-R23; previously only 7)
for variable registerization.

don't replace $0 integer constant with register
in constprop(), as we have a zero-register (R31) on arm64,
which the linker is able to encode just fine.

handle floating-point constants in constprop().

when we have a load immediately followed by
an instruction that depends on it, see if we
can find another independent instruction to
put after the load. this results in nice
pipelined code and also clusters loads and
stores (which we could later optimize with
MOVP instructions). this optimization gave
me around 20% speedup for sha2 on cortex a53.

--- a/sys/src/cmd/7c/7.out.h
+++ b/sys/src/cmd/7c/7.out.h
@@ -12,14 +12,10 @@
 #define	REGRET		0
 #define	REGARG		0
 /* R1 to R7 are potential parameter/return registers */
-#define	REGIRL		8	/* indirect result location (TO DO) */
-/* compiler allocates R9 up as temps */
-/* compiler allocates register variables R10 up */
+/* compiler allocates register variables R9 up */
 #define	REGMIN		9
-#define	REGMAX		15
-#define	REGIP0		16
-#define	REGIP1		17
-#define	REGTMP		REGIP1
+#define	REGMAX		23
+#define	REGTMP		17
 /* compiler allocates external registers R27 down */
 #define	REGEXT		27
 #define	REGSB		28
--- a/sys/src/cmd/7c/peep.c
+++ b/sys/src/cmd/7c/peep.c
@@ -8,6 +8,24 @@
 static void
 storeprop(int as, Adr *a, Adr *v, Reg *r);
 
+static void
+swapprog(Prog *p1, Prog *p2)
+{
+	Prog tmp = *p1;
+
+	p1->as = p2->as;
+	p1->scond = p2->scond;
+	p1->from = p2->from;
+	p1->to = p2->to;
+	p1->reg = p2->reg;
+
+	p2->as = tmp.as;
+	p2->scond = tmp.scond;
+	p2->from = tmp.from;
+	p2->to = tmp.to;
+	p2->reg = tmp.reg;
+}
+
 static int
 isu32op(Prog *p)
 {
@@ -102,6 +120,74 @@
 	return 0;
 }
 
+static int
+independent(Prog *p1, Prog *p2)
+{
+	switch(p1->as){
+	case ACMP:
+	case ACMPW:
+	case AFCMPS:
+	case AFCMPD:
+
+	case AB:
+	case ABL:
+	case ARET:
+	case ARETURN:
+		return 0;
+	}
+
+	if(regtyp(&p1->to)){
+		if(!copyu(p2, &p1->to, A))
+			return 1;
+		return 0;
+	}
+
+	if(p2->from.type == D_CONST || p2->from.type == D_FCONST)
+		return 1;
+
+	if(p1->to.type == D_OREG){
+		int w;
+
+		if(p2->from.type != D_OREG)
+			return 1;
+
+		switch(p1->as){
+		default:
+			return 0;
+		case AMOV:
+		case AFMOVD:
+			w = 8;
+			break;
+		case AMOVW:
+		case AMOVWU:
+		case AFMOVS:
+			w = 4;
+			break;
+		case AMOVH:
+		case AMOVHU:
+			w = 2;
+			break;
+		case AMOVB:
+		case AMOVBU:
+			w = 1;
+			break;
+		}
+
+		if(p1->to.reg != REGSP && p1->to.name <= D_NONE)
+			return 0;
+		if(p2->from.reg != REGSP && p2->from.name <= D_NONE)
+			return 0;
+
+		if(p1->to.name != p2->from.name
+		|| p1->to.reg != p2->from.reg
+		|| abs(p1->to.offset - p2->from.offset) >= w)
+			return 1;
+	}
+
+	/* assume not independent */
+	return 0;
+}
+
 void
 peep(void)
 {
@@ -146,10 +232,11 @@
 	for(r=firstr; r!=R; r=r->link) {
 		p = r->prog;
 
-		/* registerize local loads following stores */
-		if(p->as == AMOV || p->as == AMOVW || p->as == AMOVWU || p->as == AFMOVS || p->as == AFMOVD)
-			if(p->from.type == D_REG && p->to.type == D_OREG && (p->to.name == D_AUTO || p->to.name == D_PARAM))
+		/* registerize variable loads following stores */
+		if(p->as == AMOV || p->as == AMOVW || p->as == AMOVWU || p->as == AFMOVS || p->as == AFMOVD){
+			if(p->from.type == D_REG && p->to.type == D_OREG && p->to.name > D_NONE)
 				storeprop(p->as, &p->from, &p->to, r->s1);
+		}
 
 		if(p->as == ALSL || p->as == ALSR || p->as == AASR
 		|| p->as == ALSLW || p->as == ALSRW || p->as == AASRW) {
@@ -189,7 +276,7 @@
 
 		if(p->as == AMOV || p->as == AMOVW || p->as == AFMOVS || p->as == AFMOVD)
 		if(regtyp(&p->to)) {
-			if(p->from.type == D_CONST)
+			if(p->from.type == D_CONST || p->from.type == D_FCONST)
 				constprop(&p->from, &p->to, r->s1);
 			else if(regtyp(&p->from))
 			if(p->from.type == p->to.type) {
@@ -219,6 +306,7 @@
 	}
 	if(t)
 		goto loop1;
+
 	/*
 	 * look for MOVB x,R; MOVB R,R
 	 */
@@ -360,6 +448,86 @@
 	}
 #endif
 
+	/*
+	 * software pipeline loads:
+	 *
+	 * insert an independent instruction (YYY) after a load:
+	 * MOV v, r1
+	 * XXX r1, x
+	 * YYY ... (not reading x or touching r1)
+	 * ---
+	 * MOV v, r1
+	 * YYY ... (not reading x or touching r1)
+	 * XXX r1, x
+	 */
+	for(r=firstr; r!=R; r=r->link) {
+		p = r->prog;
+		switch(p->as){
+		default:
+			continue;
+		case AFMOVD:
+		case AFMOVS:
+			if(p->from.type != D_OREG || p->to.type != D_FREG)
+				continue;
+			break;
+		case AMOV:
+		case AMOVW:
+		case AMOVWU:
+		case AMOVH:
+		case AMOVHU:
+		case AMOVB:
+		case AMOVBU:
+			if(p->from.type != D_OREG || p->to.type != D_REG)
+				continue;
+			break;
+		}
+		for(r1 = uniqs(r); r1 != R && r1->prog->as == ANOP; r1 = uniqs(r1)){
+			if(uniqp(r1) == R){
+				r1 = R;
+				break;
+			}
+		}
+		if(r1 == R || uniqp(r1) == R)
+			continue;
+		p1 = r1->prog;
+		if(!copyu(p1, &p->to, A))
+			continue;
+
+		for(r2 = uniqs(r1); r2 != R && r2->prog->as == ANOP; r2 = uniqs(r2)){
+			if(uniqp(r2) == R){
+				r2 = R;
+				break;
+			}
+		}
+		if(r2 == R || uniqp(r2) == R)
+			continue;
+		if(copyu(r2->prog, &p->to, A))
+			continue;
+
+		if(!independent(p1, r2->prog))
+			continue;
+		if(!independent(r2->prog, p1))
+			continue;
+
+		/*
+		 * if YYY happens to be a move from v, use register:
+		 * MOV v, r1
+		 * MOV v, r2
+		 * ---
+		 * MOV v, r1
+		 * MOV r1, r2
+		 */
+		if(p->as == r2->prog->as
+		&& (p->from.reg == REGSP || p->from.name > D_NONE)
+		&& copyas(&p->from, &r2->prog->from))
+			r2->prog->from = p->to;
+
+		swapprog(p1, r2->prog);
+		t++;
+	}
+	if(t)
+		goto loop1;
+
 #ifdef XXX
 	predicate();
 #endif
@@ -724,8 +892,13 @@
 {
 	Prog *p;
 
+	/* should be encodable with ZR */
+	if(c1->type == D_CONST && c1->sym == S && c1->offset == 0)
+		return;
+
 	if(debug['C'])
 		print("constprop %D->%D\n", c1, v1);
+
 	for(; r != R; r = r->s1) {
 		p = r->prog;
 		if(debug['C'])
@@ -735,7 +908,8 @@
 				print("; merge; return\n");
 			return;
 		}
-		if(p->as == AMOVW && copyas(&p->from, c1)) {
+		if((p->as == AMOVW || p->as == AMOVWU || p->as == AMOV || p->as == AFMOVD || p->as == AFMOVS)
+		&& copyas(&p->from, c1)) {
 			if(debug['C'])
 				print("; sub%D/%D", &p->from, v1);
 			p->from = *v1;
@@ -785,7 +959,7 @@
 			return;
 
 		if(p->to.type == D_OREG || p->to.type == D_XPRE || p->to.type == D_XPOST)
-			if(p->to.name == D_NONE || copyas(&p->to, v))
+			if(p->to.name <= D_NONE || copyas(&p->to, v))
 				return;
 
 		if(r->s2)
@@ -1446,7 +1620,6 @@
 int
 copyas(Adr *a, Adr *v)
 {
-
 	if(regtyp(v)) {
 		if(a->type == v->type)
 		if(a->reg == v->reg)
@@ -1455,9 +1628,13 @@
 		if(a->type == v->type)
 		if(a->name == v->name)
 		if(a->sym == v->sym)
-		if(a->reg == v->reg)
-		if(a->offset == v->offset)
-			return 1;
+		if(a->reg == v->reg){
+			if(a->type == D_FCONST){
+				if(a->dval == v->dval)
+					return 1;
+			} else if(a->offset == v->offset)
+				return 1;
+		}
 	}
 	return 0;
 }
@@ -1468,7 +1645,6 @@
 int
 copyau(Adr *a, Adr *v)
 {
-
 	if(copyas(a, v))
 		return 1;
 	if(v->type == D_REG) {
--- a/sys/src/cmd/7c/reg.c
+++ b/sys/src/cmd/7c/reg.c
@@ -1133,7 +1133,7 @@
  *	0	R9
  *	1	R10
  *	...	...
- *	6	R15
+ *	14	R23
  */
 long
 RtoB(int r)
@@ -1146,7 +1146,8 @@
 int
 BtoR(long b)
 {
-	b &= 0x07fL;
+	b &= (1 << (1+REGMAX-REGMIN))-1;
+	b &= ~(1 << (REGTMP-REGMIN));
 	if(b == 0)
 		return 0;
 	return bitno(b) + REGMIN;
@@ -1162,7 +1163,7 @@
 long
 FtoB(int f)
 {
-	if(f < FREGMIN || f > FREGEXT)
+	if(f < FREGMIN || f >= FREGEXT)
 		return 0;
 	return 1L << (f - FREGMIN + 22);
 }
@@ -1171,7 +1172,7 @@
 BtoF(long b)
 {
 
-	b &= 0x3fc00000L;
+	b &= ((1 << (FREGEXT - FREGMIN))-1) << 22;
 	if(b == 0)
 		return 0;
 	return bitno(b) - 22 + FREGMIN;
--- a/sys/src/cmd/7c/txt.c
+++ b/sys/src/cmd/7c/txt.c
@@ -291,7 +291,7 @@
 regalloc(Node *n, Node *tn, Node *o)
 {
 	int i, j;
-	static int lasti;
+	static int lasti, lastf;
 
 	switch(tn->type->etype) {
 	case TCHAR:
@@ -316,6 +316,7 @@
 				j = REGRET+1;
 			if(reg[j] == 0 && resvreg[j] == 0) {
 				i = j;
+				lasti = (i - REGRET) % 5;
 				goto out;
 			}
 			j++;
@@ -330,12 +331,13 @@
 			if(i >= NREG && i < NREG+NFREG)
 				goto out;
 		}
-		j = lasti + NREG;
+		j = lastf + NREG;
 		for(i=NREG; i<NREG+NFREG; i++) {
 			if(j >= NREG+NFREG)
 				j = NREG;
 			if(reg[j] == 0) {
 				i = j;
+				lastf = (i - NREG-1) % 5;
 				goto out;
 			}
 			j++;
@@ -350,8 +352,6 @@
 out:
 	reg[i]++;
 	lasti++;
-	if(lasti >= 5)
-		lasti = 0;
 	nodreg(n, tn, i);
 }