ref: 1e9bb75854a336de2a63f9a4fb4d67dd75d682f1
parent: fade7acea6378225a2c6e3c66b6f00f99063411a
author: cinap_lenrek <cinap_lenrek@felloff.net>
date: Sun Jun 18 12:14:10 EDT 2023
7c: more registers, improve constant propagation, implement load pipelining. use all 15 (previously 7) registers (R9-R23) for variable registerization. don't replace the $0 integer constant with a register in constprop(), as we have a zero-register (R31) on arm64, which the linker is able to encode just fine. handle floating-point constants in constprop(). when we have a load immediately followed by the load-dependent instruction, see if we can find another independent instruction to put after the load. this results in nice pipelined code and also clusters loads and stores (which we could later optimize with MOVP instructions). this optimization gave me around 20% speedup for sha2 on cortex a53.
--- a/sys/src/cmd/7c/7.out.h
+++ b/sys/src/cmd/7c/7.out.h
@@ -12,14 +12,10 @@
#define REGRET 0
#define REGARG 0
/* R1 to R7 are potential parameter/return registers */
-#define REGIRL 8 /* indirect result location (TO DO) */
-/* compiler allocates R9 up as temps */
-/* compiler allocates register variables R10 up */
+/* compiler allocates register variables R9 up */
#define REGMIN 9
-#define REGMAX 15
-#define REGIP0 16
-#define REGIP1 17
-#define REGTMP REGIP1
+#define REGMAX 23
+#define REGTMP 17
/* compiler allocates external registers R27 down */
#define REGEXT 27
#define REGSB 28
--- a/sys/src/cmd/7c/peep.c
+++ b/sys/src/cmd/7c/peep.c
@@ -8,6 +8,24 @@
static void
storeprop(int as, Adr *a, Adr *v, Reg *r);
+/*
+ * exchange the instructions described by p1 and p2.
+ * only as, scond, from, to and reg change places;
+ * no other field of either Prog is assigned.
+ */
+static void
+swapprog(Prog *p1, Prog *p2)
+{
+	Prog saved;
+
+	/* keep the old contents of p1 while it is being overwritten */
+	saved = *p1;
+
+	p1->as = p2->as;
+	p2->as = saved.as;
+
+	p1->scond = p2->scond;
+	p2->scond = saved.scond;
+
+	p1->from = p2->from;
+	p2->from = saved.from;
+
+	p1->to = p2->to;
+	p2->to = saved.to;
+
+	p1->reg = p2->reg;
+	p2->reg = saved.reg;
+}
+
static int
isu32op(Prog *p)
{
@@ -102,6 +120,74 @@
return 0;
}
+/*
+ * number of bytes moved to or from memory by the move opcode as,
+ * or 0 when as is not one of the plain moves listed here.
+ */
+static int
+movwidth(int as)
+{
+	switch(as){
+	case AMOV:
+	case AFMOVD:
+		return 8;
+	case AMOVW:
+	case AMOVWU:
+	case AFMOVS:
+		return 4;
+	case AMOVH:
+	case AMOVHU:
+		return 2;
+	case AMOVB:
+	case AMOVBU:
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * can the w1 bytes at a1 and the w2 bytes at a2 be shown not to overlap?
+ * returns 1 only when that is certain, 0 when they overlap or we can't tell.
+ */
+static int
+memdisjoint(Adr *a1, int w1, Adr *a2, int w2)
+{
+	/* unknown access size: nothing can be proven */
+	if(w1 == 0 || w2 == 0)
+		return 0;
+
+	/* only SP-relative or named operands are compared; other pointers may alias */
+	if(a1->reg != REGSP && a1->name <= D_NONE)
+		return 0;
+	if(a2->reg != REGSP && a2->name <= D_NONE)
+		return 0;
+
+	if(a1->name != a2->name || a1->reg != a2->reg)
+		return 1;
+
+	/*
+	 * same base: the byte ranges [offset, offset+width) must not intersect.
+	 * both widths count, so a narrow access inside a wider one is caught.
+	 */
+	return a1->offset + w1 <= a2->offset
+		|| a2->offset + w2 <= a1->offset;
+}
+
+/*
+ * does executing p2 before p1 leave what p1 does, and what p2 sees
+ * of p1's result, unchanged?  the load pipelining loop in peep() asks
+ * this in both directions before it exchanges two instructions.
+ *
+ * returns 1 when p2 neither uses the register p1 sets nor touches
+ * the memory p1 stores to; returns 0 otherwise, and also whenever
+ * that cannot be established.
+ */
+static int
+independent(Prog *p1, Prog *p2)
+{
+	int w1;
+
+	/* compares and control transfers are never reordered */
+	switch(p1->as){
+	case ACMP:
+	case ACMPW:
+	case AFCMPS:
+	case AFCMPD:
+
+	case AB:
+	case ABL:
+	case ARET:
+	case ARETURN:
+		return 0;
+	}
+
+	/* p1 sets a register: p2 must not use or set it */
+	if(regtyp(&p1->to))
+		return !copyu(p2, &p1->to, A);
+
+	/* any other destination than plain memory is not understood here */
+	if(p1->to.type != D_OREG)
+		return 0;
+	w1 = movwidth(p1->as);
+
+	/* p2 must not read the bytes p1 stores */
+	if(p2->from.type == D_OREG){
+		if(!memdisjoint(&p1->to, w1, &p2->from, movwidth(p2->as)))
+			return 0;
+	}else if(p2->from.type == D_XPRE || p2->from.type == D_XPOST)
+		return 0;
+
+	/* p2 writing only a register cannot disturb the store */
+	if(regtyp(&p2->to))
+		return 1;
+
+	/* two stores: the order matters unless they hit different bytes */
+	if(p2->to.type == D_OREG)
+		return memdisjoint(&p1->to, w1, &p2->to, movwidth(p2->as));
+
+	/* assume not independent */
+	return 0;
+}
+
void
peep(void)
{
@@ -146,10 +232,11 @@
for(r=firstr; r!=R; r=r->link) {
p = r->prog;
- /* registerize local loads following stores */
- if(p->as == AMOV || p->as == AMOVW || p->as == AMOVWU || p->as == AFMOVS || p->as == AFMOVD)
- if(p->from.type == D_REG && p->to.type == D_OREG && (p->to.name == D_AUTO || p->to.name == D_PARAM))
+ /* registerize variable loads following stores */
+ if(p->as == AMOV || p->as == AMOVW || p->as == AMOVWU || p->as == AFMOVS || p->as == AFMOVD){
+ if(p->from.type == D_REG && p->to.type == D_OREG && p->to.name > D_NONE)
storeprop(p->as, &p->from, &p->to, r->s1);
+ }
if(p->as == ALSL || p->as == ALSR || p->as == AASR
|| p->as == ALSLW || p->as == ALSRW || p->as == AASRW) {
@@ -189,7 +276,7 @@
if(p->as == AMOV || p->as == AMOVW || p->as == AFMOVS || p->as == AFMOVD)
if(regtyp(&p->to)) {
- if(p->from.type == D_CONST)
+ if(p->from.type == D_CONST || p->from.type == D_FCONST)
constprop(&p->from, &p->to, r->s1);
else if(regtyp(&p->from))
if(p->from.type == p->to.type) {
@@ -219,6 +306,7 @@
}
if(t)
goto loop1;
+
/*
* look for MOVB x,R; MOVB R,R
*/
@@ -360,6 +448,86 @@
}
#endif
+ /*
+ * software pipeline loads:
+ *
+ * insert an independent instruction (YYY) after a load:
+ * MOV v, r1
+ * XXX r1, x
+ * YYY ... (not reading x or touching r1)
+ * ---
+ * MOV v, r1
+ * YYY ... (not reading x or touching r1)
+ * XXX r1, x
+ */
+ for(r=firstr; r!=R; r=r->link) {
+ p = r->prog;
+ switch(p->as){
+ default:
+ continue;
+ case AFMOVD:
+ case AFMOVS:
+ if(p->from.type != D_OREG || p->to.type != D_FREG)
+ continue;
+ break;
+ case AMOV:
+ case AMOVW:
+ case AMOVWU:
+ case AMOVH:
+ case AMOVHU:
+ case AMOVB:
+ case AMOVBU:
+ if(p->from.type != D_OREG || p->to.type != D_REG)
+ continue;
+ break;
+ }
+ for(r1 = uniqs(r); r1 != R && r1->prog->as == ANOP; r1 = uniqs(r1)){
+ if(uniqp(r1) == R){
+ r1 = R;
+ break;
+ }
+ }
+ if(r1 == R || uniqp(r1) == R)
+ continue;
+ p1 = r1->prog;
+ if(!copyu(p1, &p->to, A))
+ continue;
+
+ for(r2 = uniqs(r1); r2 != R && r2->prog->as == ANOP; r2 = uniqs(r2)){
+ if(uniqp(r2) == R){
+ r2 = R;
+ break;
+ }
+ }
+ if(r2 == R || uniqp(r2) == R)
+ continue;
+ if(copyu(r2->prog, &p->to, A))
+ continue;
+
+ if(!independent(p1, r2->prog))
+ continue;
+ if(!independent(r2->prog, p1))
+ continue;
+
+ /*
+ * if YYY happens to be a move from v, use register:
+ * MOV v, r1
+ * MOV v, r2
+ * ---
+ * MOV v, r1
+ * MOV r1, r2
+ */
+ if(p->as == r2->prog->as
+ && (p->from.reg == REGSP || p->from.name > D_NONE)
+ && copyas(&p->from, &r2->prog->from))
+ r2->prog->from = p->to;
+
+ swapprog(p1, r2->prog);
+ t++;
+ }
+ if(t)
+ goto loop1;
+
#ifdef XXX
predicate();
#endif
@@ -724,8 +892,13 @@
{
Prog *p;
+ /* should be encodable with ZR */
+ if(c1->type == D_CONST && c1->sym == S && c1->offset == 0)
+ return;
+
if(debug['C'])
print("constprop %D->%D\n", c1, v1);
+
for(; r != R; r = r->s1) {
p = r->prog;
if(debug['C'])
@@ -735,7 +908,8 @@
print("; merge; return\n");
return;
}
- if(p->as == AMOVW && copyas(&p->from, c1)) {
+ if((p->as == AMOVW || p->as == AMOVWU || p->as == AMOV || p->as == AFMOVD || p->as == AFMOVS)
+ && copyas(&p->from, c1)) {
if(debug['C'])
print("; sub%D/%D", &p->from, v1);
p->from = *v1;
@@ -785,7 +959,7 @@
return;
if(p->to.type == D_OREG || p->to.type == D_XPRE || p->to.type == D_XPOST)
- if(p->to.name == D_NONE || copyas(&p->to, v))
+ if(p->to.name <= D_NONE || copyas(&p->to, v))
return;
if(r->s2)
@@ -1446,7 +1620,6 @@
int
copyas(Adr *a, Adr *v)
{
-
if(regtyp(v)) {
if(a->type == v->type)
if(a->reg == v->reg)
@@ -1455,9 +1628,13 @@
if(a->type == v->type)
if(a->name == v->name)
if(a->sym == v->sym)
- if(a->reg == v->reg)
- if(a->offset == v->offset)
- return 1;
+ if(a->reg == v->reg){
+ if(a->type == D_FCONST){
+ if(a->dval == v->dval)
+ return 1;
+ } else if(a->offset == v->offset)
+ return 1;
+ }
}
return 0;
}
@@ -1468,7 +1645,6 @@
int
copyau(Adr *a, Adr *v)
{
-
if(copyas(a, v))
return 1;
if(v->type == D_REG) {
--- a/sys/src/cmd/7c/reg.c
+++ b/sys/src/cmd/7c/reg.c
@@ -1133,7 +1133,7 @@
* 0 R9
* 1 R10
* ... ...
- * 6 R15
+ * 14 R23
*/
long
RtoB(int r)
@@ -1146,7 +1146,8 @@
int
BtoR(long b)
{
- b &= 0x07fL;
+ b &= (1 << (1+REGMAX-REGMIN))-1;
+ b &= ~(1 << (REGTMP-REGMIN));
if(b == 0)
return 0;
return bitno(b) + REGMIN;
@@ -1162,7 +1163,7 @@
long
FtoB(int f)
{
- if(f < FREGMIN || f > FREGEXT)
+ if(f < FREGMIN || f >= FREGEXT)
return 0;
return 1L << (f - FREGMIN + 22);
}
@@ -1171,7 +1172,7 @@
BtoF(long b)
{
- b &= 0x3fc00000L;
+ b &= ((1 << (FREGEXT - FREGMIN))-1) << 22;
if(b == 0)
return 0;
return bitno(b) - 22 + FREGMIN;
--- a/sys/src/cmd/7c/txt.c
+++ b/sys/src/cmd/7c/txt.c
@@ -291,7 +291,7 @@
regalloc(Node *n, Node *tn, Node *o)
{
int i, j;
- static int lasti;
+ static int lasti, lastf;
switch(tn->type->etype) {
case TCHAR:
@@ -316,6 +316,7 @@
j = REGRET+1;
if(reg[j] == 0 && resvreg[j] == 0) {
i = j;
+ lasti = (i - REGRET) % 5;
goto out;
}
j++;
@@ -330,12 +331,13 @@
if(i >= NREG && i < NREG+NFREG)
goto out;
}
- j = lasti + NREG;
+ j = lastf + NREG;
for(i=NREG; i<NREG+NFREG; i++) {
if(j >= NREG+NFREG)
j = NREG;
if(reg[j] == 0) {
i = j;
+ lastf = (i - NREG-1) % 5;
goto out;
}
j++;
@@ -350,8 +352,6 @@
out:
reg[i]++;
lasti++;
- if(lasti >= 5)
- lasti = 0;
nodreg(n, tn, i);
}