shithub: elf-loader

Download patch

ref: e5e12f806ecf322946c093fe59d909beb9160d0c
author: kws <kws@cirno>
date: Sun Aug 4 16:50:18 EDT 2024

elf loader

--- /dev/null
+++ b/bind.rc
@@ -1,0 +1,4 @@
+#!/bin/rc
+
+bind -b sys/include /sys/include
+bind -bc sys/src/9/port /sys/src/9/port
--- /dev/null
+++ b/hello/Makefile
@@ -1,0 +1,23 @@
+CFLAGS=\
+	-O2\
+	-Wall\
+	-fno-pie\
+	-mno-mmx\
+	-mno-sse\
+	-mno-sse2\
+	-ffreestanding
+
+LDFLAGS=\
+	-nostdlib\
+	-no-pie\
+	-Tlinker.ld
+
+hello.elf:	l.o hello.o linker.ld
+	$(LD) -o $@ l.o hello.o $(LDFLAGS)
+
+CC=	x86_64-elf-gcc
+AS=	x86_64-elf-as
+LD=	x86_64-elf-ld
+
+clean:
+	rm -f *.o hello.elf
--- /dev/null
+++ b/hello/hello.c
@@ -1,0 +1,44 @@
+#define SYS_EXITS	8	/* Plan 9 syscall numbers -- TODO confirm against /sys/src/libc/9syscall */
+#define SYS_PWRITE	51
+
+typedef long long	vlong;	/* 64-bit integer, Plan 9 naming convention */
+
+extern vlong syscall(vlong, ...);	/* raw syscall trampoline, defined in l.s */
+
+_Noreturn
+void
+exits(char *msg)	/* terminate the process; msg==0 presumably means normal exit -- confirm */
+{
+	syscall(SYS_EXITS, msg);	/* kernel never returns from exits */
+	__builtin_unreachable();	/* tell the compiler control cannot reach here */
+}
+
+long
+pwrite(int fd, void *buf, long nbytes, vlong offset)	/* write nbytes of buf to fd at offset */
+{
+	return syscall(SYS_PWRITE, fd, buf, nbytes, offset);
+}
+
+long
+strlen(char *s)	/* freestanding build (-ffreestanding, -nostdlib): provide our own strlen */
+{
+	char *p;
+	for(p = s; *s; s++);	/* p keeps the start while s walks to the terminating NUL */
+	return s-p;
+}
+
+void
+puts(char *s)	/* write s plus a newline to fd 1; offset -1 presumably means "current offset" -- verify */
+{
+	pwrite(1, s, strlen(s), -1LL);
+	pwrite(1, "\n", 1, -1LL);
+}
+
+_Noreturn
+void
+main(char *argv[])	/* print each argument on its own line, then exit */
+{
+	while(*argv)
+		puts(*argv++);
+	exits(0);	/* nil status: normal termination */
+}
--- /dev/null
+++ b/hello/l.s
@@ -1,0 +1,23 @@
+	.global _start
+_start:				# entry point; no libc startup in this freestanding build
+	lea	0x08(%rsp), %rdi	# pass &argv[0] to main; presumably argc sits at (%rsp) -- confirm stack layout
+	call	main
+1:	jmp	1b			# main is _Noreturn; spin if it ever comes back
+
+	.global syscall
+syscall:			# vlong syscall(vlong num, ...): raw trap into the kernel
+	sub	$0x40, %rsp	# scratch frame: arg spill slots + callee-save slots
+	mov	%rbp, 0x28(%rsp)	# save callee-saved registers we clobber
+	mov	%rbx, 0x30(%rsp)
+
+	mov	%rdi, %rbp	# syscall number in BP -- presumably the Plan 9 amd64 convention, confirm
+	mov	%rsi, 0x08(%rsp)	# spill args 1..4; kernel presumably fetches them from sp+8 upward
+	mov	%rdx, 0x10(%rsp)
+	mov	%rcx, 0x18(%rsp)
+	mov	%r8, 0x20(%rsp)
+	syscall			# return value comes back in %rax
+
+	mov	0x28(%rsp), %rbp	# restore callee-saved state and drop the frame
+	mov	0x30(%rsp), %rbx
+	add	$0x40, %rsp
+	ret
--- /dev/null
+++ b/hello/linker.ld
@@ -1,0 +1,33 @@
+OUTPUT_FORMAT(elf64-x86-64)
+
+PHDRS {
+	text	PT_LOAD FILEHDR PHDRS;	/* first load segment also maps the ELF and program headers */
+	data	PT_LOAD;
+}
+
+UTZERO = 0x200000;	/* user text origin; must match the kernel's UTZERO for amd64 (2MB) */
+MAXPAGESIZE = 2M;	/* 2MB alignment so text and data land in separate large pages */
+
+SECTIONS {
+	. = UTZERO + SIZEOF_HEADERS;	/* leave room for the headers mapped at UTZERO */
+
+	.text : {
+		*(.text*)
+	} :text
+
+	. = ALIGN(MAXPAGESIZE);	/* start the writable segment on its own 2MB boundary */
+
+	.data : {
+		*(.rodata*)	/* rodata folded into data: the loader only accepts two phdrs */
+		*(.data*)
+	} :data
+
+	.bss : {
+		*(.bss*)	/* memsz > filesz here; kernel zero-fills the difference */
+	} :data
+
+	/DISCARD/ : {
+		*(.comment*)	/* keep the image minimal: no metadata or unwind info */
+		*(.eh_frame*)
+	}
+}
--- /dev/null
+++ b/sys/include/elf.h
@@ -1,0 +1,47 @@
+typedef struct Elfhdr	Elfhdr;
+typedef struct Elfphdr	Elfphdr;
+
+#define ELF_MAGIC	0x7f454c46	/* "\x7fELF" read as a big-endian 32-bit word (see beswal in sysproc.c) */
+
+enum {
+	EI_NIDENT	= 16,	/* size of Elfhdr.ident */
+
+	EM_X86_64	= 62,	/* e_machine values the loader accepts */
+	EM_AARCH64	= 183,
+};
+
+struct Elfhdr {	/* ELF file header; usize fields give the ELF64 layout only -- an ELF32 file will not match */
+	uchar	ident[16];	/* magic, class, data encoding, ... */
+	u16int	type;
+	u16int	machine;	/* checked against EM_* above */
+	u32int	version;
+	usize	entry;	/* entry point virtual address */
+	usize	phoff;	/* program header table file offset */
+	usize	shoff;
+	u32int	flags;
+	u16int	ehsize;
+	u16int	phentsize;
+	u16int	phnum;	/* sysexec requires exactly two program headers */
+	u16int	shentsize;
+	u16int	shnum;
+	u16int	shstrndx;
+};
+
+struct Elfphdr {	/* ELF64 program header */
+	u32int	type;	/* PT_* */
+	u32int	flags;	/* PF_* permission bits */
+	usize	off;	/* segment file offset */
+	usize	vaddr;	/* virtual load address */
+	usize	paddr;
+	usize	filesz;	/* bytes present in the file */
+	usize	memsz;	/* bytes in memory; memsz-filesz is zero-filled (bss) */
+	usize	align;
+};
+
+enum {
+	PT_LOAD	= 1,	/* loadable segment */
+
+	PF_X	= 1<<0,	/* executable */
+	PF_W	= 1<<1,	/* writable */
+	PF_R	= 1<<2,	/* readable */
+};
--- /dev/null
+++ b/sys/src/9/port/sysproc.c
@@ -1,0 +1,1301 @@
+#include	"u.h"
+#include	"tos.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"../port/error.h"
+#include	"edf.h"
+
+#include	<a.out.h>
+#include	<elf.h>
+
+uintptr
+sysr1(va_list)	/* reserved syscall slot: host owner only, does nothing */
+{
+	if(!iseve())
+		error(Eperm);
+	return 0;
+}
+
+static void
+abortion(void)	/* kproc body used to reap a half-built child when sysrfork errors out */
+{
+	pexit("fork aborted", 1);
+}
+
+uintptr
+sysrfork(va_list list)
+{
+	Proc *p;
+	int n, i;
+	Fgrp *ofg;
+	Pgrp *opg;
+	Rgrp *org;
+	Egrp *oeg;
+	ulong pid, flag;
+	char *devs;
+
+	flag = va_arg(list, ulong);
+	/* Check flags before we commit */
+	if((flag & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
+		error(Ebadarg);
+	if((flag & (RFNAMEG|RFCNAMEG)) == (RFNAMEG|RFCNAMEG))
+		error(Ebadarg);
+	if((flag & (RFENVG|RFCENVG)) == (RFENVG|RFCENVG))
+		error(Ebadarg);
+
+	/*
+	 * Code using RFNOMNT expects to block all but
+	 * the following devices.
+	 */
+	devs = "|decp";
+	if((flag&RFPROC) == 0) {
+		if(flag & (RFMEM|RFNOWAIT))
+			error(Ebadarg);
+		if(flag & (RFFDG|RFCFDG)) {
+			ofg = up->fgrp;
+			if(flag & RFFDG)
+				up->fgrp = dupfgrp(ofg);
+			else
+				up->fgrp = dupfgrp(nil);
+			closefgrp(ofg);
+		}
+		if(flag & (RFNAMEG|RFCNAMEG)) {
+			opg = up->pgrp;
+			up->pgrp = newpgrp();
+			if(flag & RFNAMEG)
+				pgrpcpy(up->pgrp, opg);
+			/* inherit notallowed */
+			memmove(up->pgrp->notallowed, opg->notallowed, sizeof up->pgrp->notallowed);
+			closepgrp(opg);
+		}
+		if(flag & RFNOMNT)
+			devmask(up->pgrp, 1, devs);
+		if(flag & RFREND) {
+			org = up->rgrp;
+			up->rgrp = newrgrp();
+			closergrp(org);
+		}
+		if(flag & (RFENVG|RFCENVG)) {
+			oeg = up->egrp;
+			up->egrp = smalloc(sizeof(Egrp));
+			up->egrp->ref = 1;
+			if(flag & RFENVG)
+				envcpy(up->egrp, oeg);
+			closeegrp(oeg);
+		}
+		if(flag & RFNOTEG){
+			qlock(&up->debug);
+			setnoteid(up, 0);	/* can't error() with 0 argument */
+			qunlock(&up->debug);
+		}
+		return 0;
+	}
+
+	if((p = newproc()) == nil)
+		error("no procs");
+
+	qlock(&up->debug);
+	qlock(&p->debug);
+
+	p->scallnr = up->scallnr;
+	p->s = up->s;
+	p->slash = up->slash;
+	p->dot = up->dot;
+	incref(p->dot);
+
+	p->nnote = 0;
+ 	p->notify = up->notify;
+	p->notified = 0;
+	p->notepending = 0;
+	p->lastnote = nil;
+
+	if((flag & RFNOTEG) == 0)
+		p->noteid = up->noteid;
+
+	p->procmode = up->procmode;
+	p->privatemem = up->privatemem;
+	p->noswap = up->noswap;
+	p->hang = up->hang;
+	if(up->procctl == Proc_tracesyscall)
+		p->procctl = Proc_tracesyscall;
+	p->kp = 0;
+
+	/*
+	 * Craft a return frame which will cause the child to pop out of
+	 * the scheduler in user mode with the return register zero
+	 */
+	forkchild(p, up->dbgreg);
+
+	kstrdup(&p->text, up->text);
+	kstrdup(&p->user, up->user);
+	kstrdup(&p->args, "");
+	p->nargs = 0;
+	p->setargs = 0;
+
+	p->insyscall = 0;
+	memset(p->time, 0, sizeof(p->time));
+	p->time[TReal] = MACHP(0)->ticks;
+	p->kentry = up->kentry;
+	p->pcycles = -p->kentry;
+
+	pid = pidalloc(p);
+
+	qunlock(&p->debug);
+	qunlock(&up->debug);
+
+	/* Abort the child process on error */
+	if(waserror()){
+		p->kp = 1;
+		kprocchild(p, abortion);
+		ready(p);
+		nexterror();
+	}
+
+	/* Make a new set of memory segments */
+	n = flag & RFMEM;
+	qlock(&p->seglock);
+	if(waserror()){
+		qunlock(&p->seglock);
+		nexterror();
+	}
+	for(i = 0; i < NSEG; i++)
+		if(up->seg[i] != nil)
+			p->seg[i] = dupseg(up->seg, i, n);
+	qunlock(&p->seglock);
+	poperror();
+
+	/* File descriptors */
+	if(flag & (RFFDG|RFCFDG)) {
+		if(flag & RFFDG)
+			p->fgrp = dupfgrp(up->fgrp);
+		else
+			p->fgrp = dupfgrp(nil);
+	}
+	else {
+		p->fgrp = up->fgrp;
+		incref(p->fgrp);
+	}
+
+	/* Process groups */
+	if(flag & (RFNAMEG|RFCNAMEG)) {
+		p->pgrp = newpgrp();
+		if(flag & RFNAMEG)
+			pgrpcpy(p->pgrp, up->pgrp);
+		/* inherit notallowed */
+		memmove(p->pgrp->notallowed, up->pgrp->notallowed, sizeof p->pgrp->notallowed);
+	}
+	else {
+		p->pgrp = up->pgrp;
+		incref(p->pgrp);
+	}
+	if(flag & RFNOMNT)
+		devmask(p->pgrp, 1, devs);
+
+	if(flag & RFREND)
+		p->rgrp = newrgrp();
+	else {
+		incref(up->rgrp);
+		p->rgrp = up->rgrp;
+	}
+
+	/* Environment group */
+	if(flag & (RFENVG|RFCENVG)) {
+		p->egrp = smalloc(sizeof(Egrp));
+		p->egrp->ref = 1;
+		if(flag & RFENVG)
+			envcpy(p->egrp, up->egrp);
+	}
+	else {
+		p->egrp = up->egrp;
+		incref(p->egrp);
+	}
+
+	procfork(p);
+
+	poperror();	/* abortion */
+
+	if((flag&RFNOWAIT) == 0){
+		p->parent = up;
+		lock(&up->exl);
+		up->nchild++;
+		unlock(&up->exl);
+	}
+
+	/*
+	 *  since the bss/data segments are now shareable,
+	 *  any mmu info about this process is now stale
+	 *  (i.e. has bad properties) and has to be discarded.
+	 */
+	flushmmu();
+
+	procpriority(p, up->basepri, up->fixedpri);
+	if(up->wired)
+		procwired(p, up->affinity);
+
+	ready(p);
+	sched();
+	return pid;
+}
+
+static int
+shargs(char *s, int n, char **ap, int nap)	/* parse a "#!interp args" header into ap[]; returns arg count, 0 if no newline in buffer, -1 if not a script */
+{
+	char *p;
+	int i;
+
+	if(n <= 2 || s[0] != '#' || s[1] != '!')
+		return -1;
+	s += 2;
+	n -= 2;		/* skip #! */
+	if((p = memchr(s, '\n', n)) == nil)
+		return 0;	/* first line longer than the header buffer: not parsable */
+	*p = 0;
+	i = tokenize(s, ap, nap-1);	/* split on whitespace; reserve one slot for the nil terminator */
+	ap[i] = nil;
+	return i;
+}
+
+ulong
+beswal(ulong l)	/* interpret the in-memory bytes of l as a big-endian 32-bit value */
+{
+	uchar *p;
+
+	p = (uchar*)&l;
+	return (p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3];	/* byte-wise, so correct on either host endianness */
+}
+
+uvlong
+beswav(uvlong v)	/* interpret the in-memory bytes of v as a big-endian 64-bit value */
+{
+	uchar *p;
+
+	p = (uchar*)&v;
+	return ((uvlong)p[0]<<56) | ((uvlong)p[1]<<48) | ((uvlong)p[2]<<40)
+				  | ((uvlong)p[3]<<32) | ((uvlong)p[4]<<24)
+				  | ((uvlong)p[5]<<16) | ((uvlong)p[6]<<8)
+				  | (uvlong)p[7];
+}
+
+uintptr
+sysexec(va_list list)
+{
+	union {
+		struct {
+			Exec;
+			uvlong	hdr[1];
+		} ehdr;
+		struct {
+			Elfhdr	ehdr;
+			Elfphdr	phdrs[2];
+		} elf;
+		char buf[256];
+	} u;
+	char line[256];
+	char *progarg[32+1];
+	volatile char *args, *elem, *file0;
+	char **argv, **argp, **argp0;
+	char *a, *e, *charp, *file;
+	int i, n, indir;
+	ulong magic, ssize, nargs, nbytes;
+	uintptr t, d, b, entry, text, data, bss, bssend, tstk, align;
+	uintptr textoff, dataoff;
+	Segment *s, *ts;
+	Image *img;
+	Tos *tos;
+	Chan *tc;
+	Fgrp *f;
+
+	textoff = 0;
+	dataoff = 0;
+	args = elem = nil;
+	file0 = va_arg(list, char*);
+	validaddr((uintptr)file0, 1, 0);
+	argp0 = va_arg(list, char**);
+	evenaddr((uintptr)argp0);
+	validaddr((uintptr)argp0, 2*BY2WD, 0);
+	if(*argp0 == nil)
+		error(Ebadarg);
+	file0 = validnamedup(file0, 1);
+	if(waserror()){
+		free(file0);
+		free(elem);
+		free(args);
+		/* Disaster after commit */
+		if(up->seg[SSEG] == nil)
+			pexit(up->errstr, 1);
+		nexterror();
+	}
+	align = BY2PG-1;
+	indir = 0;
+	file = file0;
+	for(;;){
+		tc = namec(file, Aopen, OEXEC, 0);
+		if(waserror()){
+			cclose(tc);
+			nexterror();
+		}
+		if(!indir)
+			kstrdup(&elem, up->genbuf);
+
+		n = devtab[tc->type]->read(tc, u.buf, sizeof(u.buf), 0);
+		if(n < 4)
+			error(Ebadexec);
+		magic = beswal(u.ehdr.magic);
+		if(magic == AOUT_MAGIC){
+			if(n < sizeof(Exec))
+				error(Ebadexec);
+			if(magic & HDR_MAGIC) {
+				if(n < sizeof(u.ehdr))
+					error(Ebadexec);
+				entry = beswav(u.ehdr.hdr[0]);
+				text = UTZERO+sizeof(u.ehdr);
+			} else {
+				entry = beswal(u.ehdr.entry);
+				text = UTZERO+sizeof(Exec);
+			}
+			if(entry < text)
+				error(Ebadexec);
+			text += beswal(u.ehdr.text);
+			dataoff = text-UTZERO;
+			if(text <= entry || text >= (USTKTOP-USTKSIZE))
+				error(Ebadexec);
+			data = beswal(u.ehdr.data);
+			bss = beswal(u.ehdr.bss);
+			switch(magic){
+			case S_MAGIC:	/* 2MB segment alignment for amd64 */
+				align = 0x1fffff;
+				break;
+			case P_MAGIC:	/* 16K segment alignment for spim */
+			case V_MAGIC:	/* 16K segment alignment for mips */
+				align = 0x3fff;
+				break;
+			case R_MAGIC:	/* 64K segment alignment for arm64 */
+				align = 0xffff;
+				break;
+			}
+			break; /* for binary */
+		}
+		if(magic == ELF_MAGIC){
+			if(n < sizeof(u.elf))
+				error(Ebadexec);
+			if(u.elf.ehdr.phnum != 2)
+				error(Ebadexec);
+			if(u.elf.phdrs[0].flags != (PF_X|PF_R) || u.elf.phdrs[1].flags != (PF_R|PF_W))
+				error(Ebadexec);
+			entry = u.elf.ehdr.entry;
+			text = UTZERO+u.elf.phdrs[0].filesz;
+			textoff = u.elf.phdrs[0].off;
+			data = u.elf.phdrs[1].filesz;
+			dataoff = u.elf.phdrs[1].off;
+			bss = u.elf.phdrs[1].memsz - u.elf.phdrs[1].filesz;
+			switch(u.elf.ehdr.machine){
+			case EM_X86_64:		/* 2MB segment alignment for amd64 */
+				align = 0x1fffff;
+				break;
+			case EM_AARCH64:	/* 64K segment alignment for arm64 */
+				align = 0xffff;
+				break;
+			}
+			break;
+		}
+
+		if(indir++)
+			error(Ebadexec);
+
+		/*
+		 * Process #! /bin/sh args ...
+		 */
+		memmove(line, u.buf, n);
+		n = shargs(line, n, progarg, nelem(progarg));
+		if(n < 1)
+			error(Ebadexec);
+		/*
+		 * First arg becomes complete file name
+		 */
+		progarg[n++] = file;
+		progarg[n] = nil;
+		argp0++;
+		file = progarg[0];
+		progarg[0] = elem;
+		poperror();
+		cclose(tc);
+	}
+
+	t = (text+align) & ~align;
+	text -= UTZERO;
+	align = BY2PG-1;
+	d = (t + data + align) & ~align;
+	bssend = t + data + bss;
+	b = (bssend + align) & ~align;
+	if(t >= (USTKTOP-USTKSIZE) || d >= (USTKTOP-USTKSIZE) || b >= (USTKTOP-USTKSIZE))
+		error(Ebadexec);
+
+	/*
+	 * Args: pass 1: count
+	 */
+	nbytes = sizeof(Tos);		/* hole for profiling clock at top of stack (and more) */
+	nargs = 0;
+	if(indir){
+		argp = progarg;
+		while(*argp != nil){
+			a = *argp++;
+			nbytes += strlen(a) + 1;
+			nargs++;
+		}
+	}
+	argp = argp0;
+	while(*argp != nil){
+		a = *argp++;
+		if(((uintptr)argp&(BY2PG-1)) < BY2WD)
+			validaddr((uintptr)argp, BY2WD, 0);
+		validaddr((uintptr)a, 1, 0);
+		e = vmemchr(a, 0, USTKSIZE);
+		if(e == nil)
+			error(Ebadarg);
+		nbytes += (e - a) + 1;
+		if(nbytes >= USTKSIZE)
+			error(Enovmem);
+		nargs++;
+	}
+	ssize = BY2WD*(nargs+1) + ((nbytes+(BY2WD-1)) & ~(BY2WD-1));
+
+	/*
+	 * 8-byte align SP for those (e.g. sparc) that need it.
+	 * execregs() will subtract another 4 bytes for argc.
+	 */
+	if(BY2WD == 4 && (ssize+4) & 7)
+		ssize += 4;
+
+	if(PGROUND(ssize) >= USTKSIZE)
+		error(Enovmem);
+
+	/*
+	 * Build the stack segment, putting it in kernel virtual for the moment
+	 */
+	qlock(&up->seglock);
+	if(waserror()){
+		s = up->seg[ESEG];
+		if(s != nil){
+			up->seg[ESEG] = nil;
+			putseg(s);
+		}
+		qunlock(&up->seglock);
+		nexterror();
+	}
+
+	s = up->seg[SSEG];
+	do {
+		tstk = s->base;
+		if(tstk <= USTKSIZE)
+			error(Enovmem);
+	} while((s = isoverlap(tstk-USTKSIZE, USTKSIZE)) != nil);
+	up->seg[ESEG] = newseg(SG_STACK | SG_NOEXEC, tstk-USTKSIZE, USTKSIZE/BY2PG);
+
+	/*
+	 * Args: pass 2: assemble; the pages will be faulted in
+	 */
+	tos = (Tos*)(tstk - sizeof(Tos));
+	tos->cyclefreq = m->cyclefreq;
+	tos->kcycles = 0;
+	tos->pcycles = 0;
+	tos->clock = 0;
+
+	argv = (char**)(tstk - ssize);
+	charp = (char*)(tstk - nbytes);
+	if(indir)
+		argp = progarg;
+	else
+		argp = argp0;
+
+	for(i=0; i<nargs; i++){
+		if(indir && *argp==nil) {
+			indir = 0;
+			argp = argp0;
+		}
+		*argv++ = charp + (USTKTOP-tstk);
+		a = *argp++;
+		if(indir)
+			e = strchr(a, 0);
+		else {
+			if(charp >= (char*)tos)
+				error(Ebadarg);
+			validaddr((uintptr)a, 1, 0);
+			e = vmemchr(a, 0, (char*)tos - charp);
+			if(e == nil)
+				error(Ebadarg);
+		}
+		n = (e - a) + 1;
+		memmove(charp, a, n);
+		charp += n;
+	}
+
+	/* copy args; easiest from new process's stack */
+	a = (char*)(tstk - nbytes);
+	n = charp - a;
+	if(n > 128)	/* don't waste too much space on huge arg lists */
+		n = 128;
+	args = smalloc(n);
+	memmove(args, a, n);
+	if(n>0 && args[n-1]!='\0'){
+		/* make sure last arg is NUL-terminated */
+		/* put NUL at UTF-8 character boundary */
+		for(i=n-1; i>0; --i)
+			if(fullrune(args+i, n-i))
+				break;
+		args[i] = 0;
+		n = i+1;
+	}
+
+	/*
+	 * Committed.
+	 * Free old memory.
+	 * Special segments are maintained across exec
+	 */
+	for(i = SSEG; i <= BSEG; i++) {
+		s = up->seg[i];
+		if(s != nil) {
+			/* prevent a second free if we have an error */
+			up->seg[i] = nil;
+			putseg(s);
+		}
+	}
+	for(i = ESEG+1; i < NSEG; i++) {
+		s = up->seg[i];
+		if(s != nil && (s->type&SG_CEXEC) != 0) {
+			up->seg[i] = nil;
+			putseg(s);
+		}
+	}
+
+	/* Text.  Shared. Attaches to cache image if possible */
+	/* attachimage returns a locked cache image */
+	img = attachimage(SG_TEXT | SG_RONLY, tc, UTZERO, (t-UTZERO)>>PGSHIFT);
+	ts = img->s;
+	up->seg[TSEG] = ts;
+	ts->flushme = 1;
+	ts->fstart = textoff;
+	ts->flen = text;
+	unlock(img);
+
+	/* Data. Shared. */
+	s = newseg(SG_DATA, t, (d-t)>>PGSHIFT);
+	up->seg[DSEG] = s;
+
+	/* Attached by hand */
+	incref(img);
+	s->image = img;
+	s->fstart = dataoff;
+	s->flen = data;
+
+	/* BSS. Zero fill on demand */
+	up->seg[BSEG] = newseg(SG_BSS, d, (b-d)>>PGSHIFT);
+
+	/*
+	 * Move the stack
+	 */
+	s = up->seg[ESEG];
+	up->seg[ESEG] = nil;
+	s->base = USTKTOP-USTKSIZE;
+	s->top = USTKTOP;
+	relocateseg(s, USTKTOP-tstk);
+	up->seg[SSEG] = s;
+	qunlock(&up->seglock);
+	poperror();	/* seglock */
+
+	/*
+	 * Close on exec
+	 */
+	if((f = up->fgrp) != nil) {
+		for(i=0; i<=f->maxfd; i++)
+			fdclose(i, CCEXEC);
+	}
+
+	poperror();	/* tc */
+	cclose(tc);
+	poperror();	/* file0 */
+	free(file0);
+
+	qlock(&up->debug);
+	free(up->text);
+	up->text = elem;
+	free(up->args);
+	up->args = args;
+	up->nargs = n;
+	up->setargs = 0;
+
+	freenotes(up);
+	freenote(up->lastnote);
+	up->lastnote = nil;
+	up->notify = nil;
+	up->notified = 0;
+	up->ureg = nil;
+	up->privatemem = 0;
+	up->noswap = 0;
+	up->pcycles = -up->kentry;
+	procsetup(up);
+	qunlock(&up->debug);
+
+	up->errbuf0[0] = '\0';
+	up->errbuf1[0] = '\0';
+
+	/*
+	 *  At this point, the mmu contains info about the old address
+	 *  space and needs to be flushed
+	 */
+	flushmmu();
+
+	if(up->hang)
+		up->procctl = Proc_stopme;
+	return execregs(entry, ssize, nargs);
+}
+
+int
+return0(void*)	/* sleep condition that is never true: tsleep waits the full timeout */
+{
+	return 0;
+}
+
+uintptr
+syssleep(va_list list)
+{
+	long ms;
+
+	ms = va_arg(list, long);
+	if(ms <= 0) {
+		if (up->edf != nil && (up->edf->flags & Admitted))
+			edfyield();
+		else
+			yield();
+	} else {
+		tsleep(&up->sleep, return0, 0, ms);
+	}
+	return 0;
+}
+
+uintptr
+sysalarm(va_list list)	/* arm (ms>0) or cancel (0) the process alarm; returns time left on the old alarm */
+{
+	return procalarm(va_arg(list, ulong));
+}
+
+
+uintptr
+sysexits(va_list list)	/* terminate the calling process with an optional status string */
+{
+	char *status;
+	char *inval = "invalid exit string";
+	char buf[ERRMAX];
+
+	status = va_arg(list, char*);
+	if(status != nil){
+		if(waserror())
+			status = inval;	/* bad user pointer: substitute a canned status instead of failing */
+		else{
+			validaddr((uintptr)status, 1, 0);
+			if(vmemchr(status, 0, ERRMAX) == nil){
+				memmove(buf, status, ERRMAX);	/* unterminated: clip to ERRMAX-1 and terminate */
+				buf[ERRMAX-1] = 0;
+				status = buf;
+			}
+			poperror();
+		}
+
+	}
+	pexit(status, 1);	/* does not return */
+}
+
+uintptr
+sys_wait(va_list list)
+{
+	ulong pid;
+	Waitmsg w;
+	OWaitmsg *ow;
+
+	ow = va_arg(list, OWaitmsg*);
+	if(ow == nil)
+		pid = pwait(nil);
+	else {
+		validaddr((uintptr)ow, sizeof(OWaitmsg), 1);
+		evenaddr((uintptr)ow);
+		pid = pwait(&w);
+	}
+	if(ow != nil){
+		readnum(0, ow->pid, NUMSIZE, w.pid, NUMSIZE);
+		readnum(0, ow->time+TUser*NUMSIZE, NUMSIZE, w.time[TUser], NUMSIZE);
+		readnum(0, ow->time+TSys*NUMSIZE, NUMSIZE, w.time[TSys], NUMSIZE);
+		readnum(0, ow->time+TReal*NUMSIZE, NUMSIZE, w.time[TReal], NUMSIZE);
+		strncpy(ow->msg, w.msg, sizeof(ow->msg)-1);
+		ow->msg[sizeof(ow->msg)-1] = '\0';
+	}
+	return pid;
+}
+
+uintptr
+sysawait(va_list list)
+{
+	char *p;
+	Waitmsg w;
+	uint n;
+
+	p = va_arg(list, char*);
+	n = va_arg(list, uint);
+	validaddr((uintptr)p, n, 1);
+	pwait(&w);
+	return (uintptr)snprint(p, n, "%d %lud %lud %lud %q",
+		w.pid,
+		w.time[TUser], w.time[TSys], w.time[TReal],
+		w.msg);
+}
+
+void
+werrstr(char *fmt, ...)
+{
+	va_list va;
+
+	if(up == nil)
+		return;
+
+	va_start(va, fmt);
+	vseprint(up->syserrstr, up->syserrstr+ERRMAX, fmt, va);
+	va_end(va);
+}
+
+static int
+generrstr(char *buf, uint nbuf)	/* errstr(2) semantics: swap the user's buffer contents with the process error string */
+{
+	char *err;
+
+	if(nbuf == 0)
+		error(Ebadarg);
+	if(nbuf > ERRMAX)
+		nbuf = ERRMAX;
+	validaddr((uintptr)buf, nbuf, 1);
+
+	err = up->errstr;
+	utfecpy(err, err+nbuf, buf);	/* copy the user's string in, clipped on a UTF-8 rune boundary */
+	utfecpy(buf, buf+nbuf, up->syserrstr);	/* copy the old error string out to the user */
+
+	up->errstr = up->syserrstr;	/* swap the two buffers rather than copying again */
+	up->syserrstr = err;
+	
+	return 0;
+}
+
+uintptr
+syserrstr(va_list list)
+{
+	char *buf;
+	uint len;
+
+	buf = va_arg(list, char*);
+	len = va_arg(list, uint);
+	return (uintptr)generrstr(buf, len);
+}
+
+/* compatibility for old binaries */
+uintptr
+sys_errstr(va_list list)
+{
+	return (uintptr)generrstr(va_arg(list, char*), 64);
+}
+
+uintptr
+sysnotify(va_list list)
+{
+	int (*f)(void*, char*);
+	f = va_arg(list, void*);
+	if(f != nil)
+		validaddr((uintptr)f, sizeof(void*), 0);
+	up->notify = f;
+	return 0;
+}
+
+uintptr
+sysnoted(va_list list)	/* return from a note handler; only NRSTR is meaningful when not in a notification */
+{
+	if(va_arg(list, int) != NRSTR && !up->notified)
+		error(Egreg);
+	return 0;
+}
+
+uintptr
+syssegbrk(va_list list)
+{
+	int i;
+	uintptr addr;
+	Segment *s;
+
+	addr = va_arg(list, uintptr);
+	for(i = 0; i < NSEG; i++) {
+		s = up->seg[i];
+		if(s == nil || addr < s->base || addr >= s->top)
+			continue;
+		switch(s->type&SG_TYPE) {
+		case SG_TEXT:
+		case SG_DATA:
+		case SG_STACK:
+		case SG_PHYSICAL:
+		case SG_FIXED:
+		case SG_STICKY:
+			error(Ebadarg);
+		default:
+			return ibrk(va_arg(list, uintptr), i);
+		}
+	}
+	error(Ebadarg);
+}
+
+uintptr
+syssegattach(va_list list)
+{
+	int attr;
+	char *name;
+	uintptr va;
+	ulong len;
+
+	attr = va_arg(list, int);
+	name = va_arg(list, char*);
+	va = va_arg(list, uintptr);
+	len = va_arg(list, ulong);
+	validaddr((uintptr)name, 1, 0);
+	name = validnamedup(name, 1);
+	if(waserror()){
+		free(name);
+		nexterror();
+	}
+	va = segattach(attr, name, va, len);
+	free(name);
+	poperror();
+	return va;
+}
+
+uintptr
+syssegdetach(va_list list)
+{
+	int i;
+	uintptr addr;
+	Segment *s;
+
+	addr = va_arg(list, uintptr);
+
+	qlock(&up->seglock);
+	if(waserror()){
+		qunlock(&up->seglock);
+		nexterror();
+	}
+
+	for(i = 0; i < NSEG; i++)
+		if((s = up->seg[i]) != nil) {
+			qlock(s);
+			if((addr >= s->base && addr < s->top) ||
+			   (s->top == s->base && addr == s->base))
+				goto found;
+			qunlock(s);
+		}
+
+	error(Ebadarg);
+
+found:
+	/*
+	 * Check we are not detaching the initial stack segment.
+	 */
+	if(s == up->seg[SSEG]){
+		qunlock(s);
+		error(Ebadarg);
+	}
+	up->seg[i] = nil;
+	qunlock(s);
+	putseg(s);
+	qunlock(&up->seglock);
+	poperror();
+
+	/* Ensure we flush any entries from the lost segment */
+	flushmmu();
+	return 0;
+}
+
+uintptr
+syssegfree(va_list list)
+{
+	Segment *s;
+	uintptr from, to;
+
+	from = va_arg(list, uintptr);
+	to = va_arg(list, ulong);
+	to += from;
+	if(to < from)
+		error(Ebadarg);
+	s = seg(up, from, 1);
+	if(s == nil)
+		error(Ebadarg);
+	to &= ~(BY2PG-1);
+	from = PGROUND(from);
+	if(from >= to) {
+		qunlock(s);
+		return 0;
+	}
+	if(to > s->top) {
+		qunlock(s);
+		error(Ebadarg);
+	}
+	mfreeseg(s, from, (to - from) / BY2PG);
+	qunlock(s);
+	flushmmu();
+	return 0;
+}
+
+/* For binary compatibility */
+uintptr
+sysbrk_(va_list list)
+{
+	return ibrk(va_arg(list, uintptr), BSEG);
+}
+
+uintptr
+sysrendezvous(va_list list)
+{
+	uintptr tag, val, new;
+	Proc *p, **l;
+
+	tag = va_arg(list, uintptr);
+	new = va_arg(list, uintptr);
+	l = &REND(up->rgrp, tag);
+
+	lock(up->rgrp);
+	for(p = *l; p != nil; p = p->rendhash) {
+		if(p->rendtag == tag) {
+			*l = p->rendhash;
+			val = p->rendval;
+			p->rendval = new;
+			unlock(up->rgrp);
+
+			ready(p);
+
+			return val;
+		}
+		l = &p->rendhash;
+	}
+
+	/* Going to sleep here */
+	up->rendtag = tag;
+	up->rendval = new;
+	up->rendhash = *l;
+	*l = up;
+	up->state = Rendezvous;
+	unlock(up->rgrp);
+
+	sched();
+
+	return up->rendval;
+}
+
+/*
+ * The implementation of semaphores is complicated by needing
+ * to avoid rescheduling in syssemrelease, so that it is safe
+ * to call from real-time processes.  This means syssemrelease
+ * cannot acquire any qlocks, only spin locks.
+ * 
+ * Semacquire and semrelease must both manipulate the semaphore
+ * wait list.  Lock-free linked lists only exist in theory, not
+ * in practice, so the wait list is protected by a spin lock.
+ * 
+ * The semaphore value *addr is stored in user memory, so it
+ * cannot be read or written while holding spin locks.
+ * 
+ * Thus, we can access the list only when holding the lock, and
+ * we can access the semaphore only when not holding the lock.
+ * This makes things interesting.  Note that sleep's condition function
+ * is called while holding two locks - r and up->rlock - so it cannot
+ * access the semaphore value either.
+ * 
+ * An acquirer announces its intention to try for the semaphore
+ * by putting a Sema structure onto the wait list and then
+ * setting Sema.waiting.  After one last check of semaphore,
+ * the acquirer sleeps until Sema.waiting==0.  A releaser of n
+ * must wake up n acquirers who have Sema.waiting set.  It does
+ * this by clearing Sema.waiting and then calling wakeup.
+ * 
+ * There are three interesting races here.  
+ 
+ * The first is that in this particular sleep/wakeup usage, a single
+ * wakeup can rouse a process from two consecutive sleeps!  
+ * The ordering is:
+ * 
+ * 	(a) set Sema.waiting = 1
+ * 	(a) call sleep
+ * 	(b) set Sema.waiting = 0
+ * 	(a) check Sema.waiting inside sleep, return w/o sleeping
+ * 	(a) try for semaphore, fail
+ * 	(a) set Sema.waiting = 1
+ * 	(a) call sleep
+ * 	(b) call wakeup(a)
+ * 	(a) wake up again
+ * 
+ * This is okay - semacquire will just go around the loop
+ * again.  It does mean that at the top of the for(;;) loop in
+ * semacquire, phore.waiting might already be set to 1.
+ * 
+ * The second is that a releaser might wake an acquirer who is
+ * interrupted before he can acquire the lock.  Since
+ * release(n) issues only n wakeup calls -- only n can be used
+ * anyway -- if the interrupted process is not going to use his
+ * wakeup call he must pass it on to another acquirer.
+ * 
+ * The third race is similar to the second but more subtle.  An
+ * acquirer sets waiting=1 and then does a final canacquire()
+ * before going to sleep.  The opposite order would result in
+ * missing wakeups that happen between canacquire and
+ * waiting=1.  (In fact, the whole point of Sema.waiting is to
+ * avoid missing wakeups between canacquire() and sleep().) But
+ * there can be spurious wakeups between a successful
+ * canacquire() and the following semdequeue().  This wakeup is
+ * not useful to the acquirer, since he has already acquired
+ * the semaphore.  Like in the previous case, though, the
+ * acquirer must pass the wakeup call along.
+ * 
+ * This is all rather subtle.  The code below has been verified
+ * with the spin model /sys/src/9/port/semaphore.p.  The
+ * original code anticipated the second race but not the first
+ * or third, which were caught only with spin.  The first race
+ * is mentioned in /sys/doc/sleep.ps, but I'd forgotten about it.
+ * It was lucky that my abstract model of sleep/wakeup still managed
+ * to preserve that behavior.
+ *
+ * I remain slightly concerned about memory coherence
+ * outside of locks.  The spin model does not take 
+ * queued processor writes into account so we have to
+ * think hard.  The only variables accessed outside locks
+ * are the semaphore value itself and the boolean flag
+ * Sema.waiting.  The value is only accessed with cmpswap,
+ * whose job description includes doing the right thing as
+ * far as memory coherence across processors.  That leaves
+ * Sema.waiting.  To handle it, we call coherence() before each
+ * read and after each write.		- rsc
+ */
+
+/* Add semaphore p with addr a to list in seg. */
+static void
+semqueue(Segment *s, long *a, Sema *p)
+{
+	memset(p, 0, sizeof *p);
+	p->addr = a;
+	lock(&s->sema);	/* uses s->sema.Rendez.Lock, but no one else is */
+	p->next = &s->sema;
+	p->prev = s->sema.prev;
+	p->next->prev = p;
+	p->prev->next = p;
+	unlock(&s->sema);
+}
+
+/* Remove semaphore p from list in seg. */
+static void
+semdequeue(Segment *s, Sema *p)
+{
+	lock(&s->sema);
+	p->next->prev = p->prev;
+	p->prev->next = p->next;
+	unlock(&s->sema);
+}
+
+/* Wake up n waiters with addr a on list in seg. */
+static void
+semwakeup(Segment *s, long *a, long n)
+{
+	Sema *p;
+	
+	lock(&s->sema);
+	for(p=s->sema.next; p!=&s->sema && n>0; p=p->next){
+		if(p->addr == a && p->waiting){
+			p->waiting = 0;
+			coherence();
+			wakeup(p);
+			n--;
+		}
+	}
+	unlock(&s->sema);
+}
+
+/* Add delta to semaphore and wake up waiters as appropriate. */
+static long
+semrelease(Segment *s, long *addr, long delta)
+{
+	long value;
+
+	do
+		value = *addr;
+	while(!cmpswap(addr, value, value+delta));
+	semwakeup(s, addr, delta);
+	return value+delta;
+}
+
+/* Try to acquire semaphore using compare-and-swap */
+static int
+canacquire(long *addr)
+{
+	long value;
+	
+	while((value=*addr) > 0)
+		if(cmpswap(addr, value, value-1))
+			return 1;
+	return 0;
+}		
+
+/* Should we wake up? */
+static int
+semawoke(void *p)
+{
+	coherence();
+	return !((Sema*)p)->waiting;
+}
+
+/* Acquire semaphore (subtract 1). */
+static int
+semacquire(Segment *s, long *addr, int block)
+{
+	int acquired;
+	Sema phore;
+
+	if(canacquire(addr))
+		return 1;
+	if(!block)
+		return 0;
+	semqueue(s, addr, &phore);
+	if(acquired = !waserror()){
+		for(;;){
+			phore.waiting = 1;
+			coherence();
+			if(canacquire(addr))
+				break;
+			sleep(&phore, semawoke, &phore);
+		}
+		poperror();
+	}
+	semdequeue(s, &phore);
+	coherence();	/* not strictly necessary due to lock in semdequeue */
+	if(!phore.waiting)
+		semwakeup(s, addr, 1);
+	if(!acquired)
+		nexterror();
+	return 1;
+}
+
+/* Acquire semaphore or time-out */
+static int
+tsemacquire(Segment *s, long *addr, ulong ms)
+{
+	int timedout, acquired;
+	ulong t;
+	Sema phore;
+
+	if(canacquire(addr))
+		return 1;
+	if(ms == 0)
+		return 0;
+	timedout = 0;
+	semqueue(s, addr, &phore);
+	if(acquired = !waserror()){
+		for(;;){
+			phore.waiting = 1;
+			coherence();
+			if(canacquire(addr))
+				break;
+			t = MACHP(0)->ticks;
+			tsleep(&phore, semawoke, &phore, ms);
+			t = TK2MS(MACHP(0)->ticks - t);
+			if(t >= ms){
+				timedout = 1;
+				break;
+			}
+			ms -= t;
+		}
+		poperror();
+	}
+	semdequeue(s, &phore);
+	coherence();	/* not strictly necessary due to lock in semdequeue */
+	if(!phore.waiting)
+		semwakeup(s, addr, 1);
+	if(!acquired)
+		nexterror();
+	return !timedout;
+}
+
+uintptr
+syssemacquire(va_list list)
+{
+	int block;
+	long *addr;
+	Segment *s;
+
+	addr = va_arg(list, long*);
+	block = va_arg(list, int);
+	evenaddr((uintptr)addr);
+	s = seg(up, (uintptr)addr, 0);
+	if(s == nil || (s->type&SG_RONLY) != 0 || (uintptr)addr+sizeof(long) > s->top){
+		validaddr((uintptr)addr, sizeof(long), 1);
+		error(Ebadarg);
+	}
+	if(*addr < 0)
+		error(Ebadarg);
+	return (uintptr)semacquire(s, addr, block);
+}
+
+uintptr
+systsemacquire(va_list list)
+{
+	long *addr;
+	ulong ms;
+	Segment *s;
+
+	addr = va_arg(list, long*);
+	ms = va_arg(list, ulong);
+	evenaddr((uintptr)addr);
+	s = seg(up, (uintptr)addr, 0);
+	if(s == nil || (s->type&SG_RONLY) != 0 || (uintptr)addr+sizeof(long) > s->top){
+		validaddr((uintptr)addr, sizeof(long), 1);
+		error(Ebadarg);
+	}
+	if(*addr < 0)
+		error(Ebadarg);
+	return (uintptr)tsemacquire(s, addr, ms);
+}
+
+uintptr
+syssemrelease(va_list list)
+{
+	long *addr, delta;
+	Segment *s;
+
+	addr = va_arg(list, long*);
+	delta = va_arg(list, long);
+	evenaddr((uintptr)addr);
+	s = seg(up, (uintptr)addr, 0);
+	if(s == nil || (s->type&SG_RONLY) != 0 || (uintptr)addr+sizeof(long) > s->top){
+		validaddr((uintptr)addr, sizeof(long), 1);
+		error(Ebadarg);
+	}
+	/* delta == 0 is a no-op, not a release */
+	if(delta < 0 || *addr < 0)
+		error(Ebadarg);
+	return (uintptr)semrelease(s, addr, delta);
+}
+
+/* For binary compatibility */
+uintptr
+sys_nsec(va_list list)
+{
+	vlong *v;
+
+	/* return in register on 64bit machine */
+	if(sizeof(uintptr) == sizeof(vlong)){
+		USED(list);
+		return (uintptr)todget(nil);
+	}
+
+	v = va_arg(list, vlong*);
+	evenaddr((uintptr)v);
+	validaddr((uintptr)v, sizeof(vlong), 1);
+	*v = todget(nil);
+	return 0;
+}