ref: e5e12f806ecf322946c093fe59d909beb9160d0c
author: kws <kws@cirno>
date: Sun Aug 4 16:50:18 EDT 2024
elf loader
--- /dev/null
+++ b/bind.rc
@@ -1,0 +1,4 @@
+#!/bin/rc
+
+# Overlay this tree's headers and kernel sources onto the system
+# namespace so #include paths and mkfiles resolve to local copies.
+bind -b sys/include /sys/include
+bind -bc sys/src/9/port /sys/src/9/port
--- /dev/null
+++ b/hello/Makefile
@@ -1,0 +1,23 @@
+# Cross toolchain: bare-metal x86-64 ELF, no host libc.
+# Defined before the rules that use them for readability (make
+# expands these lazily, but reading order should match use).
+CC= x86_64-elf-gcc
+AS= x86_64-elf-as
+LD= x86_64-elf-ld
+
+CFLAGS=\
+	-O2\
+	-Wall\
+	-fno-pie\
+	-mno-mmx\
+	-mno-sse\
+	-mno-sse2\
+	-ffreestanding
+
+LDFLAGS=\
+	-nostdlib\
+	-no-pie\
+	-Tlinker.ld
+
+# Default target: freestanding ELF binary for the kernel's ELF loader.
+# l.o and hello.o are built by the implicit .s/.c rules using the
+# cross tools above; relink when the linker script changes.
+hello.elf: l.o hello.o linker.ld
+	$(LD) -o $@ l.o hello.o $(LDFLAGS)
+
+# .PHONY so a stray file named "clean" cannot shadow the target.
+.PHONY: clean
+clean:
+	rm -f *.o hello.elf
--- /dev/null
+++ b/hello/hello.c
@@ -1,0 +1,44 @@
+/* Plan 9 system call numbers; NOTE(review): assumed to match the
+ * kernel's syscall table — confirm against /sys/src/libc/9syscall. */
+#define SYS_EXITS 8
+#define SYS_PWRITE 51
+
+typedef long long vlong;
+
+/* assembly stub in l.s: syscall number first, then up to 4 arguments */
+extern vlong syscall(vlong, ...);
+
+/* Terminate the process with exit status msg (nil means success). */
+_Noreturn
+void
+exits(char *msg)
+{
+	syscall(SYS_EXITS, msg);
+	__builtin_unreachable();	/* SYS_EXITS does not return */
+}
+
+/*
+ * Write nbytes from buf to fd at the given offset.
+ * NOTE(review): offset -1 appears to mean "use the fd's current
+ * offset" (that is how puts calls it) — confirm against the kernel.
+ */
+long
+pwrite(int fd, void *buf, long nbytes, vlong offset)
+{
+	return syscall(SYS_PWRITE, fd, buf, nbytes, offset);
+}
+
+/* Length of NUL-terminated string s, excluding the terminator. */
+long
+strlen(char *s)
+{
+	long n;
+
+	n = 0;
+	while(s[n] != '\0')
+		n++;
+	return n;
+}
+
+/* Write s followed by a newline to file descriptor 1. */
+void
+puts(char *s)
+{
+	pwrite(1, s, strlen(s), -1LL);
+	pwrite(1, "\n", 1, -1LL);
+}
+
+/*
+ * Entry point, called from _start with a pointer to the argument
+ * vector (nil-terminated; argv[0] is the program name).  Prints
+ * each argument on its own line, then exits successfully.
+ */
+_Noreturn
+void
+main(char *argv[])
+{
+	while(*argv)
+		puts(*argv++);
+	exits(0);
+}
--- /dev/null
+++ b/hello/l.s
@@ -1,0 +1,23 @@
+# Program entry.  NOTE(review): assumes the kernel places argc at
+# the initial SP with the argument vector immediately above it, so
+# SP+8 is &argv[0] — confirm against execregs().
+	.global _start
+_start:
+	lea 0x08(%rsp), %rdi
+	call main
+1:	jmp 1b			# main is _Noreturn; spin if it ever returns
+
+# vlong syscall(vlong num, ...)
+# NOTE(review): assumed Plan 9 amd64 convention: syscall number in
+# BP, arguments on the user stack starting at SP+8 — confirm.
+	.global syscall
+syscall:
+	sub $0x40, %rsp
+	mov %rbp, 0x28(%rsp)	# preserve callee-saved registers
+	mov %rbx, 0x30(%rsp)
+
+	mov %rdi, %rbp		# syscall number
+	mov %rsi, 0x08(%rsp)	# spill register args where the kernel reads them
+	mov %rdx, 0x10(%rsp)
+	mov %rcx, 0x18(%rsp)
+	mov %r8, 0x20(%rsp)
+	syscall
+
+	mov 0x28(%rsp), %rbp
+	mov 0x30(%rsp), %rbx
+	add $0x40, %rsp
+	ret
--- /dev/null
+++ b/hello/linker.ld
@@ -1,0 +1,33 @@
+OUTPUT_FORMAT(elf64-x86-64)
+
+/* Exactly two loadable segments, the layout the kernel's ELF loader
+ * accepts: R+X text (with the ELF file and program headers mapped
+ * in via FILEHDR PHDRS) followed by R+W data. */
+PHDRS {
+	text PT_LOAD FILEHDR PHDRS;
+	data PT_LOAD;
+}
+
+/* Plan 9 amd64 user text origin; 2MB segment alignment (which also
+ * satisfies arm64's 64K requirement). */
+UTZERO = 0x200000;
+MAXPAGESIZE = 2M;
+
+SECTIONS {
+	/* headers load at UTZERO; code starts right after them */
+	. = UTZERO + SIZEOF_HEADERS;
+
+	.text : {
+		*(.text*)
+	} :text
+
+	/* data segment starts on its own 2MB-aligned boundary */
+	. = ALIGN(MAXPAGESIZE);
+
+	/* NOTE(review): .rodata lands in the writable data segment
+	 * because the loader only supports two phdrs. */
+	.data : {
+		*(.rodata*)
+		*(.data*)
+	} :data
+
+	/* zero-initialized data: extends memsz past filesz */
+	.bss : {
+		*(.bss*)
+	} :data
+
+	/DISCARD/ : {
+		*(.comment*)
+		*(.eh_frame*)
+	}
+}
--- /dev/null
+++ b/sys/include/elf.h
@@ -1,0 +1,47 @@
+typedef struct Elfhdr Elfhdr;
+typedef struct Elfphdr Elfphdr;
+
+/* "\x7fELF" read as one big-endian 32-bit word (cf. beswal) */
+#define ELF_MAGIC 0x7f454c46
+
+enum {
+	EI_NIDENT = 16,
+
+	/* e_machine values */
+	EM_X86_64 = 62,
+	EM_AARCH64 = 183,
+};
+
+/* ELF64 file header (Elf64_Ehdr).  NOTE(review): usize is assumed
+ * to be 64 bits wide; this layout is wrong on a 32-bit kernel. */
+struct Elfhdr {
+	uchar ident[16];
+	u16int type;
+	u16int machine;
+	u32int version;
+	usize entry;	/* entry point virtual address */
+	usize phoff;	/* program header table file offset */
+	usize shoff;	/* section header table file offset */
+	u32int flags;
+	u16int ehsize;
+	u16int phentsize;
+	u16int phnum;
+	u16int shentsize;
+	u16int shnum;
+	u16int shstrndx;
+};
+
+/* ELF64 program header (Elf64_Phdr) */
+struct Elfphdr {
+	u32int type;
+	u32int flags;
+	usize off;	/* segment file offset */
+	usize vaddr;	/* segment virtual address */
+	usize paddr;
+	usize filesz;	/* bytes present in the file */
+	usize memsz;	/* bytes in memory; excess over filesz is bss */
+	usize align;
+};
+
+enum {
+	/* p_type */
+	PT_LOAD = 1,
+
+	/* p_flags permission bits */
+	PF_X = 1<<0,
+	PF_W = 1<<1,
+	PF_R = 1<<2,
+};
--- /dev/null
+++ b/sys/src/9/port/sysproc.c
@@ -1,0 +1,1301 @@
+#include "u.h"
+#include "tos.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+#include "edf.h"
+
+#include <a.out.h>
+#include <elf.h>
+
+uintptr
+sysr1(va_list)
+{
+ if(!iseve())
+ error(Eperm);
+ return 0;
+}
+
+static void
+abortion(void)
+{
+ pexit("fork aborted", 1);
+}
+
+uintptr
+sysrfork(va_list list)
+{
+ Proc *p;
+ int n, i;
+ Fgrp *ofg;
+ Pgrp *opg;
+ Rgrp *org;
+ Egrp *oeg;
+ ulong pid, flag;
+ char *devs;
+
+ flag = va_arg(list, ulong);
+ /* Check flags before we commit */
+ if((flag & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
+ error(Ebadarg);
+ if((flag & (RFNAMEG|RFCNAMEG)) == (RFNAMEG|RFCNAMEG))
+ error(Ebadarg);
+ if((flag & (RFENVG|RFCENVG)) == (RFENVG|RFCENVG))
+ error(Ebadarg);
+
+ /*
+ * Code using RFNOMNT expects to block all but
+ * the following devices.
+ */
+ devs = "|decp";
+ if((flag&RFPROC) == 0) {
+ if(flag & (RFMEM|RFNOWAIT))
+ error(Ebadarg);
+ if(flag & (RFFDG|RFCFDG)) {
+ ofg = up->fgrp;
+ if(flag & RFFDG)
+ up->fgrp = dupfgrp(ofg);
+ else
+ up->fgrp = dupfgrp(nil);
+ closefgrp(ofg);
+ }
+ if(flag & (RFNAMEG|RFCNAMEG)) {
+ opg = up->pgrp;
+ up->pgrp = newpgrp();
+ if(flag & RFNAMEG)
+ pgrpcpy(up->pgrp, opg);
+ /* inherit notallowed */
+ memmove(up->pgrp->notallowed, opg->notallowed, sizeof up->pgrp->notallowed);
+ closepgrp(opg);
+ }
+ if(flag & RFNOMNT)
+ devmask(up->pgrp, 1, devs);
+ if(flag & RFREND) {
+ org = up->rgrp;
+ up->rgrp = newrgrp();
+ closergrp(org);
+ }
+ if(flag & (RFENVG|RFCENVG)) {
+ oeg = up->egrp;
+ up->egrp = smalloc(sizeof(Egrp));
+ up->egrp->ref = 1;
+ if(flag & RFENVG)
+ envcpy(up->egrp, oeg);
+ closeegrp(oeg);
+ }
+ if(flag & RFNOTEG){
+ qlock(&up->debug);
+ setnoteid(up, 0); /* can't error() with 0 argument */
+ qunlock(&up->debug);
+ }
+ return 0;
+ }
+
+ if((p = newproc()) == nil)
+ error("no procs");
+
+ qlock(&up->debug);
+ qlock(&p->debug);
+
+ p->scallnr = up->scallnr;
+ p->s = up->s;
+ p->slash = up->slash;
+ p->dot = up->dot;
+ incref(p->dot);
+
+ p->nnote = 0;
+ p->notify = up->notify;
+ p->notified = 0;
+ p->notepending = 0;
+ p->lastnote = nil;
+
+ if((flag & RFNOTEG) == 0)
+ p->noteid = up->noteid;
+
+ p->procmode = up->procmode;
+ p->privatemem = up->privatemem;
+ p->noswap = up->noswap;
+ p->hang = up->hang;
+ if(up->procctl == Proc_tracesyscall)
+ p->procctl = Proc_tracesyscall;
+ p->kp = 0;
+
+ /*
+ * Craft a return frame which will cause the child to pop out of
+ * the scheduler in user mode with the return register zero
+ */
+ forkchild(p, up->dbgreg);
+
+ kstrdup(&p->text, up->text);
+ kstrdup(&p->user, up->user);
+ kstrdup(&p->args, "");
+ p->nargs = 0;
+ p->setargs = 0;
+
+ p->insyscall = 0;
+ memset(p->time, 0, sizeof(p->time));
+ p->time[TReal] = MACHP(0)->ticks;
+ p->kentry = up->kentry;
+ p->pcycles = -p->kentry;
+
+ pid = pidalloc(p);
+
+ qunlock(&p->debug);
+ qunlock(&up->debug);
+
+ /* Abort the child process on error */
+ if(waserror()){
+ p->kp = 1;
+ kprocchild(p, abortion);
+ ready(p);
+ nexterror();
+ }
+
+ /* Make a new set of memory segments */
+ n = flag & RFMEM;
+ qlock(&p->seglock);
+ if(waserror()){
+ qunlock(&p->seglock);
+ nexterror();
+ }
+ for(i = 0; i < NSEG; i++)
+ if(up->seg[i] != nil)
+ p->seg[i] = dupseg(up->seg, i, n);
+ qunlock(&p->seglock);
+ poperror();
+
+ /* File descriptors */
+ if(flag & (RFFDG|RFCFDG)) {
+ if(flag & RFFDG)
+ p->fgrp = dupfgrp(up->fgrp);
+ else
+ p->fgrp = dupfgrp(nil);
+ }
+ else {
+ p->fgrp = up->fgrp;
+ incref(p->fgrp);
+ }
+
+ /* Process groups */
+ if(flag & (RFNAMEG|RFCNAMEG)) {
+ p->pgrp = newpgrp();
+ if(flag & RFNAMEG)
+ pgrpcpy(p->pgrp, up->pgrp);
+ /* inherit notallowed */
+ memmove(p->pgrp->notallowed, up->pgrp->notallowed, sizeof p->pgrp->notallowed);
+ }
+ else {
+ p->pgrp = up->pgrp;
+ incref(p->pgrp);
+ }
+ if(flag & RFNOMNT)
+ devmask(p->pgrp, 1, devs);
+
+ if(flag & RFREND)
+ p->rgrp = newrgrp();
+ else {
+ incref(up->rgrp);
+ p->rgrp = up->rgrp;
+ }
+
+ /* Environment group */
+ if(flag & (RFENVG|RFCENVG)) {
+ p->egrp = smalloc(sizeof(Egrp));
+ p->egrp->ref = 1;
+ if(flag & RFENVG)
+ envcpy(p->egrp, up->egrp);
+ }
+ else {
+ p->egrp = up->egrp;
+ incref(p->egrp);
+ }
+
+ procfork(p);
+
+ poperror(); /* abortion */
+
+ if((flag&RFNOWAIT) == 0){
+ p->parent = up;
+ lock(&up->exl);
+ up->nchild++;
+ unlock(&up->exl);
+ }
+
+ /*
+ * since the bss/data segments are now shareable,
+ * any mmu info about this process is now stale
+ * (i.e. has bad properties) and has to be discarded.
+ */
+ flushmmu();
+
+ procpriority(p, up->basepri, up->fixedpri);
+ if(up->wired)
+ procwired(p, up->affinity);
+
+ ready(p);
+ sched();
+ return pid;
+}
+
+/*
+ * Parse a "#! interpreter args" first line.  s/n is the start of
+ * the file; tokens are written to ap (nil-terminated, at most
+ * nap-1 of them).  Returns the token count, 0 if no newline was
+ * found within n bytes, or -1 if the buffer does not start with
+ * "#!".  Modifies s in place (tokenize).
+ */
+static int
+shargs(char *s, int n, char **ap, int nap)
+{
+	char *p;
+	int i;
+
+	if(n <= 2 || s[0] != '#' || s[1] != '!')
+		return -1;
+	s += 2;
+	n -= 2; /* skip #! */
+	if((p = memchr(s, '\n', n)) == nil)
+		return 0;
+	*p = 0;
+	i = tokenize(s, ap, nap-1);
+	ap[i] = nil;
+	return i;
+}
+
+/* Interpret the bytes of l as a big-endian 32-bit value. */
+ulong
+beswal(ulong l)
+{
+	uchar *p;
+
+	p = (uchar*)&l;
+	return (p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3];
+}
+
+/* Interpret the bytes of v as a big-endian 64-bit value. */
+uvlong
+beswav(uvlong v)
+{
+	uchar *p;
+
+	p = (uchar*)&v;
+	return ((uvlong)p[0]<<56) | ((uvlong)p[1]<<48) | ((uvlong)p[2]<<40)
+				  | ((uvlong)p[3]<<32) | ((uvlong)p[4]<<24)
+				  | ((uvlong)p[5]<<16) | ((uvlong)p[6]<<8)
+				  | (uvlong)p[7];
+}
+
+/*
+ * exec(2): replace the current process image with the file named by
+ * the first argument, passing it the vector given by the second.
+ * Accepts Plan 9 a.out images (including the extended 64-bit
+ * header), little-endian ELF64 images with exactly two PT_LOAD
+ * program headers (R+X text, R+W data), and "#!" scripts (one
+ * level of indirection).  On success it returns into the new image
+ * via execregs() and never returns to the caller.
+ */
+uintptr
+sysexec(va_list list)
+{
+	union {
+		struct {
+			Exec;
+			uvlong hdr[1];
+		} ehdr;
+		struct {
+			Elfhdr ehdr;
+			Elfphdr phdrs[2];
+		} elf;
+		char buf[256];
+	} u;
+	char line[256];
+	char *progarg[32+1];
+	volatile char *args, *elem, *file0;
+	char **argv, **argp, **argp0;
+	char *a, *e, *charp, *file;
+	int i, n, indir;
+	ulong magic, ssize, nargs, nbytes;
+	uintptr t, d, b, entry, text, data, bss, bssend, tstk, align;
+	uintptr textoff, dataoff;
+	Segment *s, *ts;
+	Image *img;
+	Tos *tos;
+	Chan *tc;
+	Fgrp *f;
+
+	textoff = 0;
+	dataoff = 0;
+	args = elem = nil;
+	file0 = va_arg(list, char*);
+	validaddr((uintptr)file0, 1, 0);
+	argp0 = va_arg(list, char**);
+	evenaddr((uintptr)argp0);
+	validaddr((uintptr)argp0, 2*BY2WD, 0);
+	if(*argp0 == nil)
+		error(Ebadarg);
+	file0 = validnamedup(file0, 1);
+	if(waserror()){
+		free(file0);
+		free(elem);
+		free(args);
+		/* Disaster after commit */
+		if(up->seg[SSEG] == nil)
+			pexit(up->errstr, 1);
+		nexterror();
+	}
+	align = BY2PG-1;
+	indir = 0;
+	file = file0;
+	for(;;){
+		tc = namec(file, Aopen, OEXEC, 0);
+		if(waserror()){
+			cclose(tc);
+			nexterror();
+		}
+		if(!indir)
+			kstrdup(&elem, up->genbuf);
+
+		n = devtab[tc->type]->read(tc, u.buf, sizeof(u.buf), 0);
+		if(n < 4)
+			error(Ebadexec);
+		magic = beswal(u.ehdr.magic);
+		if(magic == AOUT_MAGIC){
+			if(n < sizeof(Exec))
+				error(Ebadexec);
+			if(magic & HDR_MAGIC) {
+				if(n < sizeof(u.ehdr))
+					error(Ebadexec);
+				entry = beswav(u.ehdr.hdr[0]);
+				text = UTZERO+sizeof(u.ehdr);
+			} else {
+				entry = beswal(u.ehdr.entry);
+				text = UTZERO+sizeof(Exec);
+			}
+			if(entry < text)
+				error(Ebadexec);
+			text += beswal(u.ehdr.text);
+			dataoff = text-UTZERO;
+			if(text <= entry || text >= (USTKTOP-USTKSIZE))
+				error(Ebadexec);
+			data = beswal(u.ehdr.data);
+			bss = beswal(u.ehdr.bss);
+			switch(magic){
+			case S_MAGIC:	/* 2MB segment alignment for amd64 */
+				align = 0x1fffff;
+				break;
+			case P_MAGIC:	/* 16K segment alignment for spim */
+			case V_MAGIC:	/* 16K segment alignment for mips */
+				align = 0x3fff;
+				break;
+			case R_MAGIC:	/* 64K segment alignment for arm64 */
+				align = 0xffff;
+				break;
+			}
+			break; /* for binary */
+		}
+		if(magic == ELF_MAGIC){
+			if(n < sizeof(u.elf))
+				error(Ebadexec);
+			/*
+			 * Accept only little-endian ELF64 images laid out
+			 * exactly as our linker script emits them: the
+			 * program header table immediately follows the file
+			 * header and holds two PT_LOAD entries, R+X text
+			 * then R+W data.  u.elf.phdrs is only valid once
+			 * phoff/phentsize have been verified.
+			 */
+			if(u.elf.ehdr.ident[4] != 2 || u.elf.ehdr.ident[5] != 1)	/* ELFCLASS64, ELFDATA2LSB */
+				error(Ebadexec);
+			if(u.elf.ehdr.phoff != sizeof(Elfhdr) || u.elf.ehdr.phentsize != sizeof(Elfphdr))
+				error(Ebadexec);
+			if(u.elf.ehdr.phnum != 2)
+				error(Ebadexec);
+			if(u.elf.phdrs[0].type != PT_LOAD || u.elf.phdrs[1].type != PT_LOAD)
+				error(Ebadexec);
+			if(u.elf.phdrs[0].flags != (PF_X|PF_R) || u.elf.phdrs[1].flags != (PF_R|PF_W))
+				error(Ebadexec);
+			/* guard the bss subtraction below against underflow */
+			if(u.elf.phdrs[1].memsz < u.elf.phdrs[1].filesz)
+				error(Ebadexec);
+			entry = u.elf.ehdr.entry;
+			text = UTZERO+u.elf.phdrs[0].filesz;
+			textoff = u.elf.phdrs[0].off;
+			data = u.elf.phdrs[1].filesz;
+			dataoff = u.elf.phdrs[1].off;
+			bss = u.elf.phdrs[1].memsz - u.elf.phdrs[1].filesz;
+			/* entry must lie inside text, as in the a.out path */
+			if(entry < UTZERO || entry >= text || text >= (USTKTOP-USTKSIZE))
+				error(Ebadexec);
+			switch(u.elf.ehdr.machine){
+			case EM_X86_64:	/* 2MB segment alignment for amd64 */
+				align = 0x1fffff;
+				break;
+			case EM_AARCH64:	/* 64K segment alignment for arm64 */
+				align = 0xffff;
+				break;
+			}
+			break;
+		}
+
+		if(indir++)
+			error(Ebadexec);
+
+		/*
+		 * Process #! /bin/sh args ...
+		 */
+		memmove(line, u.buf, n);
+		n = shargs(line, n, progarg, nelem(progarg));
+		if(n < 1)
+			error(Ebadexec);
+		/*
+		 * First arg becomes complete file name
+		 */
+		progarg[n++] = file;
+		progarg[n] = nil;
+		argp0++;
+		file = progarg[0];
+		progarg[0] = elem;
+		poperror();
+		cclose(tc);
+	}
+
+	t = (text+align) & ~align;
+	text -= UTZERO;
+	align = BY2PG-1;
+	d = (t + data + align) & ~align;
+	bssend = t + data + bss;
+	b = (bssend + align) & ~align;
+	if(t >= (USTKTOP-USTKSIZE) || d >= (USTKTOP-USTKSIZE) || b >= (USTKTOP-USTKSIZE))
+		error(Ebadexec);
+
+	/*
+	 * Args: pass 1: count
+	 */
+	nbytes = sizeof(Tos);		/* hole for profiling clock at top of stack (and more) */
+	nargs = 0;
+	if(indir){
+		argp = progarg;
+		while(*argp != nil){
+			a = *argp++;
+			nbytes += strlen(a) + 1;
+			nargs++;
+		}
+	}
+	argp = argp0;
+	while(*argp != nil){
+		a = *argp++;
+		if(((uintptr)argp&(BY2PG-1)) < BY2WD)
+			validaddr((uintptr)argp, BY2WD, 0);
+		validaddr((uintptr)a, 1, 0);
+		e = vmemchr(a, 0, USTKSIZE);
+		if(e == nil)
+			error(Ebadarg);
+		nbytes += (e - a) + 1;
+		if(nbytes >= USTKSIZE)
+			error(Enovmem);
+		nargs++;
+	}
+	ssize = BY2WD*(nargs+1) + ((nbytes+(BY2WD-1)) & ~(BY2WD-1));
+
+	/*
+	 * 8-byte align SP for those (e.g. sparc) that need it.
+	 * execregs() will subtract another 4 bytes for argc.
+	 */
+	if(BY2WD == 4 && (ssize+4) & 7)
+		ssize += 4;
+
+	if(PGROUND(ssize) >= USTKSIZE)
+		error(Enovmem);
+
+	/*
+	 * Build the stack segment, putting it in kernel virtual for the moment
+	 */
+	qlock(&up->seglock);
+	if(waserror()){
+		s = up->seg[ESEG];
+		if(s != nil){
+			up->seg[ESEG] = nil;
+			putseg(s);
+		}
+		qunlock(&up->seglock);
+		nexterror();
+	}
+
+	s = up->seg[SSEG];
+	do {
+		tstk = s->base;
+		if(tstk <= USTKSIZE)
+			error(Enovmem);
+	} while((s = isoverlap(tstk-USTKSIZE, USTKSIZE)) != nil);
+	up->seg[ESEG] = newseg(SG_STACK | SG_NOEXEC, tstk-USTKSIZE, USTKSIZE/BY2PG);
+
+	/*
+	 * Args: pass 2: assemble; the pages will be faulted in
+	 */
+	tos = (Tos*)(tstk - sizeof(Tos));
+	tos->cyclefreq = m->cyclefreq;
+	tos->kcycles = 0;
+	tos->pcycles = 0;
+	tos->clock = 0;
+
+	argv = (char**)(tstk - ssize);
+	charp = (char*)(tstk - nbytes);
+	if(indir)
+		argp = progarg;
+	else
+		argp = argp0;
+
+	for(i=0; i<nargs; i++){
+		if(indir && *argp==nil) {
+			indir = 0;
+			argp = argp0;
+		}
+		*argv++ = charp + (USTKTOP-tstk);
+		a = *argp++;
+		if(indir)
+			e = strchr(a, 0);
+		else {
+			if(charp >= (char*)tos)
+				error(Ebadarg);
+			validaddr((uintptr)a, 1, 0);
+			e = vmemchr(a, 0, (char*)tos - charp);
+			if(e == nil)
+				error(Ebadarg);
+		}
+		n = (e - a) + 1;
+		memmove(charp, a, n);
+		charp += n;
+	}
+
+	/* copy args; easiest from new process's stack */
+	a = (char*)(tstk - nbytes);
+	n = charp - a;
+	if(n > 128)	/* don't waste too much space on huge arg lists */
+		n = 128;
+	args = smalloc(n);
+	memmove(args, a, n);
+	if(n>0 && args[n-1]!='\0'){
+		/* make sure last arg is NUL-terminated */
+		/* put NUL at UTF-8 character boundary */
+		for(i=n-1; i>0; --i)
+			if(fullrune(args+i, n-i))
+				break;
+		args[i] = 0;
+		n = i+1;
+	}
+
+	/*
+	 * Committed.
+	 * Free old memory.
+	 * Special segments are maintained across exec
+	 */
+	for(i = SSEG; i <= BSEG; i++) {
+		s = up->seg[i];
+		if(s != nil) {
+			/* prevent a second free if we have an error */
+			up->seg[i] = nil;
+			putseg(s);
+		}
+	}
+	for(i = ESEG+1; i < NSEG; i++) {
+		s = up->seg[i];
+		if(s != nil && (s->type&SG_CEXEC) != 0) {
+			up->seg[i] = nil;
+			putseg(s);
+		}
+	}
+
+	/* Text. Shared. Attaches to cache image if possible */
+	/* attachimage returns a locked cache image */
+	img = attachimage(SG_TEXT | SG_RONLY, tc, UTZERO, (t-UTZERO)>>PGSHIFT);
+	ts = img->s;
+	up->seg[TSEG] = ts;
+	ts->flushme = 1;
+	ts->fstart = textoff;
+	ts->flen = text;
+	unlock(img);
+
+	/* Data. Shared. */
+	s = newseg(SG_DATA, t, (d-t)>>PGSHIFT);
+	up->seg[DSEG] = s;
+
+	/* Attached by hand */
+	incref(img);
+	s->image = img;
+	s->fstart = dataoff;
+	s->flen = data;
+
+	/* BSS. Zero fill on demand */
+	up->seg[BSEG] = newseg(SG_BSS, d, (b-d)>>PGSHIFT);
+
+	/*
+	 * Move the stack
+	 */
+	s = up->seg[ESEG];
+	up->seg[ESEG] = nil;
+	s->base = USTKTOP-USTKSIZE;
+	s->top = USTKTOP;
+	relocateseg(s, USTKTOP-tstk);
+	up->seg[SSEG] = s;
+	qunlock(&up->seglock);
+	poperror();	/* seglock */
+
+	/*
+	 * Close on exec
+	 */
+	if((f = up->fgrp) != nil) {
+		for(i=0; i<=f->maxfd; i++)
+			fdclose(i, CCEXEC);
+	}
+
+	poperror();	/* tc */
+	cclose(tc);
+	poperror();	/* file0 */
+	free(file0);
+
+	qlock(&up->debug);
+	free(up->text);
+	up->text = elem;
+	free(up->args);
+	up->args = args;
+	up->nargs = n;
+	up->setargs = 0;
+
+	freenotes(up);
+	freenote(up->lastnote);
+	up->lastnote = nil;
+	up->notify = nil;
+	up->notified = 0;
+	up->ureg = nil;
+	up->privatemem = 0;
+	up->noswap = 0;
+	up->pcycles = -up->kentry;
+	procsetup(up);
+	qunlock(&up->debug);
+
+	up->errbuf0[0] = '\0';
+	up->errbuf1[0] = '\0';
+
+	/*
+	 * At this point, the mmu contains info about the old address
+	 * space and needs to be flushed
+	 */
+	flushmmu();
+
+	if(up->hang)
+		up->procctl = Proc_stopme;
+	return execregs(entry, ssize, nargs);
+}
+
+int
+return0(void*)
+{
+ return 0;
+}
+
+uintptr
+syssleep(va_list list)
+{
+ long ms;
+
+ ms = va_arg(list, long);
+ if(ms <= 0) {
+ if (up->edf != nil && (up->edf->flags & Admitted))
+ edfyield();
+ else
+ yield();
+ } else {
+ tsleep(&up->sleep, return0, 0, ms);
+ }
+ return 0;
+}
+
+uintptr
+sysalarm(va_list list)
+{
+ return procalarm(va_arg(list, ulong));
+}
+
+
+uintptr
+sysexits(va_list list)
+{
+ char *status;
+ char *inval = "invalid exit string";
+ char buf[ERRMAX];
+
+ status = va_arg(list, char*);
+ if(status != nil){
+ if(waserror())
+ status = inval;
+ else{
+ validaddr((uintptr)status, 1, 0);
+ if(vmemchr(status, 0, ERRMAX) == nil){
+ memmove(buf, status, ERRMAX);
+ buf[ERRMAX-1] = 0;
+ status = buf;
+ }
+ poperror();
+ }
+
+ }
+ pexit(status, 1);
+}
+
+uintptr
+sys_wait(va_list list)
+{
+ ulong pid;
+ Waitmsg w;
+ OWaitmsg *ow;
+
+ ow = va_arg(list, OWaitmsg*);
+ if(ow == nil)
+ pid = pwait(nil);
+ else {
+ validaddr((uintptr)ow, sizeof(OWaitmsg), 1);
+ evenaddr((uintptr)ow);
+ pid = pwait(&w);
+ }
+ if(ow != nil){
+ readnum(0, ow->pid, NUMSIZE, w.pid, NUMSIZE);
+ readnum(0, ow->time+TUser*NUMSIZE, NUMSIZE, w.time[TUser], NUMSIZE);
+ readnum(0, ow->time+TSys*NUMSIZE, NUMSIZE, w.time[TSys], NUMSIZE);
+ readnum(0, ow->time+TReal*NUMSIZE, NUMSIZE, w.time[TReal], NUMSIZE);
+ strncpy(ow->msg, w.msg, sizeof(ow->msg)-1);
+ ow->msg[sizeof(ow->msg)-1] = '\0';
+ }
+ return pid;
+}
+
+uintptr
+sysawait(va_list list)
+{
+ char *p;
+ Waitmsg w;
+ uint n;
+
+ p = va_arg(list, char*);
+ n = va_arg(list, uint);
+ validaddr((uintptr)p, n, 1);
+ pwait(&w);
+ return (uintptr)snprint(p, n, "%d %lud %lud %lud %q",
+ w.pid,
+ w.time[TUser], w.time[TSys], w.time[TReal],
+ w.msg);
+}
+
+void
+werrstr(char *fmt, ...)
+{
+ va_list va;
+
+ if(up == nil)
+ return;
+
+ va_start(va, fmt);
+ vseprint(up->syserrstr, up->syserrstr+ERRMAX, fmt, va);
+ va_end(va);
+}
+
+static int
+generrstr(char *buf, uint nbuf)
+{
+ char *err;
+
+ if(nbuf == 0)
+ error(Ebadarg);
+ if(nbuf > ERRMAX)
+ nbuf = ERRMAX;
+ validaddr((uintptr)buf, nbuf, 1);
+
+ err = up->errstr;
+ utfecpy(err, err+nbuf, buf);
+ utfecpy(buf, buf+nbuf, up->syserrstr);
+
+ up->errstr = up->syserrstr;
+ up->syserrstr = err;
+
+ return 0;
+}
+
+uintptr
+syserrstr(va_list list)
+{
+ char *buf;
+ uint len;
+
+ buf = va_arg(list, char*);
+ len = va_arg(list, uint);
+ return (uintptr)generrstr(buf, len);
+}
+
+/* compatibility for old binaries */
+uintptr
+sys_errstr(va_list list)
+{
+ return (uintptr)generrstr(va_arg(list, char*), 64);
+}
+
+uintptr
+sysnotify(va_list list)
+{
+ int (*f)(void*, char*);
+ f = va_arg(list, void*);
+ if(f != nil)
+ validaddr((uintptr)f, sizeof(void*), 0);
+ up->notify = f;
+ return 0;
+}
+
+uintptr
+sysnoted(va_list list)
+{
+ if(va_arg(list, int) != NRSTR && !up->notified)
+ error(Egreg);
+ return 0;
+}
+
+uintptr
+syssegbrk(va_list list)
+{
+ int i;
+ uintptr addr;
+ Segment *s;
+
+ addr = va_arg(list, uintptr);
+ for(i = 0; i < NSEG; i++) {
+ s = up->seg[i];
+ if(s == nil || addr < s->base || addr >= s->top)
+ continue;
+ switch(s->type&SG_TYPE) {
+ case SG_TEXT:
+ case SG_DATA:
+ case SG_STACK:
+ case SG_PHYSICAL:
+ case SG_FIXED:
+ case SG_STICKY:
+ error(Ebadarg);
+ default:
+ return ibrk(va_arg(list, uintptr), i);
+ }
+ }
+ error(Ebadarg);
+}
+
+uintptr
+syssegattach(va_list list)
+{
+ int attr;
+ char *name;
+ uintptr va;
+ ulong len;
+
+ attr = va_arg(list, int);
+ name = va_arg(list, char*);
+ va = va_arg(list, uintptr);
+ len = va_arg(list, ulong);
+ validaddr((uintptr)name, 1, 0);
+ name = validnamedup(name, 1);
+ if(waserror()){
+ free(name);
+ nexterror();
+ }
+ va = segattach(attr, name, va, len);
+ free(name);
+ poperror();
+ return va;
+}
+
+uintptr
+syssegdetach(va_list list)
+{
+ int i;
+ uintptr addr;
+ Segment *s;
+
+ addr = va_arg(list, uintptr);
+
+ qlock(&up->seglock);
+ if(waserror()){
+ qunlock(&up->seglock);
+ nexterror();
+ }
+
+ for(i = 0; i < NSEG; i++)
+ if((s = up->seg[i]) != nil) {
+ qlock(s);
+ if((addr >= s->base && addr < s->top) ||
+ (s->top == s->base && addr == s->base))
+ goto found;
+ qunlock(s);
+ }
+
+ error(Ebadarg);
+
+found:
+ /*
+ * Check we are not detaching the initial stack segment.
+ */
+ if(s == up->seg[SSEG]){
+ qunlock(s);
+ error(Ebadarg);
+ }
+ up->seg[i] = nil;
+ qunlock(s);
+ putseg(s);
+ qunlock(&up->seglock);
+ poperror();
+
+ /* Ensure we flush any entries from the lost segment */
+ flushmmu();
+ return 0;
+}
+
+uintptr
+syssegfree(va_list list)
+{
+ Segment *s;
+ uintptr from, to;
+
+ from = va_arg(list, uintptr);
+ to = va_arg(list, ulong);
+ to += from;
+ if(to < from)
+ error(Ebadarg);
+ s = seg(up, from, 1);
+ if(s == nil)
+ error(Ebadarg);
+ to &= ~(BY2PG-1);
+ from = PGROUND(from);
+ if(from >= to) {
+ qunlock(s);
+ return 0;
+ }
+ if(to > s->top) {
+ qunlock(s);
+ error(Ebadarg);
+ }
+ mfreeseg(s, from, (to - from) / BY2PG);
+ qunlock(s);
+ flushmmu();
+ return 0;
+}
+
+/* For binary compatibility */
+uintptr
+sysbrk_(va_list list)
+{
+ return ibrk(va_arg(list, uintptr), BSEG);
+}
+
+uintptr
+sysrendezvous(va_list list)
+{
+ uintptr tag, val, new;
+ Proc *p, **l;
+
+ tag = va_arg(list, uintptr);
+ new = va_arg(list, uintptr);
+ l = &REND(up->rgrp, tag);
+
+ lock(up->rgrp);
+ for(p = *l; p != nil; p = p->rendhash) {
+ if(p->rendtag == tag) {
+ *l = p->rendhash;
+ val = p->rendval;
+ p->rendval = new;
+ unlock(up->rgrp);
+
+ ready(p);
+
+ return val;
+ }
+ l = &p->rendhash;
+ }
+
+ /* Going to sleep here */
+ up->rendtag = tag;
+ up->rendval = new;
+ up->rendhash = *l;
+ *l = up;
+ up->state = Rendezvous;
+ unlock(up->rgrp);
+
+ sched();
+
+ return up->rendval;
+}
+
+/*
+ * The implementation of semaphores is complicated by needing
+ * to avoid rescheduling in syssemrelease, so that it is safe
+ * to call from real-time processes. This means syssemrelease
+ * cannot acquire any qlocks, only spin locks.
+ *
+ * Semacquire and semrelease must both manipulate the semaphore
+ * wait list. Lock-free linked lists only exist in theory, not
+ * in practice, so the wait list is protected by a spin lock.
+ *
+ * The semaphore value *addr is stored in user memory, so it
+ * cannot be read or written while holding spin locks.
+ *
+ * Thus, we can access the list only when holding the lock, and
+ * we can access the semaphore only when not holding the lock.
+ * This makes things interesting. Note that sleep's condition function
+ * is called while holding two locks - r and up->rlock - so it cannot
+ * access the semaphore value either.
+ *
+ * An acquirer announces its intention to try for the semaphore
+ * by putting a Sema structure onto the wait list and then
+ * setting Sema.waiting. After one last check of semaphore,
+ * the acquirer sleeps until Sema.waiting==0. A releaser of n
+ * must wake up n acquirers who have Sema.waiting set. It does
+ * this by clearing Sema.waiting and then calling wakeup.
+ *
+ * There are three interesting races here.
+
+ * The first is that in this particular sleep/wakeup usage, a single
+ * wakeup can rouse a process from two consecutive sleeps!
+ * The ordering is:
+ *
+ * (a) set Sema.waiting = 1
+ * (a) call sleep
+ * (b) set Sema.waiting = 0
+ * (a) check Sema.waiting inside sleep, return w/o sleeping
+ * (a) try for semaphore, fail
+ * (a) set Sema.waiting = 1
+ * (a) call sleep
+ * (b) call wakeup(a)
+ * (a) wake up again
+ *
+ * This is okay - semacquire will just go around the loop
+ * again. It does mean that at the top of the for(;;) loop in
+ * semacquire, phore.waiting might already be set to 1.
+ *
+ * The second is that a releaser might wake an acquirer who is
+ * interrupted before he can acquire the lock. Since
+ * release(n) issues only n wakeup calls -- only n can be used
+ * anyway -- if the interrupted process is not going to use his
+ * wakeup call he must pass it on to another acquirer.
+ *
+ * The third race is similar to the second but more subtle. An
+ * acquirer sets waiting=1 and then does a final canacquire()
+ * before going to sleep. The opposite order would result in
+ * missing wakeups that happen between canacquire and
+ * waiting=1. (In fact, the whole point of Sema.waiting is to
+ * avoid missing wakeups between canacquire() and sleep().) But
+ * there can be spurious wakeups between a successful
+ * canacquire() and the following semdequeue(). This wakeup is
+ * not useful to the acquirer, since he has already acquired
+ * the semaphore. Like in the previous case, though, the
+ * acquirer must pass the wakeup call along.
+ *
+ * This is all rather subtle. The code below has been verified
+ * with the spin model /sys/src/9/port/semaphore.p. The
+ * original code anticipated the second race but not the first
+ * or third, which were caught only with spin. The first race
+ * is mentioned in /sys/doc/sleep.ps, but I'd forgotten about it.
+ * It was lucky that my abstract model of sleep/wakeup still managed
+ * to preserve that behavior.
+ *
+ * I remain slightly concerned about memory coherence
+ * outside of locks. The spin model does not take
+ * queued processor writes into account so we have to
+ * think hard. The only variables accessed outside locks
+ * are the semaphore value itself and the boolean flag
+ * Sema.waiting. The value is only accessed with cmpswap,
+ * whose job description includes doing the right thing as
+ * far as memory coherence across processors. That leaves
+ * Sema.waiting. To handle it, we call coherence() before each
+ * read and after each write. - rsc
+ */
+
+/* Add semaphore p with addr a to list in seg. */
+static void
+semqueue(Segment *s, long *a, Sema *p)
+{
+ memset(p, 0, sizeof *p);
+ p->addr = a;
+ lock(&s->sema); /* uses s->sema.Rendez.Lock, but no one else is */
+ p->next = &s->sema;
+ p->prev = s->sema.prev;
+ p->next->prev = p;
+ p->prev->next = p;
+ unlock(&s->sema);
+}
+
+/* Remove semaphore p from list in seg. */
+static void
+semdequeue(Segment *s, Sema *p)
+{
+ lock(&s->sema);
+ p->next->prev = p->prev;
+ p->prev->next = p->next;
+ unlock(&s->sema);
+}
+
+/* Wake up n waiters with addr a on list in seg. */
+static void
+semwakeup(Segment *s, long *a, long n)
+{
+ Sema *p;
+
+ lock(&s->sema);
+ for(p=s->sema.next; p!=&s->sema && n>0; p=p->next){
+ if(p->addr == a && p->waiting){
+ p->waiting = 0;
+ coherence();
+ wakeup(p);
+ n--;
+ }
+ }
+ unlock(&s->sema);
+}
+
+/* Add delta to semaphore and wake up waiters as appropriate. */
+static long
+semrelease(Segment *s, long *addr, long delta)
+{
+ long value;
+
+ do
+ value = *addr;
+ while(!cmpswap(addr, value, value+delta));
+ semwakeup(s, addr, delta);
+ return value+delta;
+}
+
+/* Try to acquire semaphore using compare-and-swap */
+static int
+canacquire(long *addr)
+{
+ long value;
+
+ while((value=*addr) > 0)
+ if(cmpswap(addr, value, value-1))
+ return 1;
+ return 0;
+}
+
+/* Should we wake up? */
+static int
+semawoke(void *p)
+{
+ coherence();
+ return !((Sema*)p)->waiting;
+}
+
+/* Acquire semaphore (subtract 1). */
+static int
+semacquire(Segment *s, long *addr, int block)
+{
+ int acquired;
+ Sema phore;
+
+ if(canacquire(addr))
+ return 1;
+ if(!block)
+ return 0;
+ semqueue(s, addr, &phore);
+ if(acquired = !waserror()){
+ for(;;){
+ phore.waiting = 1;
+ coherence();
+ if(canacquire(addr))
+ break;
+ sleep(&phore, semawoke, &phore);
+ }
+ poperror();
+ }
+ semdequeue(s, &phore);
+ coherence(); /* not strictly necessary due to lock in semdequeue */
+ if(!phore.waiting)
+ semwakeup(s, addr, 1);
+ if(!acquired)
+ nexterror();
+ return 1;
+}
+
+/* Acquire semaphore or time-out */
+static int
+tsemacquire(Segment *s, long *addr, ulong ms)
+{
+ int timedout, acquired;
+ ulong t;
+ Sema phore;
+
+ if(canacquire(addr))
+ return 1;
+ if(ms == 0)
+ return 0;
+ timedout = 0;
+ semqueue(s, addr, &phore);
+ if(acquired = !waserror()){
+ for(;;){
+ phore.waiting = 1;
+ coherence();
+ if(canacquire(addr))
+ break;
+ t = MACHP(0)->ticks;
+ tsleep(&phore, semawoke, &phore, ms);
+ t = TK2MS(MACHP(0)->ticks - t);
+ if(t >= ms){
+ timedout = 1;
+ break;
+ }
+ ms -= t;
+ }
+ poperror();
+ }
+ semdequeue(s, &phore);
+ coherence(); /* not strictly necessary due to lock in semdequeue */
+ if(!phore.waiting)
+ semwakeup(s, addr, 1);
+ if(!acquired)
+ nexterror();
+ return !timedout;
+}
+
+uintptr
+syssemacquire(va_list list)
+{
+ int block;
+ long *addr;
+ Segment *s;
+
+ addr = va_arg(list, long*);
+ block = va_arg(list, int);
+ evenaddr((uintptr)addr);
+ s = seg(up, (uintptr)addr, 0);
+ if(s == nil || (s->type&SG_RONLY) != 0 || (uintptr)addr+sizeof(long) > s->top){
+ validaddr((uintptr)addr, sizeof(long), 1);
+ error(Ebadarg);
+ }
+ if(*addr < 0)
+ error(Ebadarg);
+ return (uintptr)semacquire(s, addr, block);
+}
+
+uintptr
+systsemacquire(va_list list)
+{
+ long *addr;
+ ulong ms;
+ Segment *s;
+
+ addr = va_arg(list, long*);
+ ms = va_arg(list, ulong);
+ evenaddr((uintptr)addr);
+ s = seg(up, (uintptr)addr, 0);
+ if(s == nil || (s->type&SG_RONLY) != 0 || (uintptr)addr+sizeof(long) > s->top){
+ validaddr((uintptr)addr, sizeof(long), 1);
+ error(Ebadarg);
+ }
+ if(*addr < 0)
+ error(Ebadarg);
+ return (uintptr)tsemacquire(s, addr, ms);
+}
+
+uintptr
+syssemrelease(va_list list)
+{
+ long *addr, delta;
+ Segment *s;
+
+ addr = va_arg(list, long*);
+ delta = va_arg(list, long);
+ evenaddr((uintptr)addr);
+ s = seg(up, (uintptr)addr, 0);
+ if(s == nil || (s->type&SG_RONLY) != 0 || (uintptr)addr+sizeof(long) > s->top){
+ validaddr((uintptr)addr, sizeof(long), 1);
+ error(Ebadarg);
+ }
+ /* delta == 0 is a no-op, not a release */
+ if(delta < 0 || *addr < 0)
+ error(Ebadarg);
+ return (uintptr)semrelease(s, addr, delta);
+}
+
+/* For binary compatibility */
+uintptr
+sys_nsec(va_list list)
+{
+ vlong *v;
+
+ /* return in register on 64bit machine */
+ if(sizeof(uintptr) == sizeof(vlong)){
+ USED(list);
+ return (uintptr)todget(nil);
+ }
+
+ v = va_arg(list, vlong*);
+ evenaddr((uintptr)v);
+ validaddr((uintptr)v, sizeof(vlong), 1);
+ *v = todget(nil);
+ return 0;
+}