/*
* This file is part of Jehanne.
*
* Copyright (C) 2015-2016 Giacomo Tesio <giacomo@tesio.it>
*
* Jehanne is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, version 2 of the License.
*
* Jehanne is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Jehanne. If not, see <http://www.gnu.org/licenses/>.
*/
#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "../port/error.h"
#include <ptrace.h>
int
sysrfork(uint32_t flag)
{
Proc *p;
int i, pid;
ProcSegment *s;
Fgrp *ofg;
Pgrp *opg;
Rgrp *org;
Egrp *oeg;
Mach *wm;
uintptr_t ds;
void (*pt)(Proc*, int, int64_t, int64_t);
uint64_t ptarg;
/* Check flags before we commit */
if((flag & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
error(Ebadarg);
if((flag & (RFNAMEG|RFCNAMEG)) == (RFNAMEG|RFCNAMEG))
error(Ebadarg);
if((flag & (RFENVG|RFCENVG)) == (RFENVG|RFCENVG))
error(Ebadarg);
if((flag&RFPROC) == 0) {
if(flag & (RFMEM|RFNOWAIT))
error(Ebadarg);
if(flag & (RFFDG|RFCFDG)) {
ofg = up->fgrp;
if(flag & RFFDG)
up->fgrp = dupfgrp(ofg);
else
up->fgrp = dupfgrp(nil);
closefgrp(ofg);
}
if(flag & (RFNAMEG|RFCNAMEG)) {
opg = up->pgrp;
up->pgrp = newpgrp();
if(flag & RFNAMEG)
pgrpcpy(up->pgrp, opg);
/* inherit noattach */
up->pgrp->noattach = opg->noattach;
closepgrp(opg);
}
if(flag & RFNOMNT)
up->pgrp->noattach = 1;
if(flag & RFREND) {
org = up->rgrp;
up->rgrp = newrgrp();
closergrp(org);
}
if(flag & (RFENVG|RFCENVG)) {
oeg = up->egrp;
up->egrp = smalloc(sizeof(Egrp));
up->egrp->r.ref = 1;
if(flag & RFENVG)
envcpy(up->egrp, oeg);
closeegrp(oeg);
}
if(flag & RFNOTEG)
up->noteid = incref(&noteidalloc);
return 0;
}
if((flag & RFMEM) == 0){
/* assume half might change copy-on-write, but cap it */
ds = procdatasize(up, 1)/2;
if(ds > 64*MB)
ds = 64*MB;
if(!umem_available(ds))
error(Enovmem);
}
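/* for example (illustrative arithmetic, not a guarantee): a parent
* holding 200*MB of data would need 64*MB of free user memory
* (200/2, capped at 64) before the fork proceeds, while one
* holding 20*MB would need only 10*MB.
*/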
p = newproc();
p->trace = up->trace;
p->scallnr = up->scallnr;
memmove(p->arg, up->arg, sizeof(up->arg));
p->nerrlab = 0;
p->slash = up->slash;
p->dot = up->dot;
incref(&p->dot->r);
memmove(p->note, up->note, sizeof(p->note));
p->privatemem = up->privatemem;
p->nnote = up->nnote;
p->notified = 0;
p->lastnote = up->lastnote;
p->notify = up->notify;
p->ureg = up->ureg;
p->dbgreg = 0;
/* Make a new set of memory segments */
i = -1;
MLG("flag %d: RFMEM %d", flag, flag&RFMEM);
rlock(&up->seglock);
if(waserror()){
while(i >= 0){
if(p->seg[i]){
segment_release(&p->seg[i]);
}
--i;
}
memset(p->seg, 0, NSEG*sizeof(ProcSegment*));
runlock(&up->seglock);
nexterror();
}
memmove(p->seg, up->seg, NSEG*sizeof(ProcSegment*));
if(flag & RFMEM){
for(i = 0; i < NSEG; i++){
s = p->seg[i];
if(s && !segment_share(&p->seg[i])){
--i; /* p->seg[i] was not allocated, do not release */
error(Enovmem);
}
}
} else {
for(i = 0; i < NSEG; i++){
s = p->seg[i];
if(s && !segment_fork(&p->seg[i])){
--i; /* p->seg[i] was not allocated, do not release */
error(Enovmem);
}
}
}
poperror();
runlock(&up->seglock);
/* File descriptors */
if(flag & (RFFDG|RFCFDG)) {
if(flag & RFFDG)
p->fgrp = dupfgrp(up->fgrp);
else
p->fgrp = dupfgrp(nil);
}
else {
p->fgrp = up->fgrp;
incref(&p->fgrp->r);
}
/* Process groups */
if(flag & (RFNAMEG|RFCNAMEG)) {
p->pgrp = newpgrp();
if(flag & RFNAMEG)
pgrpcpy(p->pgrp, up->pgrp);
/* inherit noattach */
p->pgrp->noattach = up->pgrp->noattach;
}
else {
p->pgrp = up->pgrp;
incref(&p->pgrp->r);
}
if(flag & RFNOMNT)
p->pgrp->noattach = 1;
if(flag & RFREND)
p->rgrp = newrgrp();
else {
incref(&up->rgrp->r);
p->rgrp = up->rgrp;
}
/* Environment group */
if(flag & (RFENVG|RFCENVG)) {
p->egrp = smalloc(sizeof(Egrp));
p->egrp->r.ref = 1;
if(flag & RFENVG)
envcpy(p->egrp, up->egrp);
}
else {
p->egrp = up->egrp;
incref(&p->egrp->r);
}
p->hang = up->hang;
p->procmode = up->procmode;
/* Craft a return frame which will cause the child to pop out of
* the scheduler in user mode with the return register zero
*/
sysrforkchild(p, up);
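/* From here on the child is a separate schedulable process: the
* parent returns the new pid at the bottom of this function,
* while the crafted frame makes the child's rfork return 0 once
* it is scheduled. Illustrative user-space pattern (library
* wrapper name assumed):
*
*	switch(rfork(RFPROC|RFFDG)){
*	case -1: error case
*	case 0:  running in the child
*	default: running in the parent, value is the child's pid
*	}
*/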
p->parent = up;
p->parentpid = up->pid;
if(flag&RFNOWAIT)
p->parentpid = 0;
else {
lock(&up->exl);
up->nchild++;
unlock(&up->exl);
}
if((flag&RFNOTEG) == 0)
p->noteid = up->noteid;
pid = p->pid;
memset(p->time, 0, sizeof(p->time));
p->time[TReal] = sys->ticks;
kstrdup(&p->text, up->text);
kstrdup(&p->user, up->user);
/*
* since the bss/data segments are now shareable,
* any mmu info about this process is now stale
* (i.e. has bad properties) and has to be discarded.
*/
mmuflush();
p->basepri = up->basepri;
p->priority = up->basepri;
p->fixedpri = up->fixedpri;
p->mp = up->mp;
wm = up->wired;
if(wm)
procwired(p, wm->machno);
if(p->trace && (pt = proctrace) != nil){
strncpy((char*)&ptarg, p->text, sizeof ptarg);
pt(p, SName, 0, ptarg);
}
ready(p);
sched();
return pid;
}
uintptr_t
sysexec(char* p, char **argv)
{
Ldseg *ldseg, *txtseg, *dataseg;
Fgrp *f;
Chan *chan;
ImagePointer img;
ElfSegPointer load_segments[NLOAD];
ProcSegment *s, *ts, *ds, *bs, *es;
PagePointer page, argvpage;
int argc, progargc, i, j, n, nldseg;
char *a, **argvcopy, *elem, *file;
char line[64], *progarg[sizeof(line)/2+1];
long hdrsz;
uintptr_t entry, stack, sbottom, argsize, tmp;
void (*pt)(Proc*, int, int64_t, int64_t);
uint64_t ptarg;
/*
* Open the file, remembering the final element and the full name.
*/
elem = nil;
p = validaddr(p, 1, 0);
file = validnamedup(p, 1);
MLG("file %d", file);
if(waserror()){
free(file);
nexterror();
}
chan = namec(file, Aopen, OEXEC, 0);
if(waserror()){
if(chan)
cclose(chan);
if(elem != nil)
free(elem);
nexterror();
}
kstrdup(&elem, up->genbuf);
/*
* Read the header.
* If it's a #!, fill in progarg[] with info then read a new header
* from the file indicated by the #!.
* The #! line must fit within sizeof(line) bytes,
* including the terminating \n.
*/
hdrsz = chan->dev->read(chan, &line, sizeof(line), 0);
if(hdrsz < 2)
error(Ebadexec);
argc = 0;
progargc = 0;
if(line[0] == '#' && line[1] == '!'){
p = memchr(line, '\n', MIN(sizeof line, hdrsz));
if(p == nil)
error(Ebadexec);
*p = '\0';
argc = tokenize(line+2, progarg, nelem(progarg));
if(argc == 0)
error(Ebadexec);
/* The original file becomes an extra arg after #! line */
progarg[argc++] = file;
progargc = argc;
/*
* Take the #! $0 as a file to open, and replace
* $0 with the original path's name.
*/
p = progarg[0];
progarg[0] = elem;
cclose(chan);
chan = nil; /* if the following namec() call fails,
* the previous waserror() would cclose(chan):
* this assignment lets it skip that close,
* since chan has just been closed.
*/
chan = namec(p, Aopen, OEXEC, 0);
}
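/* Illustrative example (paths assumed): exec'ing /rc/bin/hello whose
* first line is "#!/cmd/rc -e", with argv = {"hello", "x", nil},
* reopens /cmd/rc for execution with
*	progarg = {"hello", "-e", "/rc/bin/hello"}
* i.e. $0 becomes the script's final path element, the script's
* full path is appended, and the caller's argv[0] is dropped
* below in favour of this replacement; the remaining caller
* arguments ("x") follow.
*/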
/*
* #! has had its chance, now we need a real binary.
*/
nldseg = elf64ldseg(chan, &entry, &ldseg, cputype, PGSZ);
if(nldseg != 2){
// print("exec: elf64ldseg returned %d segs!\n", nldseg);
error(Ebadexec);
}
txtseg = ldseg;
dataseg = ldseg+1;
/*
* The new stack will contain, in descending address order:
* - argument strings;
* - array of pointers to the argument strings with a
* terminating nil (argv).
* - argc
* When the exec is committed, this temporary stack in es will
* become SSEG.
* The architecture-dependent code which jumps to the new image
* will also push a count of the argument array onto the stack (argc).
*/
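/* A sketch of the resulting layout in es (addresses decrease
* going down the list; exact padding depends on sysexecstack
* and on the page-boundary adjustment below):
*
*	USTKTOP:		argument strings, copied top-down
*				argv[argc] = nil
*				argv[argc-1]
*				...
*				argv[0]
*	es->top - argsize:	argc
*/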
es = nil; /* exec new stack */
if(!segment_virtual(&es, SgStack, SgRead|SgWrite, 0, USTKTOP-USTKSIZE, USTKTOP))
error(Enovmem);
if(waserror()){
segment_release(&es);
nexterror();
}
/* Step 0: Compute the total size and number of arguments */
argsize = 0;
/* start with arguments found from a #! header. */
for(i = 0; i < argc; i++)
argsize += strlen(progarg[i]) + 1 + sizeof(char*);
/* then size the strings pointed to by the syscall argument
* argv, verifying that both argv and the strings it
* points to are valid.
*/
argvcopy = argv;
evenaddr(PTR2UINT(argvcopy));
for(i = 0;; i++, argvcopy++){
a = *(char**)validaddr(argvcopy, sizeof(char**), 0);
if(a == nil)
break;
a = validaddr(a, 1, 0);
n = ((char*)vmemchr(a, 0, 0x7fffffff) - a) + 1;
/* This futzing is so argv[0] gets validated even
* though it will be thrown away if this is a shell
* script.
*/
if(argc > 0 && i == 0)
continue;
argsize += n + sizeof(char*);
//print("argv[%d] = %s, argsize %d, n %d\n", argc, a, argsize, n);
argc++;
}
if(argc < 1)
error(Ebadexec);
argsize += sizeof(char*); /* place for argv[argc] = nil */
argsize += sizeof(uintptr_t); /* place for argc */
tmp = es->top - argsize;
if((tmp&~(PGSZ-1)) != ((tmp+sizeof(uintptr_t)+sizeof(char*)*(argc+1))&~(PGSZ-1))){
/* the argument pointers cross a page boundary, keep
* them all in the same page
*/
tmp -= (tmp+sizeof(uintptr_t)+sizeof(char*)*(argc+1))&(PGSZ-1);
}
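/* Keeping argc and the argv pointers within one page means the
* code below can write them through a single page_kmap() of
* argvpage, without the page-crossing handling needed for the
* argument strings.
*/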
tmp = sysexecstack(tmp, argc);
argsize += es->top - argsize - tmp;
/* Step 1: Fault enough pages in the new stack */
stack = es->top;
while(stack > es->top - argsize){
stack -= PGSZ;
if(!segment_fault(&tmp, &stack, es, FaultWrite))
error(Enovmem);
}
//print("argsize %d, first stack page %d\n", argsize, stack);
/* Step 2: Copy arguments into pages in descending order */
/* prepare argvcopy to point to the right location */
tmp = es->top - argsize;
argvpage = segment_page(es, tmp);
char *apmem;
if(argvpage == 0)
panic("sysexec: segment_fault did not allocate enough pages");
apmem = page_kmap(argvpage);
argvcopy = (char**)((uintptr_t)(apmem + (tmp&(PGSZ-1))));
/* add argc */
*((uintptr_t*)argvcopy) = argc;
++argvcopy;
/* prepare pmem to point to the last stack page */
char *pmem;
stack = es->top;
sbottom = es->top;
sbottom -= PGSZ;
page = segment_page(es, sbottom);
if(page == 0)
panic("sysexec: segment_fault did not allocate enough pages");
pmem = page_kmap(page);
/* start filling pmem (from the end) and argvcopy
* (from the beginning) with the arguments found
* in the #! header.
*/
for(i = 0; i < progargc; i++){
n = strlen(progarg[i])+1;
CopyProgArgument:
if(sbottom <= stack-n){
a = pmem+((stack-n)&(PGSZ-1));
memmove(a, progarg[i], n);
stack -= n;
} else {
/* the current argument crosses multiple pages */
if(stack&(PGSZ-1)){
/* fill the rest of the current page */
memmove(pmem, progarg[i]+n-1-(stack&(PGSZ-1)), (stack&(PGSZ-1))-1);
*(pmem+(stack&(PGSZ-1))-1) = 0;
n -= (stack&(PGSZ-1));
stack -= (stack&(PGSZ-1));
}
while(sbottom > stack - n){
page_kunmap(page, &pmem);
sbottom -= PGSZ;
page = segment_page(es, sbottom);
if(page == 0)
panic("sysexec: segment_fault did not allocate enough pages");
pmem = page_kmap(page);
if(n > PGSZ){
/* fill one full page */
memmove(pmem, progarg[i]+n-PGSZ, PGSZ);
n -= PGSZ;
stack -= PGSZ;
}
}
goto CopyProgArgument;
}
argvcopy[i] = (char*)stack;
}
if(progargc > 0){
/* we are in a script: argv[0] has been replaced in
* progarg and already copied, so we need to skip
* it and add any further elements from argv.
*/
--progargc;
}
/* continue filling pmem (descending) and argvcopy
* (from the current point) with exec arguments
*/
for(; i < argc; i++){
j = i - progargc;
n = strlen(argv[j])+1;
CopyExecArgument:
if(sbottom <= stack-n){
a = pmem+((stack-n)&(PGSZ-1));
memmove(a, argv[j], n);
stack -= n;
} else {
/* the current argument crosses multiple pages */
if(stack&(PGSZ-1)){
/* fill the rest of the current page */
memmove(pmem, argv[j]+n-1-(stack&(PGSZ-1)), (stack&(PGSZ-1))-1);
*(pmem+(stack&(PGSZ-1))-1) = 0;
n -= (stack&(PGSZ-1));
stack -= (stack&(PGSZ-1));
}
while(sbottom > stack - n){
page_kunmap(page, &pmem);
sbottom -= PGSZ;
page = segment_page(es, sbottom);
if(page == 0)
panic("sysexec: segment_fault did not allocate enough pages");
pmem = page_kmap(page);
if(n > PGSZ){
/* fill one full page */
memmove(pmem, argv[j]+n-PGSZ, PGSZ);
n -= PGSZ;
stack -= PGSZ;
}
}
goto CopyExecArgument;
}
argvcopy[i] = (char*)stack;
INSPECT(stack);
INSPECT(pmem);
}
argvcopy[i] = nil; /* terminating nil */
page_kunmap(page, &pmem);
page_kunmap(argvpage, &apmem);
INSPECT(argvcopy);
/*
* All the argument processing is now done, ready for the image.
*/
/* build image for file */
if(!image_attach(&img, chan, ldseg))
error(Enovmem);
if(waserror()){
image_release(img);
nexterror();
}
image_segments(load_segments, img);
ts = nil;
if(!segment_load(&ts, load_segments[0], txtseg))
error(Enovmem);
if(waserror()){
segment_release(&ts);
nexterror();
}
ds = nil;
if(!segment_load(&ds, load_segments[1], dataseg))
error(Enovmem);
if(waserror()){
segment_release(&ds);
nexterror();
}
bs = nil;
tmp = dataseg->pg0vaddr + dataseg->pg0off + dataseg->memsz;
if(tmp < ds->top)
tmp = ds->top;
if(!segment_virtual(&bs, SgBSS, SgRead|SgWrite, 0,
ds->top,
tmp))
error(Enovmem);
free(ldseg); /* free elf segments */
/*
* Close on exec
*/
f = up->fgrp;
for(i=0; i<=f->maxfd; i++)
fdclose(i, CCEXEC);
wlock(&up->seglock);
if(waserror()){
wunlock(&up->seglock);
nexterror();
}
/*
* Free old memory.
* Special segments are maintained across exec,
* unless they are marked SgCExec.
*/
for(i = SSEG; i <= BSEG; i++) {
if(up->seg[i])
segment_release(&up->seg[i]);
}
for(i = BSEG+1; i< NSEG; i++) {
s = up->seg[i];
if(s && (s->flags&SgCExec))
segment_release(&up->seg[i]);
}
if(up->trace && (pt = proctrace) != nil){
strncpy((char*)&ptarg, elem, sizeof ptarg);
pt(up, SName, 0, ptarg);
}
/*
* At this point, the mmu contains info about the old address
* space and needs to be flushed
*/
mmuflush();
up->seg[SSEG] = es;
up->seg[TSEG] = ts;
up->seg[DSEG] = ds;
up->seg[BSEG] = bs;
poperror(); /* ds */
poperror(); /* ts */
poperror(); /* es */
poperror(); /* img */
image_release(img);
free(up->text);
up->text = elem;
elem = nil;
if(up->setargs) /* setargs == 0 => args in stack from sysexec */
free(up->args);
up->args = argvcopy;
up->nargs = argc;
up->setargs = 0;
if(up->parentpid == 0){
/* this is *init* replacing itself: we set it to 1
* so that we can say that all processes started from
* an image have a parentpid (up->parent is still 0)
* (see segment_release)
*/
up->parentpid = 1;
}
// poperror(); /* p (up->args) */
poperror(); /* seglock */
wunlock(&up->seglock);
/*
* '/' processes are higher priority. (TO DO: really?)
*/
if(chan->dev->dc == L'/')
up->basepri = PriRoot;
up->priority = up->basepri;
poperror(); /* chan, elem */
cclose(chan);
poperror(); /* file */
free(file);
qlock(&up->debug);
up->nnote = 0;
up->notify = 0;
up->notified = 0;
up->privatemem = 0;
sysprocsetup(up);
qunlock(&up->debug);
if(up->hang)
up->procctl = Proc_stopme;
return (uintptr_t)sysexecregs(entry, argsize);
}
int
return0(void* _1)
{
return 0;
}
long
sysalarm(unsigned long millisecs)
{
return procalarm(millisecs);
}
long
sysawake(long millisecs)
{
return procawake(millisecs);
}
int
sys_exits(char *status)
{
char *inval = "invalid exit string";
char buf[ERRMAX];
if(status){
if(waserror())
status = inval;
else{
status = validaddr(status, 1, 0);
if(vmemchr(status, 0, ERRMAX) == 0){
memmove(buf, status, ERRMAX);
buf[ERRMAX-1] = 0;
status = buf;
}
poperror();
}
}
pexit(status, 1);
return 0;
}
int
sysawait(char *p, int n)
{
int i;
int pid;
Waitmsg w;
/*
* int await(char* s, int n);
* should really be
* usize await(char* s, usize n);
*/
p = validaddr(p, n, 1);
pid = pwait(&w);
if(pid < 0)
return -1;
i = snprint(p, n, "%d %lud %lud %lud %q",
w.pid,
w.time[TUser], w.time[TSys], w.time[TReal],
w.msg);
return i;
}
void
werrstr(char *fmt, ...)
{
va_list va;
if(up == nil)
return;
va_start(va, fmt);
vseprint(up->syserrstr, up->syserrstr+ERRMAX, fmt, va);
va_end(va);
}
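/*
* Exchange error strings: copy the caller's buffer aside, hand
* back the current syserrstr, and install the saved buffer
* (NUL-terminated) as the new syserrstr.
*/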
static void
generrstr(char *buf, int n)
{
char *p, tmp[ERRMAX];
if(n <= 0)
error(Ebadarg);
p = validaddr(buf, n, 1);
if(n > sizeof tmp)
n = sizeof tmp;
memmove(tmp, p, n);
/* make sure it's NUL-terminated */
tmp[n-1] = '\0';
memmove(p, up->syserrstr, n);
p[n-1] = '\0';
memmove(up->syserrstr, tmp, n);
}
int
syserrstr(char* err, int nerr)
{
generrstr(err, nerr);
return 0;
}
int
sysnotify(void* a0)
{
void (*f)(void*, char*);
/*
* int notify(void (*f)(void*, char*));
*/
f = (void (*)(void*, char*))a0;
if(f != nil)
validaddr(f, sizeof(void (*)(void*, char*)), 0);
up->notify = f;
return 0;
}
int
sysnoted(int v)
{
if(v != NRSTR && !up->notified)
error(Egreg);
return 0;
}
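/*
* Rendezvous: the first process presenting a tag records its
* value and sleeps; a later process presenting the same tag
* unhooks it from the hash chain, leaves its own value behind,
* wakes the sleeper and returns the value the sleeper brought.
* A process marked awake-on-block does not sleep and returns ~0.
*/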
void*
sysrendezvous(void* tagp, void* rendvalp)
{
Proc *p, **l;
uintptr_t tag, val, pc, rendval;
void (*pt)(Proc*, int, int64_t, int64_t);
void *result;
tag = PTR2UINT(tagp);
rendval = PTR2UINT(rendvalp);
l = &REND(up->rgrp, tag);
up->rendval = ~0;
lock(&up->rgrp->l);
for(p = *l; p; p = p->rendhash) {
if(p->rendtag == tag) {
*l = p->rendhash;
val = p->rendval;
p->rendval = rendval;
unlock(&up->rgrp->l);
while(p->mach != 0)
;
ready(p);
result = UINT2PTR(val);
goto rendezvousDone;
}
l = &p->rendhash;
}
if(awakeOnBlock(up)){
unlock(&up->rgrp->l);
result = UINT2PTR(up->rendval);
goto rendezvousDone;
}
/* Going to sleep here */
up->rendtag = tag;
up->rendval = rendval;
up->rendhash = *l;
*l = up;
up->state = Rendezvous;
if(up->trace && (pt = proctrace) != nil){
pc = (uintptr_t)sysrendezvous;
pt(up, SSleep, 0, Rendezvous|(pc<<8));
}
unlock(&up->rgrp->l);
sched();
result = UINT2PTR(up->rendval);
rendezvousDone:
awokeproc(up);
return result;
}
/*
* The implementation of semaphores is complicated by needing
* to avoid rescheduling in syssemrelease, so that it is safe
* to call from real-time processes. This means syssemrelease
* cannot acquire any qlocks, only spin locks.
*
* Semacquire and semrelease must both manipulate the semaphore
* wait list. Lock-free linked lists only exist in theory, not
* in practice, so the wait list is protected by a spin lock.
*
* The semaphore value *addr is stored in user memory, so it
* cannot be read or written while holding spin locks.
*
* Thus, we can access the list only when holding the lock, and
* we can access the semaphore only when not holding the lock.
* This makes things interesting. Note that sleep's condition function
* is called while holding two locks - r and up->rlock - so it cannot
* access the semaphore value either.
*
* An acquirer announces its intention to try for the semaphore
* by putting a Sema structure onto the wait list and then
* setting Sema.waiting. After one last check of semaphore,
* the acquirer sleeps until Sema.waiting==0. A releaser of n
* must wake up n acquirers who have Sema.waiting set. It does
* this by clearing Sema.waiting and then calling wakeup.
*
* There are three interesting races here.
* The first is that in this particular sleep/wakeup usage, a single
* wakeup can rouse a process from two consecutive sleeps!
* The ordering is:
*
* (a) set Sema.waiting = 1
* (a) call sleep
* (b) set Sema.waiting = 0
* (a) check Sema.waiting inside sleep, return w/o sleeping
* (a) try for semaphore, fail
* (a) set Sema.waiting = 1
* (a) call sleep
* (b) call wakeup(a)
* (a) wake up again
*
* This is okay - semacquire will just go around the loop
* again. It does mean that at the top of the for(;;) loop in
* semacquire, phore.waiting might already be set to 1.
*
* The second is that a releaser might wake an acquirer who is
* interrupted before he can acquire the lock. Since
* release(n) issues only n wakeup calls -- only n can be used
* anyway -- if the interrupted process is not going to use his
* wakeup call he must pass it on to another acquirer.
*
* The third race is similar to the second but more subtle. An
* acquirer sets waiting=1 and then does a final canacquire()
* before going to sleep. The opposite order would result in
* missing wakeups that happen between canacquire and
* waiting=1. (In fact, the whole point of Sema.waiting is to
* avoid missing wakeups between canacquire() and sleep().) But
* there can be spurious wakeups between a successful
* canacquire() and the following semdequeue(). This wakeup is
* not useful to the acquirer, since he has already acquired
* the semaphore. Like in the previous case, though, the
* acquirer must pass the wakeup call along.
*
* This is all rather subtle. The code below has been verified
* with the spin model /sys/src/9/port/semaphore.p. The
* original code anticipated the second race but not the first
* or third, which were caught only with spin. The first race
* is mentioned in /sys/doc/sleep.ps, but I'd forgotten about it.
* It was lucky that my abstract model of sleep/wakeup still managed
* to preserve that behavior.
*
* I remain slightly concerned about memory coherence
* outside of locks. The spin model does not take
* queued processor writes into account so we have to
* think hard. The only variables accessed outside locks
* are the semaphore value itself and the boolean flag
* Sema.waiting. The value is only accessed with CAS,
* whose job description includes doing the right thing as
* far as memory coherence across processors. That leaves
* Sema.waiting. To handle it, we call coherence() before each
* read and after each write. - rsc
*/
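/* A minimal user-space sketch (illustrative only; wrapper names
* assumed, the semaphore lives in writable user memory):
*
*	int sem = 1;		// binary semaphore, shared
*	semacquire(&sem, 1);	// block until the count is positive
*	...critical section...
*	semrelease(&sem, 1);	// wake at most one waiter
*/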
/* Add semaphore p with address addr to the list in segment s. */
static void
semqueue(ProcSegment* s, int* addr, Sema* p)
{
memset(p, 0, sizeof *p);
p->addr = addr;
lock(&s->sema.rend.l); /* uses s->sema.Rendez.Lock, but no one else is */
p->next = &s->sema;
p->prev = s->sema.prev;
p->next->prev = p;
p->prev->next = p;
unlock(&s->sema.rend.l);
}
/* Remove semaphore p from list in seg. */
static void
semdequeue(ProcSegment* s, Sema* p)
{
lock(&s->sema.rend.l);
p->next->prev = p->prev;
p->prev->next = p->next;
unlock(&s->sema.rend.l);
}
/* Wake up n waiters with addr on list in seg. */
static void
semwakeup(ProcSegment* s, int* addr, int n)
{
Sema *p;
lock(&s->sema.rend.l);
for(p = s->sema.next; p != &s->sema && n > 0; p = p->next){
if(p->addr == addr && p->waiting){
p->waiting = 0;
coherence();
wakeup(&p->rend);
n--;
}
}
unlock(&s->sema.rend.l);
}
/* Add delta to semaphore and wake up waiters as appropriate. */
static int
semrelease(ProcSegment* s, int* addr, int delta)
{
int value;
do
value = *addr;
while(!CASW(addr, value, value+delta));
semwakeup(s, addr, delta);
return value+delta;
}
/* Try to acquire semaphore using compare-and-swap */
static int
canacquire(int* addr)
{
int value;
while((value = *addr) > 0){
if(CASW(addr, value, value-1))
return 1;
}
return 0;
}
/* Should we wake up? */
static int
semawoke(void* p)
{
coherence();
return !((Sema*)p)->waiting;
}
/* Acquire semaphore (subtract 1). */
static int
semacquire(ProcSegment* s, int* addr, int block)
{
int acquired;
Sema phore;
if(canacquire(addr))
return 1;
if(!block)
return 0;
acquired = 0;
semqueue(s, addr, &phore);
for(;;){
phore.waiting = 1;
coherence();
if(canacquire(addr)){
acquired = 1;
break;
}
if(waserror())
break;
sleep(&phore.rend, semawoke, &phore);
poperror();
}
semdequeue(s, &phore);
coherence(); /* not strictly necessary due to lock in semdequeue */
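/* if a releaser cleared phore.waiting even though we acquired
* the semaphore on our own (or were interrupted), that wakeup
* belongs to some other waiter: pass it along (races two and
* three above)
*/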
if(!phore.waiting)
semwakeup(s, addr, 1);
if(!acquired)
nexterror();
return 1;
}
/* Acquire semaphore or time-out */
static int
tsemacquire(ProcSegment* s, int* addr, uint64_t ms)
{
int acquired, timedout;
uint64_t t, elms;
Sema phore;
if(canacquire(addr))
return 1;
if(ms == 0)
return 0;
acquired = timedout = 0;
semqueue(s, addr, &phore);
for(;;){
phore.waiting = 1;
coherence();
if(canacquire(addr)){
acquired = 1;
break;
}
if(waserror())
break;
t = sys->ticks;
tsleep(&phore.rend, semawoke, &phore, ms);
elms = TK2MS(sys->ticks - t);
poperror();
if(elms >= ms){
timedout = 1;
break;
}
ms -= elms;
}
semdequeue(s, &phore);
coherence(); /* not strictly necessary due to lock in semdequeue */
if(!phore.waiting)
semwakeup(s, addr, 1);
if(timedout)
return 0;
if(!acquired)
nexterror();
return 1;
}
int
syssemacquire(int* addr, int block)
{
ProcSegment *s;
addr = validaddr(addr, sizeof(int), 1);
evenaddr(PTR2UINT(addr));
s = proc_segment(up, PTR2UINT(addr));
if(s == nil || (s->permissions&SgWrite) == 0 || (uintptr_t)addr+sizeof(int) > s->top){
validaddr(addr, sizeof(int), 1);
error(Ebadarg);
}
if(*addr < 0)
error(Ebadarg);
return semacquire(s, addr, block);
}
int
systsemacquire(int* addr, uint64_t ms)
{
ProcSegment *s;
addr = validaddr(addr, sizeof(int), 1);
evenaddr(PTR2UINT(addr));
s = proc_segment(up, PTR2UINT(addr));
if(s == nil || (s->permissions&SgWrite) == 0 || (uintptr_t)addr+sizeof(int) > s->top){
validaddr(addr, sizeof(int), 1);
error(Ebadarg);
}
if(*addr < 0)
error(Ebadarg);
return tsemacquire(s, addr, ms);
}
int
syssemrelease(int* addr, int delta)
{
ProcSegment *s;
addr = validaddr(addr, sizeof(int), 1);
evenaddr(PTR2UINT(addr));
s = proc_segment(up, PTR2UINT(addr));
if(s == nil || (s->permissions&SgWrite) == 0 || (uintptr_t)addr+sizeof(int) > s->top){
validaddr(addr, sizeof(int), 1);
error(Ebadarg);
}
/* delta == 0 is a no-op, not a release */
if(delta < 0 || *addr < 0)
error(Ebadarg);
return semrelease(s, addr, delta);
}