Commit ae76c346 authored by Silas Boyd-Wickizer

Merge conflicts

@@ -20,7 +20,8 @@ NM = $(TOOLPREFIX)nm
 OBJCOPY = $(TOOLPREFIX)objcopy
 CFLAGS = -fno-pic -static -fno-builtin -fno-strict-aliasing -O2 -Wall -MD -ggdb \
-	 -m64 -Werror -std=c99 -fms-extensions -mno-sse -mcmodel=large -I$(QEMUSRC) \
+	 -m64 -Werror -std=c99 -fms-extensions -mno-sse -mcmodel=large -mno-red-zone \
+	 -I$(QEMUSRC) \
 	 -fno-omit-frame-pointer -DHW_$(HW) -include param.h -include compiler.h
 CFLAGS += $(shell $(CC) -fno-stack-protector -E -x c /dev/null >/dev/null 2>&1 && echo -fno-stack-protector)
 ASFLAGS = -m64 -gdwarf-2 -MD
@@ -29,6 +30,7 @@ LDFLAGS += -m elf_x86_64
 OBJS = \
 	bio.o \
 	cga.o \
+	cilk.o \
 	condvar.o \
 	console.o \
 	crange.o \
...
@@ -115,12 +115,6 @@ rep_nop(void)
 }
 
 static inline void
-barrier(void)
-{
-  __asm volatile("" ::: "memory");
-}
-
-static inline void
 lidt(void *p)
 {
   __asm volatile("lidt (%0)" : : "r" (p) : "memory");
...
// cilk style run queue
// A work queue is built from NCPU per-core wqueues.
// A core pushes work to the head of its per-core wqueue.
// A core pops work from the head of its per-core wqueue.
// A core pops work from the tail of another core's per-core wqueue.
//
// Usage:
// void goo(uptr a0, uptr a1) {
// char *arg = (char*) a0;
// cprintf("goo\n");
// arg[1] = 'g';
// }
// void foo(uptr a0, uptr a1) {
// char *arg = (char*) a0;
// cilk_push(goo, a0, 0);
// arg[0] = 'f';
// cprintf("foo\n");
// }
// void example(void) {
// char arg[2];
// cilk_start();
// cilk_push(foo, (uptr)arg, 0);
// cprintf("example\n");
// cilk_end();
// cprintf("%c %c\n", arg[0], arg[1]);
// }
#if CILKENABLE
#include "types.h"
#include "kernel.h"
#include "amd64.h"
#include "cpu.h"
#include "bits.h"
#include "spinlock.h"
#include "condvar.h"
#include "queue.h"
#include "proc.h"
#include "mtrace.h"
#include "qlock.h"
#define NSLOTS (1 << CILKSHIFT)
struct cilkqueue {
struct cilkthread *thread[NSLOTS];
volatile int head __mpalign__;
qlock_t lock;
volatile int tail;
__padout__;
} __mpalign__;
struct cilkthread {
u64 rip;
u64 arg0;
u64 arg1;
struct cilkframe *frame; // parent cilkframe
__padout__;
} __mpalign__;
struct cilkstat {
u64 push;
u64 full;
u64 pop;
u64 steal;
__padout__;
} __mpalign__;
struct cilkqueue queue[NCPU] __mpalign__;
struct cilkstat stat[NCPU] __mpalign__;
static struct cilkqueue *
cilk_cur(void)
{
return &queue[mycpu()->id];
}
static struct cilkframe *
cilk_frame(void)
{
return mycpu()->cilkframe;
}
static struct cilkstat *
cilk_stat(void)
{
return &stat[mycpu()->id];
}
static int
__cilk_push(struct cilkqueue *q, struct cilkthread *t)
{
int i;
i = q->head;
if ((i - q->tail) == NSLOTS) {
cilk_stat()->full++;
return -1;
}
i = i & (NSLOTS-1);
q->thread[i] = t;
q->head++;
cilk_stat()->push++;
return 0;
}
static struct cilkthread *
__cilk_pop(struct cilkqueue *q)
{
struct qnode qn;
int i;
ql_lock(&q->lock, &qn);
i = q->head;
if ((i - q->tail) == 0) {
ql_unlock(&q->lock, &qn);
return NULL;
}
i = (i-1) & (NSLOTS-1);
q->head--;
ql_unlock(&q->lock, &qn);
cilk_stat()->pop++;
return q->thread[i];
}
static struct cilkthread *
__cilk_steal(struct cilkqueue *q)
{
struct qnode qn;
int i;
ql_lock(&q->lock, &qn);
i = q->tail;
if ((i - q->head) == 0) {
ql_unlock(&q->lock, &qn);
return NULL;
}
i = i & (NSLOTS-1);
q->tail++;
ql_unlock(&q->lock, &qn);
cilk_stat()->steal++;
return q->thread[i];
}
static void
__cilk_run(struct cilkthread *th)
{
void (*fn)(uptr arg0, uptr arg1) = (void*)th->rip;
struct cilkframe *old = mycpu()->cilkframe;
mycpu()->cilkframe = th->frame;
fn(th->arg0, th->arg1);
mycpu()->cilkframe = old;
subfetch(&th->frame->ref, 1);
kfree(th);
}
// Add the (rip, arg0, arg1) work to the local work queue.
// Guarantees some core will at some point execute the work.
// The current core might execute the work immediately.
void
cilk_push(void *rip, u64 arg0, u64 arg1)
{
void (*fn)(uptr, uptr) = rip;
struct cilkthread *th;
th = (struct cilkthread *) kalloc();
if (th == NULL) {
fn(arg0, arg1);
return;
}
th->rip = (uptr) rip;
th->arg0 = arg0;
th->arg1 = arg1;
th->frame = cilk_frame();
if (__cilk_push(cilk_cur(), th)) {
kfree(th);
fn(arg0, arg1);
} else
fetchadd(&cilk_frame()->ref, 1);
}
// Try to execute one cilkthread.
// Check local queue then steal from other queues.
int
cilk_trywork(void)
{
struct cilkthread *th;
int i;
pushcli();
th = __cilk_pop(cilk_cur());
if (th != NULL) {
__cilk_run(th);
popcli();
return 1;
}
// XXX(sbw) should be random
for (i = 0; i < NCPU; i++) {
if (i == mycpu()->id)
continue;
th = __cilk_steal(&queue[i]);
if (th != NULL) {
__cilk_run(th);
popcli();
return 1;
}
}
popcli();
return 0;
}
// Start a new work queue frame.
// We don't allow nested work queue frames.
void
cilk_start(void)
{
pushcli();
if (myproc()->cilkframe.ref != 0)
panic("cilk_start");
mycpu()->cilkframe = &myproc()->cilkframe;
}
// End of the current work queue frame.
// The core works while the reference count of the current
// work queue frame is not 0.
void
cilk_end(void)
{
while (cilk_frame()->ref != 0) {
struct cilkthread *th;
int i;
while ((th = __cilk_pop(cilk_cur())) != NULL)
__cilk_run(th);
for (i = 0; i < NCPU; i++) {
th = __cilk_steal(&queue[i]);
if (th != NULL) {
__cilk_run(th);
break;
}
}
}
mycpu()->cilkframe = NULL;
popcli();
}
void
cilk_dump(void)
{
int i;
for (i = 0; i < NCPU; i++)
cprintf("push %lu full %lu pop %lu steal %lu\n",
stat[i].push, stat[i].full, stat[i].pop, stat[i].steal);
}
static void
__test_stub(uptr a0, uptr a1)
{
//cprintf("%lu, %lu\n", a0, a1);
}
void
testcilk(void)
{
enum { iters = 1000 };
static volatile int running = 1;
u64 e, s;
int i;
pushcli();
if (mycpu()->id == 0) {
microdelay(1);
s = rdtsc();
cilk_start();
for (i = 0; i < iters; i++)
cilk_push(__test_stub, i, i);
cilk_end();
e = rdtsc();
cprintf("testcilk: %lu\n", (e-s)/iters);
cilk_dump();
running = 0;
} else {
while (running)
cilk_trywork();
}
popcli();
}
void
initcilkframe(struct cilkframe *cilk)
{
memset(cilk, 0, sizeof(*cilk));
}
void
initcilk(void)
{
int i;
for (i = 0; i < NCPU; i++)
ql_init(&queue[i].lock, "queue lock");
}
#endif // CILKENABLE
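For reference, the queue above is a bounded deque addressed by free-running counters: head - tail is the occupancy even after the int counters wrap, and a counter is only reduced modulo the ring size, by masking with NSLOTS-1, at the moment a slot is indexed. This works because NSLOTS is a power of two. Below is a minimal single-threaded sketch of just that index arithmetic; the ring names are illustrative, and all locking, statistics, and cilkframe reference counting are omitted.

// Standalone sketch (not from the commit) of the ring-buffer index
// arithmetic used by struct cilkqueue.  head and tail run freely and
// are never wrapped, so head - tail stays the item count across
// counter overflow; the slot for a counter is counter & (NSLOTS - 1).
#include <assert.h>

enum { NSLOTS = 1 << 4 };          // mirrors 1 << CILKSHIFT

struct ring {
  void *slot[NSLOTS];
  int head;                        // next free slot (owner end)
  int tail;                        // oldest item (thief end)
};

static int ring_push(struct ring *r, void *item) {
  if (r->head - r->tail == NSLOTS) // full: occupancy is head - tail
    return -1;
  r->slot[r->head & (NSLOTS - 1)] = item;
  r->head++;
  return 0;
}

static void *ring_pop(struct ring *r) {   // LIFO, owner end
  if (r->head - r->tail == 0)
    return 0;
  r->head--;
  return r->slot[r->head & (NSLOTS - 1)];
}

static void *ring_steal(struct ring *r) { // FIFO, thief end
  if (r->head - r->tail == 0)
    return 0;
  void *item = r->slot[r->tail & (NSLOTS - 1)];
  r->tail++;
  return item;
}

int main(void) {
  struct ring r = {0};
  int a, b;
  assert(ring_push(&r, &a) == 0 && ring_push(&r, &b) == 0);
  assert(ring_pop(&r) == &b);      // pop takes the newest item
  assert(ring_steal(&r) == &a);    // steal takes the oldest
  return 0;
}

Popping from the head gives the owning core LIFO order (good cache locality for fork/join work), while thieves steal the oldest, typically largest-grained item from the tail.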
...
 #define __padout__ char __padout[0] __attribute__((aligned(CACHELINE)))
 #define __mpalign__ __attribute__((aligned(CACHELINE)))
 #define __noret__ __attribute__((noreturn))
+
+static inline void
+barrier(void)
+{
+  __asm volatile("" ::: "memory");
+}
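The barrier() moved here is a compiler-only fence: the empty asm with a "memory" clobber emits no instruction but keeps the compiler from reordering or caching memory accesses across it. The rewritten wq_push later in this commit stores the work pointer, calls barrier(), then increments head, so the slot is filled before the counter publishes it. A sketch of that publish pattern follows; the publish name and signature are illustrative, not from the commit.

// Illustrative use of a compiler fence when publishing through a shared
// counter.  Without the barrier the compiler may reorder the two stores,
// letting a consumer observe head advanced before the slot is filled.
static inline void barrier(void) { __asm volatile("" ::: "memory"); }

void publish(void **slots, volatile int *head, int mask, void *item)
{
  slots[*head & mask] = item;  // 1. fill the slot
  barrier();                   // 2. keep the compiler from reordering
  (*head)++;                   // 3. publish: consumers test head - tail
}

On x86 a compiler fence suffices here, since the hardware does not reorder stores with other stores.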
@@ -162,10 +162,10 @@ kerneltrap(struct trapframe *tf)
     kstack = myproc()->kstack;
   }
 
   __cprintf("kernel trap %u cpu %u\n"
-            " tf: rip %p rsp %p cr2 %p\n"
+            " tf: rip %p rsp %p rbp %p cr2 %p cs %p\n"
             " proc: name %s pid %u kstack %p\n",
             tf->trapno, mycpu()->id,
-            tf->rip, tf->rsp, rcr2(),
+            tf->rip, tf->rsp, tf->rbp, rcr2(), tf->cs,
             name, pid, kstack);
   printtrace(tf->rbp);
@@ -235,6 +235,14 @@ consoleintr(int (*getc)(void))
   case C('P'):  // Process listing.
     procdumpall();
     break;
+  case C('E'):  // Print user-space PCs.
+    for (u32 i = 0; i < NCPU; i++)
+      cpus[i].timer_printpc = 1;
+    break;
+  case C('T'):  // Print user-space PCs and stack traces.
+    for (u32 i = 0; i < NCPU; i++)
+      cpus[i].timer_printpc = 2;
+    break;
   case C('U'):  // Kill line.
     while(input.e != input.w &&
           input.buf[(input.e-1) % INPUT_BUF] != '\n'){
@@ -248,8 +256,8 @@ consoleintr(int (*getc)(void))
       consputc(BACKSPACE);
     }
     break;
-  case C('W'):  // Work queue stats
-    wq_dump();
+  case C('C'):  // cilk stats
+    cilk_dump();
     break;
   case C('L'):  // Prof stats
     profdump();
...
#include "mmu.h" #include "mmu.h"
struct wqframe; struct cilkframe;
// Per-CPU state // Per-CPU state
struct cpu { struct cpu {
...@@ -10,7 +10,8 @@ struct cpu { ...@@ -10,7 +10,8 @@ struct cpu {
struct segdesc gdt[NSEGS]; // x86 global descriptor table struct segdesc gdt[NSEGS]; // x86 global descriptor table
struct taskstate ts; // Used by x86 to find stack for interrupt struct taskstate ts; // Used by x86 to find stack for interrupt
struct context *scheduler; // swtch() here to enter scheduler struct context *scheduler; // swtch() here to enter scheduler
struct wqframe *wqframe; struct cilkframe *cilkframe;
int timer_printpc;
// Cpu-local storage variables; see below // Cpu-local storage variables; see below
struct cpu *cpu; struct cpu *cpu;
......
@@ -190,7 +190,7 @@ exec(char *path, char **argv)
   args.path = path;
   args.argv = argv;
 
-  wq_start();
+  cilk_start();
   for(i=0, off=elf.phoff; i<elf.phnum; i++, off+=sizeof(ph)){
     Elf64_Word type;
     if(readi(ip, (char*)&type,
@@ -199,7 +199,7 @@ exec(char *path, char **argv)
       goto bad;
     if(type != ELF_PROG_LOAD)
       continue;
-    wq_push(dosegment, (uptr)&args, (uptr)off);
+    cilk_push(dosegment, (uptr)&args, (uptr)off);
   }
 
   if (odp) {
@@ -210,14 +210,14 @@ exec(char *path, char **argv)
     ip = 0;
   }
 
-  wq_push(doheap, (uptr)&args, (uptr)0);
+  cilk_push(doheap, (uptr)&args, (uptr)0);
 
   // dostack reads from the user address space. The wq
   // stuff doesn't switch to the user address space.
-  //wq_push(dostack, (uptr)&args, (uptr)0);
+  //cilk_push(dostack, (uptr)&args, (uptr)0);
   dostack((uptr)&args, (uptr)0);
-  wq_end();
+  cilk_end();
 
   // Commit to the user image.
   oldvmap = myproc()->vmap;
...
@@ -12,9 +12,9 @@ static inline uptr v2p(void *a) { return (uptr) a - KBASE; }
 static inline void *p2v(uptr a) { return (void *) a + KBASE; }
 
 struct trapframe;
+struct cilkframe;
 struct spinlock;
 struct condvar;
-struct wqframe;
 struct context;
 struct vmnode;
 struct inode;
@@ -328,20 +328,27 @@ struct vmap * vmap_copy(struct vmap *, int);
 // wq.c
 #if WQENABLE
-void wq_push(void *rip, u64 arg0, u64 arg1);
-void wq_start(void);
-void wq_end(void);
-void wq_dump(void);
 int wq_trywork(void);
-void initwqframe(struct wqframe *wq);
 #else
-#define wq_push(rip, arg0, arg1) do { \
+#define wq_trywork() 0
+#endif
+
+// cilk.c
+#if CILKENABLE
+void cilk_push(void *rip, u64 arg0, u64 arg1);
+void cilk_start(void);
+void cilk_end(void);
+void cilk_dump(void);
+int cilk_trywork(void);
+void initcilkframe(struct cilkframe *wq);
+#else
+#define cilk_push(rip, arg0, arg1) do { \
   void (*fn)(uptr, uptr) = rip; \
   fn(arg0, arg1); \
 } while(0)
-#define wq_start() do { } while(0)
-#define wq_end() do { } while(0)
-#define wq_dump() do { } while(0)
-#define wq_trywork() 0
-#define initwqframe(x) do { } while (0)
+#define cilk_start() do { } while(0)
+#define cilk_end() do { } while(0)
+#define cilk_dump() do { } while(0)
+#define cilk_trywork() 0
+#define initcilkframe(x) do { } while (0)
 #endif
@@ -42,3 +42,4 @@ struct klockstat {
 #define LOCKSTAT_PROC 1
 #define LOCKSTAT_SCHED 1
 #define LOCKSTAT_VM 1
+#define LOCKSTAT_WQ 1
@@ -24,6 +24,7 @@ extern void initdisk(void);
 extern void inituser(void);
 extern void inithz(void);
 extern void initwq(void);
+extern void initcilk(void);
 extern void initsamp(void);
 extern void initpci(void);
 extern void initnet(void);
@@ -103,8 +104,9 @@ cmain(u64 mbmagic, u64 mbaddr)
   initbio();       // buffer cache
   initinode();     // inode cache
   initdisk();      // disk
-#if WQENABLE
-  initwq();        // work queues
+  initwq();
+#if CILKENABLE
+  initcilk();
 #endif
   initsamp();
   initlockstat();
...
@@ -51,7 +51,7 @@ main(int ac, char **av)
   int nthread = atoi(av[1]);
 
   acquire(&l);
-  printf(1, "mapbench[%d]: start esp %x, nthread=%d\n", getpid(), rrsp(), nthread);
+  // printf(1, "mapbench[%d]: start esp %x, nthread=%d\n", getpid(), rrsp(), nthread);
 
   for(int i = 0; i < nthread; i++) {
     sbrk(8192);
@@ -72,7 +72,7 @@ main(int ac, char **av)
     acquire(&l);
   }
   release(&l);
-  printf(1, "mapbench[%d]: done\n", getpid());
+  // printf(1, "mapbench[%d]: done\n", getpid());
 
   for(int i = 0; i < nthread; i++)
     wait();
...
@@ -13,7 +13,7 @@
 #define CACHELINE  64   // cache line size
 #define CPUKSTACKS (NPROC + NCPU)
 #define QUANTUM    10   // scheduling time quantum and tick length (in msec)
-#define WQSHIFT    4    // 2^WORKSHIFT work queue slots
+#define CILKSHIFT  4    // 2^WORKSHIFT work queue slots
 #define VICTIMAGE  1000000 // cycles a proc executes before an eligible victim
 #define VERBOSE    0    // print kernel diagnostics
 #define SPINLOCK_DEBUG 1 // Debug spin locks
@@ -21,20 +21,19 @@
 #define VERIFYFREE LOCKSTAT
 #define ALLOC_MEMSET 1
 #define KSHAREDSIZE (32 << 10)
+#define WQENABLE 1
+#define WQSHIFT 4
 #if defined(HW_josmp)
 #define NCPU       16  // maximum number of CPUs
 #define MTRACE     0
-#define WQENABLE   0   // Enable work queue
 #define PERFSIZE   (1<<30ull)
 #elif defined(HW_qemu)
 #define NCPU       4   // maximum number of CPUs
 #define MTRACE     0
-#define WQENABLE   1   // Enable work queue
 #define PERFSIZE   (16<<20ull)
 #elif defined(HW_ud0)
 #define NCPU       4   // maximum number of CPUs
 #define MTRACE     0
-#define WQENABLE   0   // Enable work queue
 #define PERFSIZE   (512<<20ull)
 #else
 #error "Unknown HW"
...
@@ -206,7 +206,7 @@ allocproc(void)
   snprintf(p->lockname, sizeof(p->lockname), "cv:proc:%d", p->pid);
   initlock(&p->lock, p->lockname+3, LOCKSTAT_PROC);
   initcondvar(&p->cv, p->lockname);
-  initwqframe(&p->wqframe);
+  initcilkframe(&p->cilkframe);
 
   if (ns_insert(nspid, KI(p->pid), (void *) p) < 0)
     panic("allocproc: ns_insert");
...
@@ -13,7 +13,7 @@ struct context {
 } __attribute__((packed));
 
 // Work queue frame
-struct wqframe {
+struct cilkframe {
   volatile u64 ref;
 };
@@ -63,7 +63,7 @@ struct proc {
   struct mtrace_stacks mtrace_stacks;
 #endif
   struct runq *runq;
-  struct wqframe wqframe;
+  struct cilkframe cilkframe;
   STAILQ_ENTRY(proc) runqlink;
 
   struct condvar *oncv;  // Where it is sleeping, for kill()
...
@@ -71,6 +71,19 @@ trap(struct trapframe *tf)
   switch(tf->trapno){
   case T_IRQ0 + IRQ_TIMER:
+    if (mycpu()->timer_printpc) {
+      cprintf("cpu%d: proc %s rip %lx rsp %lx cs %x\n",
+              mycpu()->id,
+              myproc() ? myproc()->name : "(none)",
+              tf->rip, tf->rsp, tf->cs);
+      if (mycpu()->timer_printpc == 2 && tf->rbp > KBASE) {
+        uptr pc[10];
+        getcallerpcs((void *) tf->rbp, pc, NELEM(pc));
+        for (int i = 0; i < 10 && pc[i]; i++)
+          cprintf("cpu%d: %lx\n", mycpu()->id, pc[i]);
+      }
+      mycpu()->timer_printpc = 0;
+    }
     if (mycpu()->id == 0)
       cv_tick();
     lapiceoi();
...
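The new C('T') path relies on getcallerpcs() to walk the saved frame-pointer chain, which works because the kernel builds with -fno-omit-frame-pointer (see the Makefile hunk above). A rough sketch of that walk under the usual x86-64 frame layout follows; the layout and names are assumptions for illustration, not the commit's exact code.

// Minimal sketch of a frame-pointer walk like the one getcallerpcs
// performs.  With -fno-omit-frame-pointer every function starts with
// "push %rbp; mov %rsp,%rbp", so each frame begins with the caller's
// saved rbp followed by the return address pushed by call.
struct frame {
  struct frame *rbp;   // saved caller frame pointer
  unsigned long rip;   // return address
};

static void
getpcs(void *rbp, unsigned long *pcs, int n)
{
  struct frame *f = rbp;
  int i;
  for (i = 0; i < n; i++) {
    if (f == 0) {                // a real kernel also range-checks f
      pcs[i] = 0;
      continue;
    }
    pcs[i] = f->rip;
    f = f->rbp;                  // step to the caller's frame
  }
}

The tf->rbp > KBASE guard in the hunk ensures the walk only follows pointers into kernel memory.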
-// cilk style run queue
-// A work queue is built from NCPU per-core wqueues.
-// A core pushes work to the head of its per-core wqueue.
-// A core pops work from the head of its per-core wqueue.
-// A core pops work from the tail of another core's per-core wqueue.
-//
-// Usage:
-// void goo(uptr a0, uptr a1) {
-//   char *arg = (char*) a0;
-//   cprintf("goo\n");
-//   arg[1] = 'g';
-// }
-// void foo(uptr a0, uptr a1) {
-//   char *arg = (char*) a0;
-//   wq_push(goo, a0, 0);
-//   arg[0] = 'f';
-//   cprintf("foo\n");
-// }
-// void example(void) {
-//   char arg[2];
-//   wq_start();
-//   wq_push(foo, (uptr)arg, 0);
-//   cprintf("example\n");
-//   wq_end();
-//   cprintf("%c %c\n", arg[0], arg[1]);
-// }
 #if WQENABLE
 #include "types.h"
 #include "kernel.h"
+#include "spinlock.h"
 #include "amd64.h"
 #include "cpu.h"
-#include "bits.h"
-#include "spinlock.h"
-#include "condvar.h"
-#include "queue.h"
-#include "proc.h"
-#include "mtrace.h"
-#include "qlock.h"
+#include "wq.h"
 
 #define NSLOTS (1 << WQSHIFT)
 
 struct wqueue {
-  struct wqthread *thread[NSLOTS];
-  volatile int head __mpalign__;
-  qlock_t lock;
+  struct work *w[NSLOTS];
+  volatile int head __mpalign__;
   volatile int tail;
+  struct spinlock lock;
   __padout__;
-} __mpalign__;
-
-struct wqthread {
-  u64 rip;
-  u64 arg0;
-  u64 arg1;
-  struct wqframe *frame;  // parent wqframe
-  __padout__;
-} __mpalign__;
+} __mpalign__;;
 
 struct wqstat {
   u64 push;
@@ -68,145 +28,163 @@ struct wqstat {
 struct wqueue queue[NCPU] __mpalign__;
 struct wqstat stat[NCPU] __mpalign__;
 
-static struct wqueue *
-wq_cur(void)
+static inline struct wqueue *
+getwq(void)
 {
-  return &queue[mycpu()->id];
+  pushcli();
+  return &queue[cpunum()];
 }
 
-static struct wqframe *
-wq_frame(void)
+static inline void
+putwq(struct wqueue *wq)
 {
-  return mycpu()->wqframe;
+  popcli();
 }
 
-static struct wqstat *
+static inline struct wqstat *
 wq_stat(void)
 {
-  return &stat[mycpu()->id];
+  return &stat[cpunum()];
+}
+
+static struct work *
+allocwork(void)
+{
+  return (struct work *)kalloc();
+}
+
+static void
+freework(struct work *w)
+{
+  kfree(w);
 }
 
-static int
-__wq_push(struct wqueue *q, struct wqthread *t)
+int
+wq_push(struct work *w)
 {
   int i;
 
-  i = q->head;
-  if ((i - q->tail) == NSLOTS) {
+  struct wqueue *wq = getwq();
+  i = wq->head;
+  if ((i - wq->tail) == NSLOTS) {
     wq_stat()->full++;
     return -1;
   }
   i = i & (NSLOTS-1);
-  q->thread[i] = t;
-  q->head++;
+  wq->w[i] = w;
+  barrier();
+  wq->head++;
   wq_stat()->push++;
+  putwq(wq);
+  return 0;
+}
+
+int
+wq_push1(void (*fn)(struct work *w, void *a0), void *a0)
+{
+  struct work *w = allocwork();
+  if (w == NULL)
+    return -1;
+  w->rip = fn;
+  w->arg0 = a0;
+  if (wq_push(w) < 0) {
+    freework(w);
+    return -1;
+  }
+  return 0;
+}
+
+int
+wq_push2(void (*fn)(struct work*, void*, void*), void *a0, void *a1)
+{
+  struct work *w = allocwork();
+  if (w == NULL)
+    return -1;
+  w->rip = fn;
+  w->arg0 = a0;
+  w->arg1 = a1;
+  if (wq_push(w) < 0) {
+    freework(w);
+    return -1;
+  }
   return 0;
 }
 
-static struct wqthread *
-__wq_pop(struct wqueue *q)
+static struct work *
+__wq_pop(int c)
 {
-  struct qnode qn;
+  // Called with cli
+  struct wqueue *wq = &queue[c];
+  struct work *w;
   int i;
 
-  ql_lock(&q->lock, &qn);
-  i = q->head;
-  if ((i - q->tail) == 0) {
-    ql_unlock(&q->lock, &qn);
+  acquire(&wq->lock);
+  i = wq->head;
+  if ((i - wq->tail) == 0) {
+    release(&wq->lock);
     return NULL;
   }
   i = (i-1) & (NSLOTS-1);
-  q->head--;
-  ql_unlock(&q->lock, &qn);
+  w = wq->w[i];
+  wq->head--;
+  release(&wq->lock);
 
   wq_stat()->pop++;
-  return q->thread[i];
+  return w;
 }
 
-static struct wqthread *
-__wq_steal(struct wqueue *q)
+static struct work *
+__wq_steal(int c)
 {
-  struct qnode qn;
+  // Called with cli
+  struct wqueue *wq = &queue[c];
+  struct work *w;
   int i;
 
-  ql_lock(&q->lock, &qn);
-  i = q->tail;
-  if ((i - q->head) == 0) {
-    ql_unlock(&q->lock, &qn);
+  acquire(&wq->lock);
+  i = wq->tail;
+  if ((i - wq->head) == 0) {
+    release(&wq->lock);
     return NULL;
   }
   i = i & (NSLOTS-1);
-  q->tail++;
-  ql_unlock(&q->lock, &qn);
+  w = wq->w[i];
+  wq->tail++;
+  release(&wq->lock);
 
   wq_stat()->steal++;
-  return q->thread[i];
+  return w;
 }
 
 static void
-__wq_run(struct wqthread *th)
+__wq_run(struct work *w)
 {
-  void (*fn)(uptr arg0, uptr arg1) = (void*)th->rip;
-  struct wqframe *old = mycpu()->wqframe;
-
-  mycpu()->wqframe = th->frame;
-  fn(th->arg0, th->arg1);
-  mycpu()->wqframe = old;
-
-  subfetch(&th->frame->ref, 1);
-  kfree(th);
+  void (*fn)(struct work*, void*, void*) = w->rip;
+  fn(w, w->arg0, w->arg1);
+  freework(w);
 }
 
-// Add the (rip, arg0, arg1) work to the local work queue.
-// Guarantees some core will at some point execute the work.
-// The current core might execute the work immediately.
-void
-wq_push(void *rip, u64 arg0, u64 arg1)
-{
-  void (*fn)(uptr, uptr) = rip;
-  struct wqthread *th;
-
-  th = (struct wqthread *) kalloc();
-  if (th == NULL) {
-    fn(arg0, arg1);
-    return;
-  }
-
-  th->rip = (uptr) rip;
-  th->arg0 = arg0;
-  th->arg1 = arg1;
-  th->frame = wq_frame();
-
-  if (__wq_push(wq_cur(), th)) {
-    kfree(th);
-    fn(arg0, arg1);
-  } else
-    fetchadd(&wq_frame()->ref, 1);
-}
-
-// Try to execute one wqthread.
-// Check local queue then steal from other queues.
 int
 wq_trywork(void)
 {
-  struct wqthread *th;
+  struct work *w;
   int i;
 
   pushcli();
-  th = __wq_pop(wq_cur());
-  if (th != NULL) {
-    __wq_run(th);
+  w = __wq_pop(mycpu()->id);
+  if (w != NULL) {
+    __wq_run(w);
     popcli();
     return 1;
   }
 
   // XXX(sbw) should be random
   for (i = 0; i < NCPU; i++) {
     if (i == mycpu()->id)
       continue;
-    th = __wq_steal(&queue[i]);
-    if (th != NULL) {
-      __wq_run(th);
+    w = __wq_steal(i);
+    if (w != NULL) {
+      __wq_run(w);
       popcli();
       return 1;
     }
@@ -216,42 +194,6 @@ wq_trywork(void)
   return 0;
 }
 
-// Start a new work queue frame.
-// We don't allow nested work queue frames.
-void
-wq_start(void)
-{
-  pushcli();
-  if (myproc()->wqframe.ref != 0)
-    panic("wq_start");
-  mycpu()->wqframe = &myproc()->wqframe;
-}
-
-// End of the current work queue frame.
-// The core works while the reference count of the current
-// work queue frame is not 0.
-void
-wq_end(void)
-{
-  while (wq_frame()->ref != 0) {
-    struct wqthread *th;
-    int i;
-
-    while ((th = __wq_pop(wq_cur())) != NULL)
-      __wq_run(th);
-
-    for (i = 0; i < NCPU; i++) {
-      th = __wq_steal(&queue[i]);
-      if (th != NULL) {
-        __wq_run(th);
-        break;
-      }
-    }
-  }
-  mycpu()->wqframe = NULL;
-  popcli();
-}
-
 void
 wq_dump(void)
 {
@@ -262,31 +204,35 @@ wq_dump(void)
 }
 
 static void
-__test_stub(uptr a0, uptr a1)
+__test_stub(struct work *w, void *a0, void *a1)
 {
-  //cprintf("%lu, %lu\n", a0, a1);
+  //long i = (long)a0;
+  //cprintf("%u: %lu\n", cpunum(), i);
+  volatile int *running = a1;
+  subfetch(running, 1);
 }
 
 void
 testwq(void)
 {
-  enum { iters = 1000 };
-  static volatile int running = 1;
+  enum { iters = 10 };
+  static volatile int running = iters;
   u64 e, s;
-  int i;
+  long i;
 
   pushcli();
   if (mycpu()->id == 0) {
     microdelay(1);
     s = rdtsc();
-    wq_start();
-    for (i = 0; i < iters; i++)
-      wq_push(__test_stub, i, i);
-    wq_end();
+    for (i = 0; i < iters; i++) {
+      if (wq_push2(__test_stub, (void*)i, (void*)&running) < 0)
+        panic("testwq: oops");
+    }
     e = rdtsc();
     cprintf("testwq: %lu\n", (e-s)/iters);
+    while (running)
+      nop_pause();
     wq_dump();
-    running = 0;
   } else {
     while (running)
       wq_trywork();
@@ -295,17 +241,12 @@ testwq(void)
 }
 
 void
-initwqframe(struct wqframe *wq)
-{
-  memset(wq, 0, sizeof(*wq));
-}
-
-void
 initwq(void)
 {
   int i;
 
   for (i = 0; i < NCPU; i++)
-    ql_init(&queue[i].lock, "queue lock");
+    initlock(&queue[i].lock, "wq lock", LOCKSTAT_WQ);
 }
 #endif // WQENABLE
...
struct work {
void *rip;
void *arg0;
void *arg1;
char data[];
};
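struct work gives callers a raw cell: rip is the callback, arg0/arg1 its arguments, and data[] is a flexible array member available for inline payload when a caller fills in a work item itself. Unlike the old wq_push, the new wq_push1/wq_push2 can fail (full queue or allocation failure) instead of silently running the function inline, so callers must choose a fallback. A hypothetical caller follows; dofoo, start_batch, and the completion counter are illustrative, not from this commit.

// Hypothetical fork/join-style caller of the new API.  The callback gets
// its struct work as the first argument; this one just signals completion.
static void
dofoo(struct work *w, void *a0, void *a1)
{
  volatile int *pending = a0;
  // ... real work on a1 would go here ...
  subfetch(pending, 1);                  // atomic decrement, as in __test_stub
}

static void
start_batch(void *arg)
{
  volatile int pending = 1;
  if (wq_push2(dofoo, (void *)&pending, arg) < 0)
    dofoo(NULL, (void *)&pending, arg);  // queue full: run inline instead
  while (pending)
    wq_trywork();                        // help drain queues until done
}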