Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
X
xv6-public
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
问题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
银宸时代
OS Lab Group
奖励实验
xv6-public
提交
ae76c346
提交
ae76c346
2月 09, 2012
创建
作者:
Silas Boyd-Wickizer
浏览文件
操作
浏览文件
下载
差异文件
Merge conflicts
上级
364cfa6a
ae573cc0
隐藏空白字符变更
内嵌
并排
正在显示
17 个修改的文件
包含
513 行增加
和
222 行删除
+513
-222
Makefile
Makefile
+3
-1
amd64.h
amd64.h
+0
-6
cilk.c
cilk.c
+311
-0
compiler.h
compiler.h
+6
-0
console.c
console.c
+12
-4
cpu.h
cpu.h
+3
-2
exec.c
exec.c
+5
-5
kernel.h
kernel.h
+19
-12
lockstat.h
lockstat.h
+1
-0
main.c
main.c
+4
-2
mapbench.c
mapbench.c
+2
-2
param.h
param.h
+3
-4
proc.c
proc.c
+1
-1
proc.h
proc.h
+2
-2
trap.c
trap.c
+14
-1
wq.c
wq.c
+121
-180
wq.h
wq.h
+6
-0
没有找到文件。
Makefile
浏览文件 @
ae76c346
...
...
@@ -20,7 +20,8 @@ NM = $(TOOLPREFIX)nm
OBJCOPY
=
$(TOOLPREFIX)
objcopy
CFLAGS
=
-fno-pic
-static
-fno-builtin
-fno-strict-aliasing
-O2
-Wall
-MD
-ggdb
\
-m64
-Werror
-std
=
c99
-fms-extensions
-mno-sse
-mcmodel
=
large
-I
$(QEMUSRC)
\
-m64
-Werror
-std
=
c99
-fms-extensions
-mno-sse
-mcmodel
=
large
-mno-red-zone
\
-I
$(QEMUSRC)
\
-fno-omit-frame-pointer
-DHW_
$(HW)
-include
param.h
-include
compiler.h
CFLAGS
+=
$(
shell
$(CC)
-fno-stack-protector
-E
-x
c /dev/null
>
/dev/null 2>&1
&&
echo
-fno-stack-protector
)
ASFLAGS
=
-m64
-gdwarf-2
-MD
...
...
@@ -29,6 +30,7 @@ LDFLAGS += -m elf_x86_64
OBJS
=
\
bio.o
\
cga.o
\
cilk.o
\
condvar.o
\
console.o
\
crange.o
\
...
...
amd64.h
浏览文件 @
ae76c346
...
...
@@ -115,12 +115,6 @@ rep_nop(void)
}
static
inline
void
barrier
(
void
)
{
__asm
volatile
(
""
:::
"memory"
);
}
static
inline
void
lidt
(
void
*
p
)
{
__asm
volatile
(
"lidt (%0)"
:
:
"r"
(
p
)
:
"memory"
);
...
...
cilk.c
0 → 100644
浏览文件 @
ae76c346
// cilk style run queue
// A work queue is built from NCPU per-core wqueues.
// A core pushes work to the head of its per-core wqueue.
// A core pops work from the head of its per-core wqueue.
// A core pops work from the tail of another core's per-core wqueue.
//
// Usage:
// void goo(uptr a0, uptr a1) {
// char *arg = (char*) a0;
// cprintf("goo\n");
// arg[1] = 'g';
// }
// void foo(uptr a0, uptr a1) {
// char *arg = (char*) a0;
// cilk_push(goo, a0, 0);
// arg[0] = 'f';
// cprintf("foo\n");
// }
// void example(void) {
// char arg[2];
// cilk_start();
// cilk_push(foo, (uptr)arg, 0);
// cprintf("example\n");
// cilk_end();
// cprintf("%c %c\n", arg[0], arg[1]);
// }
#if CILKENABLE
#include "types.h"
#include "kernel.h"
#include "amd64.h"
#include "cpu.h"
#include "bits.h"
#include "spinlock.h"
#include "condvar.h"
#include "queue.h"
#include "proc.h"
#include "mtrace.h"
#include "qlock.h"
#define NSLOTS (1 << CILKSHIFT)
// Per-core circular queue of pending cilkthreads.
// head and tail grow monotonically; an index is mapped into the ring
// with (i & (NSLOTS-1)), so (head - tail) is the current occupancy.
struct cilkqueue {
  struct cilkthread *thread[NSLOTS]; // ring buffer of pending work
  volatile int head __mpalign__;     // next slot to fill; pushes/pops happen here

  qlock_t lock;                      // serializes __cilk_pop against __cilk_steal
  volatile int tail;                 // oldest entry; thieves consume from here
  __padout__;
} __mpalign__;
// One unit of deferred work: a function pointer plus two arguments,
// and the frame whose reference count tracks this thread's completion.
struct cilkthread {
  u64 rip;                  // function to call, cast to void (*)(uptr, uptr)
  u64 arg0;
  u64 arg1;
  struct cilkframe *frame;  // parent cilkframe; ref is decremented in __cilk_run
  __padout__;
} __mpalign__;
// Per-core event counters, printed by cilk_dump().
struct cilkstat {
  u64 push;   // successful local pushes
  u64 full;   // pushes rejected because the ring was full
  u64 pop;    // local pops from the head
  u64 steal;  // threads taken from another core's tail
  __padout__;
} __mpalign__;
// One queue and one stat record per CPU, cacheline-aligned to avoid
// false sharing between cores.
struct cilkqueue queue[NCPU] __mpalign__;
struct cilkstat stat[NCPU] __mpalign__;
// Return this core's run queue.
static struct cilkqueue *
cilk_cur(void)
{
  struct cilkqueue *cq = &queue[mycpu()->id];
  return cq;
}
// Return the cilkframe currently installed on this core
// (set by cilk_start(), cleared by cilk_end()).
static struct cilkframe *
cilk_frame(void)
{
  struct cilkframe *frame = mycpu()->cilkframe;
  return frame;
}
// Return this core's statistics record.
static struct cilkstat *
cilk_stat(void)
{
  struct cilkstat *cs = &stat[mycpu()->id];
  return cs;
}
// Enqueue t at the head of q.  Returns 0 on success, -1 if the ring
// is full (the caller then runs the work inline).
// NOTE(review): this takes no lock — it appears to rely on pushes only
// coming from the owning core (via cilk_cur()) while thieves touch only
// tail under q->lock; confirm the store to thread[i] is ordered before
// the head++ that publishes it.
static int
__cilk_push(struct cilkqueue *q, struct cilkthread *t)
{
  int i;

  i = q->head;
  // (head - tail) == NSLOTS means every slot is occupied.
  if ((i - q->tail) == NSLOTS) {
    cilk_stat()->full++;
    return -1;
  }

  i = i & (NSLOTS - 1);  // map the monotonic index into the ring
  q->thread[i] = t;
  q->head++;             // publish: thieves may now see this entry
  cilk_stat()->push++;
  return 0;
}
// Take the most recently pushed thread (LIFO) from q's head, or NULL
// if the queue is empty.  Holds q->lock to serialize against
// __cilk_steal() racing for the last remaining entry.
static struct cilkthread *
__cilk_pop(struct cilkqueue *q)
{
  struct qnode qn;
  int i;

  ql_lock(&q->lock, &qn);
  i = q->head;
  if ((i - q->tail) == 0) {
    // empty
    ql_unlock(&q->lock, &qn);
    return NULL;
  }
  i = (i - 1) & (NSLOTS - 1);  // slot of the newest entry
  q->head--;                   // claim it while holding the lock
  ql_unlock(&q->lock, &qn);

  cilk_stat()->pop++;
  // Safe to read after unlock: the slot was claimed above, and only the
  // owning core pushes new entries into this queue.
  return q->thread[i];
}
// Take the oldest thread (FIFO) from q's tail, or NULL if empty.
// Called on queues belonging to other cores; q->lock serializes
// concurrent thieves and the owner's __cilk_pop().
static struct cilkthread *
__cilk_steal(struct cilkqueue *q)
{
  struct qnode qn;
  int i;

  ql_lock(&q->lock, &qn);
  i = q->tail;
  if ((i - q->head) == 0) {
    // empty
    ql_unlock(&q->lock, &qn);
    return NULL;
  }
  i = i & (NSLOTS - 1);  // slot of the oldest entry
  q->tail++;             // claim it while holding the lock
  ql_unlock(&q->lock, &qn);

  cilk_stat()->steal++;
  // The slot was claimed above, so the read after unlock is stable.
  return q->thread[i];
}
// Execute one cilkthread on the current core: install its parent frame,
// call the function, restore the previous frame, then drop the frame's
// reference and free the thread.  The subfetch is what eventually lets
// cilk_end() observe ref == 0 and return.
static void
__cilk_run(struct cilkthread *th)
{
  void (*fn)(uptr arg0, uptr arg1) = (void *) th->rip;
  struct cilkframe *old = mycpu()->cilkframe;

  // Run under the pushing core's frame so nested cilk_push() calls
  // account against the right reference count.
  mycpu()->cilkframe = th->frame;
  fn(th->arg0, th->arg1);
  mycpu()->cilkframe = old;

  subfetch(&th->frame->ref, 1);
  kfree(th);
}
// Add the (rip, arg0, arg1) work to the local work queue.
// Guarantees some core will at some point execute the work.
// The current core might execute the work immediately (when allocation
// fails or the local ring is full, the function runs inline instead).
void
cilk_push(void *rip, u64 arg0, u64 arg1)
{
  void (*fn)(uptr, uptr) = rip;
  struct cilkthread *th;

  th = (struct cilkthread *) kalloc();
  if (th == NULL) {
    // No memory for a deferred thread: run synchronously.
    fn(arg0, arg1);
    return;
  }

  th->rip = (uptr) rip;
  th->arg0 = arg0;
  th->arg1 = arg1;
  th->frame = cilk_frame();

  // Take the frame reference BEFORE publishing th.  Once __cilk_push
  // succeeds the thread is visible to thieves, and __cilk_run()
  // decrements ref when it finishes; incrementing only after the push
  // races with that decrement, transiently underflowing ref and letting
  // cilk_end() observe ref == 0 while this work is still outstanding.
  fetchadd(&cilk_frame()->ref, 1);
  if (__cilk_push(cilk_cur(), th)) {
    // Queue full: undo the reference, free the thread, run inline.
    subfetch(&cilk_frame()->ref, 1);
    kfree(th);
    fn(arg0, arg1);
  }
}
// Try to execute one cilkthread.
// Check local queue then steal from other queues.
// Returns 1 if a thread was run, 0 if no work was found anywhere.
// Runs with interrupts disabled (pushcli/popcli) so mycpu() stays stable.
int
cilk_trywork(void)
{
  struct cilkthread *th;
  int i;

  pushcli();
  // Prefer our own queue (LIFO pop, no cross-core traffic).
  th = __cilk_pop(cilk_cur());
  if (th != NULL) {
    __cilk_run(th);
    popcli();
    return 1;
  }

  // XXX(sbw) should be random
  for (i = 0; i < NCPU; i++) {
    if (i == mycpu()->id)
      continue;  // already checked our own queue above
    th = __cilk_steal(&queue[i]);
    if (th != NULL) {
      __cilk_run(th);
      popcli();
      return 1;
    }
  }

  popcli();
  return 0;
}
// Start a new work queue frame.
// We don't allow nested work queue frames.
// NOTE(review): the pushcli() here is deliberately unbalanced — the
// matching popcli() is in cilk_end(); interrupts stay disabled for the
// whole frame.  Confirm every cilk_start() is paired with cilk_end().
void
cilk_start(void)
{
  pushcli();
  // A nonzero ref means a frame is already active for this process.
  if (myproc()->cilkframe.ref != 0)
    panic("cilk_start");
  mycpu()->cilkframe = &myproc()->cilkframe;
}
// End of the current work queue frame.
// The core works while the reference count of the current
// work queue frame is not 0: it drains its own queue, then steals one
// thread at a time, re-checking ref between rounds.  Re-enables
// interrupts (popcli) disabled by cilk_start().
void
cilk_end(void)
{
  while (cilk_frame()->ref != 0) {
    struct cilkthread *th;
    int i;

    // Drain the local queue first.
    while ((th = __cilk_pop(cilk_cur())) != NULL)
      __cilk_run(th);

    // Then try to steal one thread from any queue (including our own,
    // which is harmlessly empty at this point).
    for (i = 0; i < NCPU; i++) {
      th = __cilk_steal(&queue[i]);
      if (th != NULL) {
        __cilk_run(th);
        break;  // re-check ref before stealing more
      }
    }
  }
  mycpu()->cilkframe = NULL;
  popcli();
}
void
cilk_dump
(
void
)
{
int
i
;
for
(
i
=
0
;
i
<
NCPU
;
i
++
)
cprintf
(
"push %lu full %lu pop %lu steal %lu
\n
"
,
stat
[
i
].
push
,
stat
[
i
].
full
,
stat
[
i
].
pop
,
stat
[
i
].
steal
);
}
// Deliberately empty work item used by testcilk() to measure pure
// queueing overhead.
static void
__test_stub(uptr a0, uptr a1)
{
  //cprintf("%lu, %lu\n", a0, a1);
}
// Microbenchmark: CPU 0 pushes `iters` no-op threads and reports the
// average cycles per push/run round trip; all other CPUs spin in
// cilk_trywork() as workers until CPU 0 clears `running`.
// Intended to be entered on every CPU.
void
testcilk(void)
{
  enum { iters = 1000 };
  static volatile int running = 1;  // cross-CPU stop flag, cleared by CPU 0
  u64 e, s;
  int i;

  pushcli();
  if (mycpu()->id == 0) {
    microdelay(1);  // give the other CPUs a moment to reach the worker loop
    s = rdtsc();
    cilk_start();
    for (i = 0; i < iters; i++)
      cilk_push(__test_stub, i, i);
    cilk_end();
    e = rdtsc();
    cprintf("testcilk: %lu\n", (e - s) / iters);
    cilk_dump();
    running = 0;  // release the worker CPUs
  } else {
    while (running)
      cilk_trywork();
  }
  popcli();
}
// Zero a cilkframe (ref = 0) before its process first uses cilk_start().
void
initcilkframe(struct cilkframe *cilk)
{
  memset(cilk, 0, sizeof(*cilk));
}
// One-time boot initialization: set up each per-core queue's lock.
void
initcilk(void)
{
  int c;

  for (c = 0; c < NCPU; c++)
    ql_init(&queue[c].lock, "queue lock");
}
#endif // CILKENABLE
compiler.h
浏览文件 @
ae76c346
#define __padout__ char __padout[0] __attribute__((aligned(CACHELINE)))
#define __mpalign__ __attribute__((aligned(CACHELINE)))
#define __noret__ __attribute__((noreturn))
static
inline
void
barrier
(
void
)
{
__asm
volatile
(
""
:::
"memory"
);
}
console.c
浏览文件 @
ae76c346
...
...
@@ -162,10 +162,10 @@ kerneltrap(struct trapframe *tf)
kstack
=
myproc
()
->
kstack
;
}
__cprintf
(
"kernel trap %u cpu %u
\n
"
" tf: rip %p rsp %p
cr2
%p
\n
"
" tf: rip %p rsp %p
rbp %p cr2 %p cs
%p
\n
"
" proc: name %s pid %u kstack %p
\n
"
,
tf
->
trapno
,
mycpu
()
->
id
,
tf
->
rip
,
tf
->
rsp
,
rcr2
()
,
tf
->
rip
,
tf
->
rsp
,
tf
->
rbp
,
rcr2
(),
tf
->
cs
,
name
,
pid
,
kstack
);
printtrace
(
tf
->
rbp
);
...
...
@@ -235,6 +235,14 @@ consoleintr(int (*getc)(void))
case
C
(
'P'
):
// Process listing.
procdumpall
();
break
;
case
C
(
'E'
):
// Print user-space PCs.
for
(
u32
i
=
0
;
i
<
NCPU
;
i
++
)
cpus
[
i
].
timer_printpc
=
1
;
break
;
case
C
(
'T'
):
// Print user-space PCs and stack traces.
for
(
u32
i
=
0
;
i
<
NCPU
;
i
++
)
cpus
[
i
].
timer_printpc
=
2
;
break
;
case
C
(
'U'
):
// Kill line.
while
(
input
.
e
!=
input
.
w
&&
input
.
buf
[(
input
.
e
-
1
)
%
INPUT_BUF
]
!=
'\n'
){
...
...
@@ -248,8 +256,8 @@ consoleintr(int (*getc)(void))
consputc
(
BACKSPACE
);
}
break
;
case
C
(
'
W'
):
// Work queue
stats
wq
_dump
();
case
C
(
'
C'
):
// cilk
stats
cilk
_dump
();
break
;
case
C
(
'L'
):
// Prof stats
profdump
();
...
...
cpu.h
浏览文件 @
ae76c346
#include "mmu.h"
struct
wq
frame
;
struct
cilk
frame
;
// Per-CPU state
struct
cpu
{
...
...
@@ -10,7 +10,8 @@ struct cpu {
struct
segdesc
gdt
[
NSEGS
];
// x86 global descriptor table
struct
taskstate
ts
;
// Used by x86 to find stack for interrupt
struct
context
*
scheduler
;
// swtch() here to enter scheduler
struct
wqframe
*
wqframe
;
struct
cilkframe
*
cilkframe
;
int
timer_printpc
;
// Cpu-local storage variables; see below
struct
cpu
*
cpu
;
...
...
exec.c
浏览文件 @
ae76c346
...
...
@@ -190,7 +190,7 @@ exec(char *path, char **argv)
args
.
path
=
path
;
args
.
argv
=
argv
;
wq
_start
();
cilk
_start
();
for
(
i
=
0
,
off
=
elf
.
phoff
;
i
<
elf
.
phnum
;
i
++
,
off
+=
sizeof
(
ph
)){
Elf64_Word
type
;
if
(
readi
(
ip
,
(
char
*
)
&
type
,
...
...
@@ -199,7 +199,7 @@ exec(char *path, char **argv)
goto
bad
;
if
(
type
!=
ELF_PROG_LOAD
)
continue
;
wq
_push
(
dosegment
,
(
uptr
)
&
args
,
(
uptr
)
off
);
cilk
_push
(
dosegment
,
(
uptr
)
&
args
,
(
uptr
)
off
);
}
if
(
odp
)
{
...
...
@@ -210,14 +210,14 @@ exec(char *path, char **argv)
ip
=
0
;
}
wq
_push
(
doheap
,
(
uptr
)
&
args
,
(
uptr
)
0
);
cilk
_push
(
doheap
,
(
uptr
)
&
args
,
(
uptr
)
0
);
// dostack reads from the user address space. The wq
// stuff doesn't switch to the user address space.
//
wq
_push(dostack, (uptr)&args, (uptr)0);
//
cilk
_push(dostack, (uptr)&args, (uptr)0);
dostack
((
uptr
)
&
args
,
(
uptr
)
0
);
wq
_end
();
cilk
_end
();
// Commit to the user image.
oldvmap
=
myproc
()
->
vmap
;
...
...
kernel.h
浏览文件 @
ae76c346
...
...
@@ -12,9 +12,9 @@ static inline uptr v2p(void *a) { return (uptr) a - KBASE; }
static
inline
void
*
p2v
(
uptr
a
)
{
return
(
void
*
)
a
+
KBASE
;
}
struct
trapframe
;
struct
cilkframe
;
struct
spinlock
;
struct
condvar
;
struct
wqframe
;
struct
context
;
struct
vmnode
;
struct
inode
;
...
...
@@ -328,20 +328,27 @@ struct vmap * vmap_copy(struct vmap *, int);
// wq.c
#if WQENABLE
void
wq_push
(
void
*
rip
,
u64
arg0
,
u64
arg1
);
void
wq_start
(
void
);
void
wq_end
(
void
);
void
wq_dump
(
void
);
int
wq_trywork
(
void
);
void
initwqframe
(
struct
wqframe
*
wq
);
#else
#define wq_push(rip, arg0, arg1) do { \
#define wq_trywork() 0
#endif
// cilk.c
#if CILKENABLE
void
cilk_push
(
void
*
rip
,
u64
arg0
,
u64
arg1
);
void
cilk_start
(
void
);
void
cilk_end
(
void
);
void
cilk_dump
(
void
);
int
cilk_trywork
(
void
);
void
initcilkframe
(
struct
cilkframe
*
wq
);
#else
#define cilk_push(rip, arg0, arg1) do { \
void (*fn)(uptr, uptr) = rip; \
fn(arg0, arg1); \
} while(0)
#define
wq
_start() do { } while(0)
#define
wq
_end() do { } while(0)
#define
wq
_dump() do { } while(0)
#define
wq
_trywork() 0
#define init
wq
frame(x) do { } while (0)
#define
cilk
_start() do { } while(0)
#define
cilk
_end() do { } while(0)
#define
cilk
_dump() do { } while(0)
#define
cilk
_trywork() 0
#define init
cilk
frame(x) do { } while (0)
#endif
lockstat.h
浏览文件 @
ae76c346
...
...
@@ -42,3 +42,4 @@ struct klockstat {
#define LOCKSTAT_PROC 1
#define LOCKSTAT_SCHED 1
#define LOCKSTAT_VM 1
#define LOCKSTAT_WQ 1
main.c
浏览文件 @
ae76c346
...
...
@@ -24,6 +24,7 @@ extern void initdisk(void);
extern
void
inituser
(
void
);
extern
void
inithz
(
void
);
extern
void
initwq
(
void
);
extern
void
initcilk
(
void
);
extern
void
initsamp
(
void
);
extern
void
initpci
(
void
);
extern
void
initnet
(
void
);
...
...
@@ -103,8 +104,9 @@ cmain(u64 mbmagic, u64 mbaddr)
initbio
();
// buffer cache
initinode
();
// inode cache
initdisk
();
// disk
#if WQENABLE
initwq
();
// work queues
initwq
();
#if CILKENABLE
initcilk
();
#endif
initsamp
();
initlockstat
();
...
...
mapbench.c
浏览文件 @
ae76c346
...
...
@@ -51,7 +51,7 @@ main(int ac, char **av)
int
nthread
=
atoi
(
av
[
1
]);
acquire
(
&
l
);
printf
(
1
,
"mapbench[%d]: start esp %x, nthread=%d
\n
"
,
getpid
(),
rrsp
(),
nthread
);
//
printf(1, "mapbench[%d]: start esp %x, nthread=%d\n", getpid(), rrsp(), nthread);
for
(
int
i
=
0
;
i
<
nthread
;
i
++
)
{
sbrk
(
8192
);
...
...
@@ -72,7 +72,7 @@ main(int ac, char **av)
acquire
(
&
l
);
}
release
(
&
l
);
printf
(
1
,
"mapbench[%d]: done
\n
"
,
getpid
());
//
printf(1, "mapbench[%d]: done\n", getpid());
for
(
int
i
=
0
;
i
<
nthread
;
i
++
)
wait
();
...
...
param.h
浏览文件 @
ae76c346
...
...
@@ -13,7 +13,7 @@
#define CACHELINE 64 // cache line size
#define CPUKSTACKS (NPROC + NCPU)
#define QUANTUM 10 // scheduling time quantum and tick length (in msec)
#define
WQSHIFT
4 // 2^WORKSHIFT work queue slots
#define
CILKSHIFT
4 // 2^WORKSHIFT work queue slots
#define VICTIMAGE 1000000 // cycles a proc executes before an eligible victim
#define VERBOSE 0 // print kernel diagnostics
#define SPINLOCK_DEBUG 1 // Debug spin locks
...
...
@@ -21,20 +21,19 @@
#define VERIFYFREE LOCKSTAT
#define ALLOC_MEMSET 1
#define KSHAREDSIZE (32 << 10)
#define WQENABLE 1
#define WQSHIFT 4
#if defined(HW_josmp)
#define NCPU 16 // maximum number of CPUs
#define MTRACE 0
#define WQENABLE 0 // Enable work queue
#define PERFSIZE (1<<30ull)
#elif defined(HW_qemu)
#define NCPU 4 // maximum number of CPUs
#define MTRACE 0
#define WQENABLE 1 // Enable work queue
#define PERFSIZE (16<<20ull)
#elif defined(HW_ud0)
#define NCPU 4 // maximum number of CPUs
#define MTRACE 0
#define WQENABLE 0 // Enable work queue
#define PERFSIZE (512<<20ull)
#else
#error "Unknown HW"
...
...
proc.c
浏览文件 @
ae76c346
...
...
@@ -206,7 +206,7 @@ allocproc(void)
snprintf
(
p
->
lockname
,
sizeof
(
p
->
lockname
),
"cv:proc:%d"
,
p
->
pid
);
initlock
(
&
p
->
lock
,
p
->
lockname
+
3
,
LOCKSTAT_PROC
);
initcondvar
(
&
p
->
cv
,
p
->
lockname
);
init
wqframe
(
&
p
->
wq
frame
);
init
cilkframe
(
&
p
->
cilk
frame
);
if
(
ns_insert
(
nspid
,
KI
(
p
->
pid
),
(
void
*
)
p
)
<
0
)
panic
(
"allocproc: ns_insert"
);
...
...
proc.h
浏览文件 @
ae76c346
...
...
@@ -13,7 +13,7 @@ struct context {
}
__attribute__
((
packed
));
// Work queue frame
struct
wq
frame
{
struct
cilk
frame
{
volatile
u64
ref
;
};
...
...
@@ -63,7 +63,7 @@ struct proc {
struct
mtrace_stacks
mtrace_stacks
;
#endif
struct
runq
*
runq
;
struct
wqframe
wq
frame
;
struct
cilkframe
cilk
frame
;
STAILQ_ENTRY
(
proc
)
runqlink
;
struct
condvar
*
oncv
;
// Where it is sleeping, for kill()
...
...
trap.c
浏览文件 @
ae76c346
...
...
@@ -71,8 +71,21 @@ trap(struct trapframe *tf)
switch
(
tf
->
trapno
){
case
T_IRQ0
+
IRQ_TIMER
:
if
(
mycpu
()
->
timer_printpc
)
{
cprintf
(
"cpu%d: proc %s rip %lx rsp %lx cs %x
\n
"
,
mycpu
()
->
id
,
myproc
()
?
myproc
()
->
name
:
"(none)"
,
tf
->
rip
,
tf
->
rsp
,
tf
->
cs
);
if
(
mycpu
()
->
timer_printpc
==
2
&&
tf
->
rbp
>
KBASE
)
{
uptr
pc
[
10
];
getcallerpcs
((
void
*
)
tf
->
rbp
,
pc
,
NELEM
(
pc
));
for
(
int
i
=
0
;
i
<
10
&&
pc
[
i
];
i
++
)
cprintf
(
"cpu%d: %lx
\n
"
,
mycpu
()
->
id
,
pc
[
i
]);
}
mycpu
()
->
timer_printpc
=
0
;
}
if
(
mycpu
()
->
id
==
0
)
cv_tick
();
cv_tick
();
lapiceoi
();
break
;
case
T_IRQ0
+
IRQ_IDE
:
...
...
wq.c
浏览文件 @
ae76c346
// cilk style run queue
// A work queue is built from NCPU per-core wqueues.
// A core pushes work to the head of its per-core wqueue.
// A core pops work from the head of its per-core wqueue.
// A core pops work from the tail of another core's per-core wqueue.
//
// Usage:
// void goo(uptr a0, uptr a1) {
// char *arg = (char*) a0;
// cprintf("goo\n");
// arg[1] = 'g';
// }
// void foo(uptr a0, uptr a1) {
// char *arg = (char*) a0;
// wq_push(goo, a0, 0);
// arg[0] = 'f';
// cprintf("foo\n");
// }
// void example(void) {
// char arg[2];
// wq_start();
// wq_push(foo, (uptr)arg, 0);
// cprintf("example\n");
// wq_end();
// cprintf("%c %c\n", arg[0], arg[1]);
// }
#if WQENABLE
#include "types.h"
#include "kernel.h"
#include "spinlock.h"
#include "amd64.h"
#include "cpu.h"
#include "bits.h"
#include "spinlock.h"
#include "condvar.h"
#include "queue.h"
#include "proc.h"
#include "mtrace.h"
#include "qlock.h"
#include "wq.h"
#define NSLOTS (1 << WQSHIFT)
struct
wqueue
{
struct
wqthread
*
thread
[
NSLOTS
];
struct
work
*
w
[
NSLOTS
];
volatile
int
head
__mpalign__
;
qlock_t
lock
;
volatile
int
tail
;
struct
spinlock
lock
;
__padout__
;
}
__mpalign__
;
struct
wqthread
{
u64
rip
;
u64
arg0
;
u64
arg1
;
struct
wqframe
*
frame
;
// parent wqframe
__padout__
;
}
__mpalign__
;
}
__mpalign__
;;
struct
wqstat
{
u64
push
;
...
...
@@ -68,145 +28,163 @@ struct wqstat {
struct
wqueue
queue
[
NCPU
]
__mpalign__
;
struct
wqstat
stat
[
NCPU
]
__mpalign__
;
static
struct
wqueue
*
wq_cur
(
void
)
static
inline
struct
wqueue
*
getwq
(
void
)
{
return
&
queue
[
mycpu
()
->
id
];
pushcli
();
return
&
queue
[
cpunum
()];
}
static
struct
wqframe
*
wq_frame
(
void
)
static
inline
void
putwq
(
struct
wqueue
*
wq
)
{
return
mycpu
()
->
wqframe
;
popcli
()
;
}
static
struct
wqstat
*
static
inline
struct
wqstat
*
wq_stat
(
void
)
{
return
&
stat
[
mycpu
()
->
id
];
return
&
stat
[
cpunum
()];
}
static
struct
work
*
allocwork
(
void
)
{
return
(
struct
work
*
)
kalloc
();
}
static
void
freework
(
struct
work
*
w
)
{
kfree
(
w
);
}
static
int
__wq_push
(
struct
wqueue
*
q
,
struct
wqthread
*
t
)
int
wq_push
(
struct
work
*
w
)
{
int
i
;
i
=
q
->
head
;
if
((
i
-
q
->
tail
)
==
NSLOTS
)
{
struct
wqueue
*
wq
=
getwq
();
i
=
wq
->
head
;
if
((
i
-
wq
->
tail
)
==
NSLOTS
)
{
wq_stat
()
->
full
++
;
return
-
1
;
}
i
=
i
&
(
NSLOTS
-
1
);
q
->
thread
[
i
]
=
t
;
q
->
head
++
;
wq
->
w
[
i
]
=
w
;
barrier
()
;
wq
->
head
++
;
wq_stat
()
->
push
++
;
putwq
(
wq
);
return
0
;
}
static
struct
wqthread
*
__wq_pop
(
struct
wqueue
*
q
)
int
wq_push1
(
void
(
*
fn
)(
struct
work
*
w
,
void
*
a0
),
void
*
a0
)
{
struct
qnode
qn
;
int
i
;
struct
work
*
w
=
allocwork
();
if
(
w
==
NULL
)
return
-
1
;
w
->
rip
=
fn
;
w
->
arg0
=
a0
;
if
(
wq_push
(
w
)
<
0
)
{
freework
(
w
);
return
-
1
;
}
return
0
;
}
int
wq_push2
(
void
(
*
fn
)(
struct
work
*
,
void
*
,
void
*
),
void
*
a0
,
void
*
a1
)
{
struct
work
*
w
=
allocwork
();
if
(
w
==
NULL
)
return
-
1
;
w
->
rip
=
fn
;
w
->
arg0
=
a0
;
w
->
arg1
=
a1
;
if
(
wq_push
(
w
)
<
0
)
{
freework
(
w
);
return
-
1
;
}
return
0
;
}
ql_lock
(
&
q
->
lock
,
&
qn
);
i
=
q
->
head
;
if
((
i
-
q
->
tail
)
==
0
)
{
ql_unlock
(
&
q
->
lock
,
&
qn
);
static
struct
work
*
__wq_pop
(
int
c
)
{
// Called with cli
struct
wqueue
*
wq
=
&
queue
[
c
];
struct
work
*
w
;
int
i
;
acquire
(
&
wq
->
lock
);
i
=
wq
->
head
;
if
((
i
-
wq
->
tail
)
==
0
)
{
release
(
&
wq
->
lock
);
return
NULL
;
}
i
=
(
i
-
1
)
&
(
NSLOTS
-
1
);
q
->
head
--
;
ql_unlock
(
&
q
->
lock
,
&
qn
);
w
=
wq
->
w
[
i
];
wq
->
head
--
;
release
(
&
wq
->
lock
);
wq_stat
()
->
pop
++
;
return
q
->
thread
[
i
]
;
return
w
;
}
static
struct
w
qthread
*
__wq_steal
(
struct
wqueue
*
q
)
static
struct
w
ork
*
__wq_steal
(
int
c
)
{
struct
qnode
qn
;
// Called with cli
struct
wqueue
*
wq
=
&
queue
[
c
];
struct
work
*
w
;
int
i
;
ql_lock
(
&
q
->
lock
,
&
qn
);
i
=
q
->
tail
;
if
((
i
-
q
->
head
)
==
0
)
{
ql_unlock
(
&
q
->
lock
,
&
qn
);
acquire
(
&
wq
->
lock
);
i
=
w
q
->
tail
;
if
((
i
-
w
q
->
head
)
==
0
)
{
release
(
&
wq
->
lock
);
return
NULL
;
}
i
=
i
&
(
NSLOTS
-
1
);
q
->
tail
++
;
ql_unlock
(
&
q
->
lock
,
&
qn
);
w
=
wq
->
w
[
i
];
wq
->
tail
++
;
release
(
&
wq
->
lock
);
wq_stat
()
->
steal
++
;
return
q
->
thread
[
i
]
;
return
w
;
}
static
void
__wq_run
(
struct
w
qthread
*
th
)
__wq_run
(
struct
w
ork
*
w
)
{
void
(
*
fn
)(
uptr
arg0
,
uptr
arg1
)
=
(
void
*
)
th
->
rip
;
struct
wqframe
*
old
=
mycpu
()
->
wqframe
;
mycpu
()
->
wqframe
=
th
->
frame
;
fn
(
th
->
arg0
,
th
->
arg1
);
mycpu
()
->
wqframe
=
old
;
subfetch
(
&
th
->
frame
->
ref
,
1
);
kfree
(
th
);
void
(
*
fn
)(
struct
work
*
,
void
*
,
void
*
)
=
w
->
rip
;
fn
(
w
,
w
->
arg0
,
w
->
arg1
);
freework
(
w
);
}
// Add the (rip, arg0, arg1) work to the local work queue.
// Guarantees some core will at some point execute the work.
// The current core might execute the work immediately.
void
wq_push
(
void
*
rip
,
u64
arg0
,
u64
arg1
)
{
void
(
*
fn
)(
uptr
,
uptr
)
=
rip
;
struct
wqthread
*
th
;
th
=
(
struct
wqthread
*
)
kalloc
();
if
(
th
==
NULL
)
{
fn
(
arg0
,
arg1
);
return
;
}
th
->
rip
=
(
uptr
)
rip
;
th
->
arg0
=
arg0
;
th
->
arg1
=
arg1
;
th
->
frame
=
wq_frame
();
if
(
__wq_push
(
wq_cur
(),
th
))
{
kfree
(
th
);
fn
(
arg0
,
arg1
);
}
else
fetchadd
(
&
wq_frame
()
->
ref
,
1
);
}
// Try to execute one wqthread.
// Check local queue then steal from other queues.
int
wq_trywork
(
void
)
{
struct
w
qthread
*
th
;
struct
w
ork
*
w
;
int
i
;
pushcli
();
th
=
__wq_pop
(
wq_cur
()
);
if
(
th
!=
NULL
)
{
__wq_run
(
th
);
w
=
__wq_pop
(
mycpu
()
->
id
);
if
(
w
!=
NULL
)
{
__wq_run
(
w
);
popcli
();
return
1
;
}
// XXX(sbw) should be random
for
(
i
=
0
;
i
<
NCPU
;
i
++
)
{
if
(
i
==
mycpu
()
->
id
)
continue
;
th
=
__wq_steal
(
&
queue
[
i
]
);
if
(
th
!=
NULL
)
{
__wq_run
(
th
);
w
=
__wq_steal
(
i
);
if
(
w
!=
NULL
)
{
__wq_run
(
w
);
popcli
();
return
1
;
}
...
...
@@ -216,42 +194,6 @@ wq_trywork(void)
return
0
;
}
// Start a new work queue frame.
// We don't allow nested work queue frames.
void
wq_start
(
void
)
{
pushcli
();
if
(
myproc
()
->
wqframe
.
ref
!=
0
)
panic
(
"wq_start"
);
mycpu
()
->
wqframe
=
&
myproc
()
->
wqframe
;
}
// End of the current work queue frame.
// The core works while the reference count of the current
// work queue frame is not 0.
void
wq_end
(
void
)
{
while
(
wq_frame
()
->
ref
!=
0
)
{
struct
wqthread
*
th
;
int
i
;
while
((
th
=
__wq_pop
(
wq_cur
()))
!=
NULL
)
__wq_run
(
th
);
for
(
i
=
0
;
i
<
NCPU
;
i
++
)
{
th
=
__wq_steal
(
&
queue
[
i
]);
if
(
th
!=
NULL
)
{
__wq_run
(
th
);
break
;
}
}
}
mycpu
()
->
wqframe
=
NULL
;
popcli
();
}
void
wq_dump
(
void
)
{
...
...
@@ -262,31 +204,35 @@ wq_dump(void)
}
static
void
__test_stub
(
uptr
a0
,
uptr
a1
)
__test_stub
(
struct
work
*
w
,
void
*
a0
,
void
*
a1
)
{
//cprintf("%lu, %lu\n", a0, a1);
//long i = (long)a0;
//cprintf("%u: %lu\n", cpunum(), i);
volatile
int
*
running
=
a1
;
subfetch
(
running
,
1
);
}
void
testwq
(
void
)
{
enum
{
iters
=
10
00
};
static
volatile
int
running
=
1
;
enum
{
iters
=
10
};
static
volatile
int
running
=
iters
;
u64
e
,
s
;
int
i
;
long
i
;
pushcli
();
if
(
mycpu
()
->
id
==
0
)
{
microdelay
(
1
);
s
=
rdtsc
();
wq_start
();
for
(
i
=
0
;
i
<
iters
;
i
++
)
wq_push
(
__test_stub
,
i
,
i
);
wq_end
();
for
(
i
=
0
;
i
<
iters
;
i
++
)
{
if
(
wq_push2
(
__test_stub
,
(
void
*
)
i
,
(
void
*
)
&
running
)
<
0
)
panic
(
"testwq: oops"
);
}
e
=
rdtsc
();
cprintf
(
"testwq: %lu
\n
"
,
(
e
-
s
)
/
iters
);
while
(
running
)
nop_pause
();
wq_dump
();
running
=
0
;
}
else
{
while
(
running
)
wq_trywork
();
...
...
@@ -295,17 +241,12 @@ testwq(void)
}
void
initwqframe
(
struct
wqframe
*
wq
)
{
memset
(
wq
,
0
,
sizeof
(
*
wq
));
}
void
initwq
(
void
)
{
int
i
;
for
(
i
=
0
;
i
<
NCPU
;
i
++
)
ql_init
(
&
queue
[
i
].
lock
,
"queue lock"
);
initlock
(
&
queue
[
i
].
lock
,
"wq lock"
,
LOCKSTAT_WQ
);
}
#endif // WQENABLE
wq.h
0 → 100644
浏览文件 @
ae76c346
struct
work
{
void
*
rip
;
void
*
arg0
;
void
*
arg1
;
char
data
[];
};
编写
预览
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论