Commit 69f0e679 authored by Robert Morris

Merge branch 'scale-amd64' of git+ssh://pdos.csail.mit.edu/home/am0/6.828/xv6 into scale-amd64

Conflicts: kernel/sysfile.cc
((c-mode
(indent-tabs-mode . nil)
(c-file-style . "bsd")
(c-basic-offset . 2))
(c++-mode
(indent-tabs-mode . nil)
(c-file-style . "bsd")
(c-basic-offset . 2))
)
......@@ -3,7 +3,7 @@
Q ?= @
TOOLPREFIX ?= x86_64-jos-elf-
QEMU ?= qemu-system-x86_64
QEMUSMP ?= 4
QEMUSMP ?= 8
QEMUSRC ?= ../mtrace
MTRACE ?= $(QEMU)
HW ?= qemu
......@@ -89,7 +89,7 @@ gdb: $(KERN)
## mtrace
##
mscan.syms: $(KERN)
$(NM) -S $< > $@
$(NM) -C -S $< > $@
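# (-C makes nm demangle C++ symbol names in the symbol dump)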
mscan.kern: $(KERN)
cp $< $@
......
......@@ -3,8 +3,12 @@
You need to build and install mtrace:
git+ssh://amsterdam.csail.mit.edu/home/am6/mpdev/qemu.git -b mtrace
#define MTRACE 1 in param.h
#define MTRACE 1 in param.h (for qemu!)
If mtrace isn't cloned next to the xv6-scale repository, then set
QEMUSRC in config.mk to the directory containing mtrace-magic.h.
Set MTRACE in config.mk to the mtrace QEMU binary's path.
$ make mscan.out
or make mtrace.out to generate just the trace file and not the summary.
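For example, with mtrace checked out next to this repository (the binary
path is illustrative; it depends on how the mtrace build is configured):
  QEMUSRC = ../mtrace
  MTRACE = ../mtrace/x86_64-softmmu/qemu-system-x86_64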
* Networking with lwIP
......@@ -57,3 +61,11 @@
$ apt-get install libjemalloc-dev
$ make HW=user
$ ./o.user/utest
* abstract sharing
Obtain and configure mtrace as described above.
Disable DEBUG and enable MTRACE in param.h.
$ make QEMUSMP=8 mtrace.out
Run asharing in xv6 to generate abstract sharing traces, then summarize them:
$ mscan --abstract-scopes --unexpected-sharing
......@@ -13,6 +13,8 @@ UPROGS= \
ls \
mapbench \
maptest \
mkdir \
mktree \
sh \
nsh \
halt \
......@@ -27,10 +29,12 @@ UPROGS= \
wqsh \
cp \
perf \
asharing \
xls \
xdu
# pdu
# pls
xdu \
wqtest \
rm \
avar
ifeq ($(HAVE_LWIP),y)
UPROGS += \
......
// Tests to drive abstract sharing analysis
#include "types.h"
#include "user.h"
#include "fcntl.h"
#include "mtrace.h"
#include "pthread.h"
static int cpu;
static pthread_barrier_t bar;
enum { ncore = 8 };
void
next()
{
if (setaffinity(cpu) < 0) {
cpu = 0;
if (setaffinity(cpu) < 0)
die("sys_setaffinity(%d) failed", cpu);
}
cpu++;
}
void*
vmsharing(void* arg)
{
u64 i = (u64) arg;
volatile char *p = (char*)(0x40000UL + i * 4096);
if (map((void *) p, 4096) < 0)
die("map failed");
if (unmap((void *) p, 4096) < 0)
die("unmap failed");
return 0;
}
void*
fssharing(void* arg)
{
u64 i = (u64) arg;
// Note that we keep these files open; otherwise all of these
// operations will share the abstract FD object and we won't get any
// results.
char filename[32];
snprintf(filename, sizeof(filename), "f%d", i);
open(filename, O_CREATE|O_RDWR);
pthread_barrier_wait(&bar);
for (u64 j = 0; j < ncore; j++) {
snprintf(filename, sizeof(filename), "f%d", j);
open(filename, O_RDWR);
}
return 0;
}
int
main(int ac, char **av)
{
void* (*op)(void*) = 0;
if (ac == 2 && strcmp(av[1], "vm") == 0)
op = vmsharing;
else if (ac == 2 && strcmp(av[1], "fs") == 0)
op = fssharing;
else
fprintf(1, "usage: %s vm|fs\n", av[0]);
if (op) {
mtenable_type(mtrace_record_ascope, "xv6-asharing");
pthread_barrier_init(&bar, 0, ncore);
for (u64 i = 0; i < ncore; i++) {
next();
pthread_t tid;
pthread_create(&tid, 0, op, (void*) i);
}
for (u64 i = 0; i < ncore; i++)
wait();
mtdisable("xv6-asharing");
}
}
// Tests to drive abstract sharing analysis
#include "types.h"
#include "user.h"
#include "mtrace.h"
int
main(int ac, char **av)
{
if (ac == 2 && strcmp(av[1], "on") == 0)
mtenable_type(mtrace_record_ascope, "xv6-asharing");
else if (ac == 2 && strcmp(av[1], "off") == 0)
mtdisable("xv6-asharing");
else
fprintf(1, "usage: %s on|off\n", av[0]);
}
#include "types.h"
#include "stat.h"
#include "user.h"
#include "mtrace.h"
#define NCHILD 2
#define NDEPTH 5
......@@ -12,7 +11,6 @@ forktree(void)
int depth = 0;
fprintf(1, "%d: fork tree\n", getpid());
mtenable("xv6-forktree");
next_level:
//printf(1, "pid %d, depth %d\n", getpid(), depth);
......@@ -47,9 +45,6 @@ forktree(void)
if (depth > 0)
exit();
mtops(0);
mtdisable("xv6-forktree");
fprintf(1, "%d: fork tree OK\n", getpid());
// halt();
}
......
#include "types.h"
#include "stat.h"
#include "user.h"
int
main(int argc, char *argv[])
{
int i;
if (argc < 2)
die("ussage: mkdir files...");
for(i = 1; i < argc; i++) {
if (mkdir(argv[i]) < 0)
die("mkdir: %s failed to create", argv[i]);
}
}
#include "types.h"
#include "stat.h"
#include "user.h"
#include "lib.h"
#include "fcntl.h"
#include "wq.hh"
static int branch;
static void
dolevel(int fd, int depth)
{
if (depth > 0) {
int it = 0;
wq_for<int>(it,
[](int &it)->bool { return it < branch; },
[&fd, &depth](int i)->void
{
char name[] = "a";
*name += i;
if (mkdirat(fd, name) < 0)
die("mkdirat");
int nfd = openat(fd, name, O_RDONLY);
if (nfd < 0)
die("openat: %s at %u", name, depth);
dolevel(nfd, depth-1);
});
}
close(fd);
}
int
main(int ac, char **av)
{
if (ac < 4)
die("usage: %s dir branch depth", av[0]);
initwq();
const char *dir = av[1];
branch = atoi(av[2]);
int depth = atoi(av[3]);
if (mkdir(dir))
die("mkdir");
int fd = open(dir, O_RDONLY);
if (fd < 0)
die("open");
dolevel(fd, depth);
}
#include "types.h"
#include "stat.h"
#include "user.h"
int
main(int argc, char *argv[])
{
int i;
if(argc < 2)
die("Usage: rm files...");
for(i = 1; i < argc; i++){
if(unlink(argv[i]) < 0)
die("rm: %s failed to delete\n", argv[i]);
}
exit();
}
#include "types.h"
#include "user.h"
#include "lib.h"
#include "amd64.h"
#include "wq.hh"
#define NEW_DELETE_OPS(classname) \
static void* operator new(unsigned long nbytes) { \
assert(nbytes == sizeof(classname)); \
return malloc(sizeof(classname)); \
} \
\
static void operator delete(void *p) { \
free(p); \
}
struct testwork : public work {
testwork(forframe *b) : barrier_(b) {}
virtual void run() {
barrier_->dec();
delete this;
}
NEW_DELETE_OPS(testwork);
struct forframe *barrier_;
};
static void
test0(void)
{
enum { pushes = 100 };
struct forframe wqbarrier(pushes);
printf("test0...\n");
for (int i = 0; i < pushes; i++) {
testwork *w = new testwork(&wqbarrier);
wq_push(w);
}
while (!wqbarrier.zero())
nop_pause();
printf("test0 done\n");
}
struct forkwork : public work {
forkwork(forframe *b) : barrier_(b) {}
virtual void run() {
int pid;
pid = fork(0);
if (pid < 0)
die("forkwork::run: fork");
else if (pid == 0)
exit();
wait();
barrier_->dec();
delete this;
}
NEW_DELETE_OPS(forkwork);
struct forframe *barrier_;
};
static void
testfork(void)
{
enum { forks = 100 };
struct forframe wqbarrier(forks);
printf("testfork...\n");
for (int i = 0; i < forks; i++) {
forkwork *w = new forkwork(&wqbarrier);
wq_push(w);
}
while (!wqbarrier.zero())
nop_pause();
printf("testfork done\n");
}
struct execwork : public work {
execwork(forframe *b) : barrier_(b) {}
virtual void run() {
int pid;
pid = fork(0);
if (pid < 0)
die("execwork::run: fork");
else if (pid == 0) {
static const char *args[] = { "echo", 0 };
exec(args[0], args);
die("execwork: exec failed");
}
wait();
barrier_->dec();
delete this;
}
static void test(void) {
enum { execs = 100 };
struct forframe wqbarrier(execs);
printf("testexec...\n");
for (int i = 0; i < execs; i++) {
execwork *w = new execwork(&wqbarrier);
wq_push(w);
}
while (!wqbarrier.zero())
nop_pause();
printf("testexec done\n");
}
NEW_DELETE_OPS(execwork);
struct forframe *barrier_;
};
int
main(int ac, char **av)
{
initwq();
test0();
testfork();
execwork::test();
return 0;
}
......@@ -51,15 +51,12 @@ du(int fd)
[](dirit &i)->bool { return !i.end(); },
[&size, &fd](const char *name)->void
{
if (!strcmp(name, ".") || !strcmp(name, "..")) {
free((void*)name);
if (!strcmp(name, ".") || !strcmp(name, ".."))
return;
}
int nfd = openat(fd, name, 0);
if (nfd >= 0)
size += du(nfd); // should go into work queue
free((void*)name);
});
} else {
close(fd);
......@@ -79,6 +76,5 @@ main(int ac, char **av)
perf_stop();
printf("%ld\n", s);
wq_dump();
exitwq();
return 0;
}
......@@ -67,14 +67,12 @@ ls(const char *path)
struct stat st;
if (xfstatat(fd, name, &st) < 0){
printf("ls: cannot stat %s\n", name);
free((void*)name);
return;
}
if (!silent)
printf("%u %10lu %10lu %s\n",
ST_TYPE(st), ST_INO(st), ST_SIZE(st), name);
free((void*)name);
});
} else {
close(fd);
......@@ -99,6 +97,5 @@ main(int argc, char *argv[])
perf_stop();
wq_dump();
exitwq();
return 0;
}
......@@ -133,7 +133,7 @@ extern void *__dso_handle;
#define NEW_DELETE_OPS(classname) \
static void* operator new(unsigned long nbytes) { \
assert(nbytes == sizeof(classname)); \
return kmalloc(sizeof(classname)); \
return kmalloc(sizeof(classname), #classname); \
} \
\
static void* operator new(unsigned long nbytes, classname *buf) { \
......
......@@ -13,14 +13,6 @@ public:
return *this;
}
const char * copy_value() {
char *buf = (char*)malloc(256);
return name(buf, 256);
}
bool end() const { return end_; }
private:
char *name(char *buf, size_t n) const {
n = MIN(DIRSIZ+1, n);
memmove(buf, de_.name, n-1);
......@@ -28,6 +20,9 @@ private:
return buf;
}
bool end() const { return end_; }
private:
void refill(void) {
int r;
......@@ -45,3 +40,16 @@ private:
bool end_;
struct dirent de_;
};
static inline const char*
copy_value(dirit &it)
{
char *buf = (char*)malloc(256);
return it.name(buf, 256);
}
static inline void
free_value(dirit &it, const char *name)
{
free((void*)name);
}
......@@ -40,10 +40,49 @@ struct proghdr {
Elf64_Xword align; // Segment alignment, file & memory
};
struct elfnote {
Elf64_Word namesz; // Name size
Elf64_Word descsz; // Content size
Elf64_Word type; // Content type
};
// Values for Proghdr type
#define ELF_PROG_LOAD 1
#define ELF_PROG_NOTE 4
// Flag bits for Proghdr flags
#define ELF_PROG_FLAG_EXEC 1
#define ELF_PROG_FLAG_WRITE 2
#define ELF_PROG_FLAG_READ 4
// All known .note types
#define ELF_NOTE_XV6_ADDR 1
// xv6-specific address note
struct xv6_addrdesc {
Elf64_Word id;
Elf64_Addr vaddr;
};
struct xv6_addrnote {
struct elfnote elfnote;
// name is 0 bytes
struct xv6_addrdesc desc;
};
// All xv6-specific IDs for notes about addresses
#define XV6_ADDR_ID_WQ 1
#define DEFINE_XV6_ADDRNOTE(xname, xid, xvaddr) \
const struct xv6_addrnote xname PROG_NOTE_ATTRIBUTE = { \
elfnote: { \
namesz: 0, \
descsz: sizeof(((xv6_addrnote *)nullptr)->desc), \
type: ELF_NOTE_XV6_ADDR \
}, \
desc: { \
id: (xid), \
vaddr: (Elf64_Addr)(xvaddr) } \
}
#define PROG_NOTE_ATTRIBUTE __attribute__ ((section(".note"), used))
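// For example, the user-level work queue code (later in this diff) marks
// its worker entry point with:
//   DEFINE_XV6_ADDRNOTE(xnote, XV6_ADDR_ID_WQ, &initworker);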
......@@ -5,3 +5,7 @@
#define O_WAIT 0x400 // open waits for create, read for write
#define AT_FDCWD -100
#define FORK_SHARE_VMAP (1<<0)
#define FORK_SHARE_FD (1<<1)
......@@ -54,6 +54,7 @@ public:
}
}
release(&lock_);
cprintf("filetable::allocfd: failed\n");
return -1;
}
......
......@@ -10,7 +10,7 @@ extern pgmap kpml4;
void freevm(pgmap *pml4);
pgmap* setupkvm(void);
int setupkshared(pgmap *pml4, char *kshared);
int mapkva(pgmap *pml4, char* kva, uptr uva, size_t size);
std::atomic<pme_t>* walkpgdir(pgmap *pml4, u64, int);
void tlbflush(void);
......
......@@ -63,6 +63,7 @@ enum {
slab_perf,
slab_kshared,
slab_wq,
slab_userwq,
slab_type_max
};
......
......@@ -29,7 +29,7 @@ long sys_fstat(int, struct stat*);
long sys_getpid(void);
long sys_kill(int);
long sys_link(const char*, const char*);
long sys_mkdir(const char*);
long sys_mkdirat(int, const char*);
long sys_mknod(const char*, int, int);
long sys_openat(int, const char*, int);
long sys_pipe(int*);
......@@ -51,6 +51,8 @@ long sys_pread(int fd, void *ubuf, size_t count, off_t offset);
long sys_async(int, size_t, off_t, u32, u32);
long sys_script(void *addr, u64 len, u64 chunk);
long sys_setfs(u64 base);
long sys_wqwait(void);
long sys_setaffinity(int cpu);
extern long (*syscalls[])(u64, u64, u64, u64, u64);
// other exported/imported functions
......
......@@ -5,13 +5,9 @@ extern "C" {
}
#include "atomic.hh"
#include "memlayout.h"
#include <stdarg.h>
#define KBASE 0xFFFFFF0000000000ull
#define KCODE 0xFFFFFFFFC0000000ull
#define KSHARED 0xFFFFF00000000000ull
#define USERTOP 0x0000800000000000ull
#define KCSEG (2<<3) /* kernel code segment */
#define KDSEG (3<<3) /* kernel data segment */
......@@ -72,6 +68,7 @@ void vcprintf(const char *fmt, va_list ap);
void panic(const char*, ...)
__noret__ __attribute__((format(printf, 1, 2)));
void kerneltrap(struct trapframe *tf) __noret__;
void vsnprintf(char *buf, u32 n, const char *fmt, va_list ap);
void snprintf(char *buf, u32 n, const char *fmt, ...);
void printtrace(u64 rbp);
......@@ -94,6 +91,7 @@ struct inode* ialloc(u32, short);
struct inode* namei(inode *cwd, const char*);
void iput(struct inode*);
struct inode* iget(u32 dev, u32 inum);
struct inode* igetnoref(u32 dev, u32 inum);
void ilock(struct inode*, int writer);
void iunlockput(struct inode*);
void iupdate(struct inode*);
......@@ -125,13 +123,13 @@ void idlezombie(struct proc*);
void ioapicenable(int irq, int cpu);
// kalloc.c
char* kalloc(void);
char* kalloc(const char *name);
void kfree(void*);
void* ksalloc(int slabtype);
void ksfree(int slabtype, void*);
void* kmalloc(u64 nbytes);
void* kmalloc(u64 nbytes, const char *name);
void kmfree(void*, u64 nbytes);
int kmalign(void **p, int align, u64 size);
int kmalign(void **p, int align, u64 size, const char *name);
void kmalignfree(void *, int align, u64 size);
void verifyfree(char *ptr, u64 nbytes);
void kminit(void);
......@@ -170,13 +168,11 @@ int piperead(struct pipe*, char*, int);
int pipewrite(struct pipe*, char*, int);
// proc.c
struct proc* allocproc(void);
struct proc* copyproc(struct proc*);
void finishproc(struct proc*);
void exit(void);
int fork(int);
int growproc(int);
int kill(int);
void pinit(void);
void procdumpall(void);
void scheduler(void) __noret__;
......
#pragma once
#include "mtrace.h"
#if MTRACE
// Tell mtrace about switching threads
struct kstack_tag {
......@@ -61,6 +64,51 @@ static inline void mtresume(struct proc *p)
#define mtrec() mtrace_call_set(1, ~0ull)
#define mtign() mtrace_call_set(0, ~0ull)
class mt_ascope
{
char name[64];
public:
explicit mt_ascope(const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
vsnprintf(name, sizeof(name) - 1, fmt, ap);
va_end(ap);
mtrace_ascope_register(0, name);
}
~mt_ascope()
{
mtrace_ascope_register(1, name);
}
};
static inline void mtreadavar(const char *fmt, ...)
{
char name[64];
va_list ap;
va_start(ap, fmt);
vsnprintf(name, sizeof(name), fmt, ap);
va_end(ap);
mtrace_avar_register(0, name);
}
static inline void mtwriteavar(const char *fmt, ...)
{
char name[64];
va_list ap;
va_start(ap, fmt);
vsnprintf(name, sizeof(name), fmt, ap);
va_end(ap);
mtrace_avar_register(1, name);
}
#else
#define mtstart(ip, p) do { } while (0)
#define mtstop(p) do { } while (0)
......@@ -70,4 +118,13 @@ static inline void mtresume(struct proc *p)
#define mtign(cpu) do { } while (0)
#define mtrec(cpu) do { } while (0)
#define mtign(cpu) do { } while (0)
class mt_ascope
{
public:
explicit mt_ascope(const char *fmt, ...) {}
};
#define mtreadavar(fmt, ...) do { } while (0)
#define mtwriteavar(fmt, ...) do { } while (0)
#endif
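// A sketch (not code from this diff) of how these pieces combine in a
// system call: the mt_ascope constructor opens an abstract scope for the
// duration of the call, and mtreadavar/mtwriteavar declare which abstract
// variables that scope reads and writes, as sys_openat does below:
//
//   mt_ascope ascope("sys_openat(%d,%s,%d)", dirfd, path, omode);
//   mtreadavar("inode:%x.%x", cwd->dev, cwd->inum);
//   ...
//   mtwriteavar("fd:%x.%x", myproc()->pid, fd);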
......@@ -52,6 +52,7 @@ struct klockstat;
#define LOCKSTAT_KALLOC 1
#define LOCKSTAT_KMALLOC 1
#define LOCKSTAT_NET 1
#define LOCKSTAT_NS 1
#define LOCKSTAT_PIPE 1
#define LOCKSTAT_PROC 1
#define LOCKSTAT_SCHED 1
......
#define KBASE 0xFFFFFF0000000000ull
#define KCODE 0xFFFFFFFFC0000000ull
#define KSHARED 0xFFFFF00000000000ull
#define USERWQ 0xFFFFF00100000000ull
#define USERTOP 0x0000800000000000ull
#define UWQSTACK 0x0000700000000000ull
......@@ -33,8 +33,9 @@ char* strncpy(char *s, const char *t, size_t n);
mtrace_lock_register(RET_IP(), ptr, lockname(ptr), mtrace_lockop_release, 0)
// Enable/disable all mtrace logging
#define mtenable(name) mtrace_enable_set(1, name)
#define mtdisable(name) mtrace_enable_set(0, name)
#define mtenable(name) mtrace_enable_set(mtrace_record_movement, name)
#define mtenable_type(type, name) mtrace_enable_set(type, name)
#define mtdisable(name) mtrace_enable_set(mtrace_record_disable, name)
// Log the number of operations
static inline void mtops(u64 n)
......@@ -54,6 +55,7 @@ static inline void mtops(u64 n)
#define mtrec(cpu) do { } while (0)
#define mtign(cpu) do { } while (0)
#define mtenable(name) do { } while (0)
#define mtenable_type(type, name) do { } while (0)
#define mtdisable(name) do { } while (0)
#define mtops(n) do { } while (0)
#endif
#pragma once
#include "gc.hh"
#include "percpu.hh"
// name spaces
// XXX maybe use open hash table, no chain, better cache locality
......@@ -8,18 +9,26 @@
#if SPINLOCK_DEBUG
#define NHASH 10
#else
#define NHASH 30
#define NHASH 257
#endif
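// (257, unlike the old 30, is prime, presumably so hashed keys spread
// more evenly across the buckets.)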
template<class K, class V>
class xelem : public rcu_freed {
public:
V val;
std::atomic<int> next_lock;
std::atomic<xelem<K, V>*> volatile next;
K key;
xelem(const K &k, const V &v) : rcu_freed("xelem"), val(v), next_lock(0), next(0), key(k) {}
std::atomic<int> next_lock;
std::atomic<xelem<K, V>*> next;
int percore_c;
std::atomic<xelem<K, V>*> percore_next;
std::atomic<xelem<K, V>*>* percore_pprev;
xelem(const K &k, const V &v)
: rcu_freed("xelem"), val(v), key(k),
next_lock(0), next(0),
percore_next(0), percore_pprev(0) {}
virtual void do_gc() {
delete this;
}
......@@ -27,10 +36,11 @@ class xelem : public rcu_freed {
NEW_DELETE_OPS(xelem)
};
// XXX maybe not cache align, because it takes too much space
template<class K, class V>
struct xbucket {
std::atomic<xelem<K, V>*> volatile chain;
} __attribute__((aligned (CACHELINE)));
} ; // __attribute__((aligned (CACHELINE)));
template<class K, class V, u64 (*HF)(const K&)>
class xns : public rcu_freed {
......@@ -38,6 +48,8 @@ class xns : public rcu_freed {
bool allowdup;
std::atomic<u64> nextkey;
xbucket<K, V> table[NHASH];
std::atomic<xelem<K, V>*> percore[NCPU];
spinlock percore_lock[NCPU];
public:
xns(bool dup) : rcu_freed("xns") {
......@@ -45,6 +57,10 @@ class xns : public rcu_freed {
nextkey = 1;
for (int i = 0; i < NHASH; i++)
table[i].chain = 0;
for (int i = 0; i < NCPU; i++) {
percore[i] = nullptr;
initlock(&percore_lock[i], "xns_lock", LOCKSTAT_NS);
}
}
~xns() {
......@@ -85,8 +101,18 @@ class xns : public rcu_freed {
}
e->next = root.load();
if (cmpxch(&table[i].chain, e->next.load(), e))
if (cmpxch(&table[i].chain, e->next.load(), e)) {
int c = mycpuid();
acquire(&percore_lock[c]);
e->percore_c = c;
e->percore_next = percore[c].load();
if (percore[c])
percore[c].load()->percore_pprev = &e->percore_next;
e->percore_pprev = &percore[c];
percore[c] = e;
release(&percore_lock[c]);
return 0;
}
}
}
......@@ -133,6 +159,13 @@ class xns : public rcu_freed {
break;
}
int c = e->percore_c;
acquire(&percore_lock[c]);
*e->percore_pprev = e->percore_next.load();
if (e->percore_next)
e->percore_next.load()->percore_pprev = e->percore_pprev;
release(&percore_lock[c]);
*pelock = 0;
gc_delayed(e);
return true;
......@@ -147,12 +180,13 @@ class xns : public rcu_freed {
template<class CB>
void enumerate(CB cb) {
scoped_gc_epoch gc;
for (int i = 0; i < NHASH; i++) {
auto e = table[i].chain.load();
int cpuoffset = mycpuid();
for (int i = 0; i < NCPU; i++) {
auto e = percore[(i + cpuoffset) % NCPU].load();
while (e) {
if (cb(e->key, e->val))
return;
e = e->next;
e = e->percore_next;
}
}
}
......
......@@ -7,6 +7,9 @@
#include "file.hh"
#include "filetable.hh"
class uwq;
class uwq_worker;
// Saved registers for kernel context switches.
// (also implicitly defined in swtch.S)
struct context {
......@@ -38,11 +41,19 @@ struct mtrace_stacks {
};
#endif
enum procstate { EMBRYO, SLEEPING, RUNNABLE, RUNNING, ZOMBIE };
typedef enum procstate {
EMBRYO,
SLEEPING,
RUNNABLE,
RUNNING,
ZOMBIE
} procstate_t;
// Per-process state
struct proc : public rcu_freed {
struct vmap *vmap; // va -> vma
uwq* uwq;
uwq_worker* worker;
char *kstack; // Bottom of kernel stack for this process
volatile int pid; // Process ID
struct proc *parent; // Parent process
......@@ -61,7 +72,6 @@ struct proc : public rcu_freed {
struct condvar cv;
std::atomic<u64> epoch; // low 8 bits are depth count
char lockname[16];
int on_runq;
int cpu_pin;
#if MTRACE
struct mtrace_stacks mtrace_stacks;
......@@ -76,15 +86,21 @@ struct proc : public rcu_freed {
LIST_ENTRY(proc) cv_sleep; // Linked list of processes sleeping on a cv
u64 user_fs_;
proc(int npid);
~proc(void);
static proc* alloc();
void set_state(procstate_t s);
procstate_t get_state(void) const { return state_; }
int set_cpu_pin(int cpu);
static int kill(int pid);
int kill();
virtual void do_gc(void) { delete this; }
NEW_DELETE_OPS(proc)
void set_state(enum procstate s);
enum procstate get_state(void) const { return state_; }
private:
enum procstate state_; // Process state
proc(int npid);
~proc(void);
proc& operator=(const proc&);
proc(const proc& x);
NEW_DELETE_OPS(proc);
procstate_t state_; // Process state
};
......@@ -4,6 +4,7 @@
* A page-table-like structure for mapping fixed-length keys to void* ptrs.
*/
#include "gc.hh"
#include "markptr.hh"
enum { bits_per_level = 9 };
......@@ -57,6 +58,10 @@ struct radix {
}
radix_elem* search(u64 key);
radix_range search_lock(u64 start, u64 size);
// k is shifted value.
u64 skip_empty(u64 k) const;
NEW_DELETE_OPS(radix)
};
......@@ -64,8 +69,8 @@ struct radix_iterator {
const radix* r_;
u64 k_;
radix_iterator(const radix* r, u64 k) : r_(r), k_(k) {}
radix_iterator &operator++() { k_++; return *this; }
radix_iterator(const radix* r, u64 k) : r_(r), k_(r->skip_empty(k)) {}
radix_iterator &operator++() { k_++; k_ = r_->skip_empty(k_); return *this; }
radix_elem* operator*();
bool operator==(const radix_iterator &other) {
return r_ == other.r_ && k_ == other.k_; }
......
......@@ -13,7 +13,7 @@
#define SYS_unlink 12
#define SYS_fstat 13
#define SYS_link 14
#define SYS_mkdir 15
#define SYS_mkdirat 15
#define SYS_chdir 16
#define SYS_dup 17
#define SYS_getpid 18
......@@ -31,4 +31,6 @@
#define SYS_async 30
#define SYS_script 31
#define SYS_setfs 32
#define SYS_ncount 33 /* total number of system calls */
#define SYS_wqwait 33
#define SYS_setaffinity 34
#define SYS_ncount 35 /* total number of system calls */
......@@ -18,6 +18,7 @@ int unlink(const char*);
int fstat(int fd, struct stat*);
int link(const char*, const char*);
int mkdir(const char*);
int mkdirat(int dirfd, const char *pathname);
int chdir(const char*);
int dup(int);
int getpid(void);
......@@ -31,6 +32,7 @@ ssize_t pread(int, void*, size_t, off_t);
int async(int, size_t, off_t, u32, u32);
int script(void *addr, u64 len, u64 chunk);
int setfs(u64 base);
int setaffinity(int cpu);
// ulib.c
int stat(char*, struct stat*);
......@@ -52,7 +54,7 @@ void free(void*);
int atoi(const char*);
// uthread.S
int forkt(void *sp, void *pc, void *arg);
int forkt(void *sp, void *pc, void *arg, int forkflags);
void forkt_setup(u64 pid);
// printf.c
......
#pragma once
struct padded_length {
volatile u64 v_ __mpalign__;
__padout__;
};
#if defined (XV6_KERNEL)
bool uwq_trywork(void);
#define NWORKERS (NCPU-1)
struct uwq;
struct uwq_worker {
uwq_worker(uwq*, proc*);
long wait();
void exit();
uwq* uwq_;
proc *proc_;
bool running_;
struct spinlock lock_;
struct condvar cv_;
NEW_DELETE_OPS(uwq_worker);
};
struct uwq : public referenced, public rcu_freed {
friend struct uwq_worker;
static uwq* alloc(vmap* vmap, filetable *ftable);
bool haswork() const;
bool tryworker();
void setuentry(uptr uentry);
virtual void do_gc(void) { delete this; }
protected:
virtual void onzero() const;
private:
uwq(vmap* vmap, filetable* ftable, padded_length *len);
~uwq();
uwq& operator=(const uwq&);
uwq(const uwq& x);
proc* allocworker();
void finish();
NEW_DELETE_OPS(uwq);
struct spinlock lock_;
vmap* vmap_;
filetable* ftable_;
padded_length* len_;
uptr uentry_;
uptr ustack_;
std::atomic<u64> uref_;
uwq_worker* worker_[NWORKERS];
};
#endif
......@@ -5,10 +5,13 @@
#include "radix.hh"
#include "cpputil.hh"
#include "hwvm.hh"
#include "uwq.hh"
#define VM_CRANGE 1
#define VM_RADIX 0
struct padded_length;
using std::atomic;
// A memory object (physical pages or inode).
......@@ -17,7 +20,6 @@ enum vmntype { EAGER, ONDEMAND };
struct vmnode {
const u64 npages;
atomic<char*> page[128];
atomic<u64> ref;
const enum vmntype type;
struct inode *const ip;
const u64 offset;
......@@ -27,11 +29,15 @@ struct vmnode {
inode *i = 0, u64 off = 0, u64 s = 0);
~vmnode();
void decref();
void incref();
u64 ref();
int allocpg();
vmnode* copy();
int demand_load();
NEW_DELETE_OPS(vmnode)
NEW_DELETE_OPS(vmnode);
private:
atomic<u64> ref_;
};
// A mapping of a chunk of an address space to
......@@ -78,6 +84,8 @@ struct vmap {
bool replace_vma(vma *a, vma *b);
void decref();
void incref();
vmap* copy(int share);
vma* lookup(uptr start, uptr len);
int insert(vmnode *n, uptr va_start, int dotlb);
......
......@@ -35,7 +35,7 @@ struct cwork : public work {
#define xmalloc(n) malloc(n)
#define xfree(p, sz) free(p)
#elif defined(XV6_KERNEL)
#define xmalloc(n) kmalloc(n)
#define xmalloc(n) kmalloc(n, "xmalloc")
#define xfree(p, sz) kmfree(p, sz)
#else
#define xmalloc(n) malloc(n)
......
......@@ -13,7 +13,7 @@ struct forwork : public work {
: it_(it), cond_(cond), body_(body), frame_(frame) {}
virtual void run() {
decltype(it_.copy_value()) v = it_.copy_value();
decltype(copy_value(it_)) v = copy_value(it_);
++it_;
if (cond_(it_)) {
forwork<IT, BODY> *w = new forwork<IT, BODY>(it_, cond_, body_, frame_);
......@@ -21,6 +21,7 @@ struct forwork : public work {
wq_push(w);
}
body_(v);
free_value(it_, v);
frame_.dec();
delete this;
}
......@@ -48,15 +49,48 @@ wq_for(IT &init, bool (*cond)(IT &it), BODY body)
// XXX(sbw) should be able to coarsen loop
decltype(init.copy_value()) v = init.copy_value();
if (!cond(init))
return;
decltype(copy_value(init)) v = copy_value(init);
++init;
if (cond(init)) {
forwork<IT, BODY> *w = new forwork<IT, BODY>(init, cond, body, frame);
frame.inc();
wq_push(w);
}
body(v);
free_value(init, v);
while (!frame.zero())
wq_trywork();
}
// For debugging
// Same API as wq_for but serially executes body
template <typename IT, typename BODY>
static inline void
wq_for_serial(IT &init, bool (*cond)(IT &it), BODY body)
{
for (; cond(init); ++init) {
decltype(copy_value(init)) v = copy_value(init);
body(v);
free_value(init, v);
}
}
// Default copy_value
template <typename T>
static inline T
copy_value(T &it)
{
return it;
}
// Default free_value
template <typename T>
static inline void
free_value(T &it, T &v)
{
}
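// Sketch of the resulting API, mirroring the xdu/xls callers earlier in
// this diff: iterate over a directory in parallel, with copy_value and
// free_value managing each name's lifetime across work items:
//
//   dirit di(fd);
//   wq_for<dirit>(di,
//                 [](dirit &i)->bool { return !i.end(); },
//                 [&](const char *name)->void { /* use name */ });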
......@@ -43,9 +43,4 @@ wqarch_init(void)
{
}
static inline void
wqarch_exit(void)
{
}
#define xprintf cprintf
......@@ -4,10 +4,16 @@
#include "user.h"
#include "wq.hh"
#include "pthread.h"
#include "memlayout.h"
#include "uwq.hh"
#include "atomic.hh"
#include "lib.h"
#include "elf.hh"
typedef struct uspinlock wqlock_t;
static pthread_key_t idkey;
static std::atomic<int> nextid;
static volatile int exiting;
int
......@@ -22,6 +28,18 @@ allocwq(unsigned long nbytes)
return malloc(nbytes);
}
static inline padded_length*
allocklen(unsigned long nbytes)
{
static bool alloced;
if (alloced)
die("allocklen: allocing more than once");
if (nbytes > USERWQSIZE)
die("allocklen: too large");
alloced = true;
return (padded_length*)USERWQ;
}
static inline void
wqlock_acquire(wqlock_t *lock)
{
......@@ -46,48 +64,32 @@ wqlock_init(wqlock_t *lock)
initlock(lock);
}
static void
setaffinity(int c)
{
// XXX(sbw)
}
extern "C" long wqwait(void);
static void*
workerth(void *x)
static void __attribute__((used))
initworker(void)
{
u64 c = (u64)x;
setaffinity(c);
pthread_setspecific(idkey, (void*)c);
while (!exiting)
wq_trywork();
return 0;
int id;
forkt_setup(0);
id = nextid++;
if (id >= NCPU)
die("initworker: to man IDs");
pthread_setspecific(idkey, (void*)(u64)id);
while (1) {
if (!wq_trywork())
assert(wqwait() == 0);
}
}
DEFINE_XV6_ADDRNOTE(xnote, XV6_ADDR_ID_WQ, &initworker);
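// exec() recognizes this .note program header (see donotes() in the exec
// changes below) and records &initworker, via uwq::setuentry(), as the
// entry point for user-level worker threads.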
static inline void
wqarch_init(void)
{
pthread_t th;
int r;
if (pthread_key_create(&idkey, 0))
die("wqarch_init: pthread_key_create");
pthread_setspecific(idkey, 0);
setaffinity(0);
for (int i = 1; i < NCPU; i++) {
r = pthread_create(&th, 0, workerth, (void*)(u64)i);
if (r < 0)
die("wqarch_init: pthread_create");
}
}
static inline void
wqarch_exit(void)
{
exiting = 1;
int id = nextid++;
pthread_setspecific(idkey, (void*)(u64)id);
}
#define xprintf printf
......
......@@ -42,6 +42,8 @@ OBJS = \
sysfile.o \
sysproc.o \
uart.o \
user.o \
uwq.o \
vm.o \
trap.o \
trapasm.o \
......
......@@ -46,11 +46,11 @@ static struct buf*
bget(u32 dev, u64 sector, int *writer)
{
struct buf *b;
scoped_gc_epoch e;
loop:
// Try for cached block.
// XXX ignore dev
gc_begin_epoch();
b = bufns->lookup(mkpair(dev, sector));
if (b) {
if (b->dev != dev || b->sector != sector)
......@@ -60,7 +60,6 @@ bget(u32 dev, u64 sector, int *writer)
if (b->flags & B_BUSY) {
cv_sleep(&b->cv, &b->lock);
release(&b->lock);
gc_end_epoch();
goto loop;
}
......@@ -72,45 +71,15 @@ bget(u32 dev, u64 sector, int *writer)
// rcu_end_read() happens in brelse
return b;
}
gc_end_epoch();
// Allocate fresh block.
struct buf *victim = 0;
bufns->enumerate([&victim](const pair<u32, u64>&, buf *eb)->bool {
acquire(&eb->lock);
if ((eb->flags & (B_BUSY | B_DIRTY | B_VALID)) == 0) {
victim = eb;
return true;
}
release(&eb->lock);
return false;
});
if (victim == 0)
bufns->enumerate([&victim](const pair<u32, u64>&, buf *eb)->bool {
acquire(&eb->lock);
if ((eb->flags & (B_BUSY | B_DIRTY)) == 0) {
victim = eb;
return true;
}
release(&eb->lock);
return false;
});
if (victim == 0)
panic("bget all busy");
victim->flags |= B_BUSY;
bufns->remove(mkpair(victim->dev, victim->sector), &victim);
release(&victim->lock);
gc_delayed(victim);
b = new buf(dev, sector);
b->flags = B_BUSY;
*writer = 1;
gc_begin_epoch();
if (bufns->insert(mkpair(b->dev, b->sector), b) < 0) {
gc_delayed(b);
goto loop;
}
// rcu_end_read() happens in brelse
return b;
}
......@@ -152,8 +121,6 @@ brelse(struct buf *b, int writer)
b->flags &= ~B_BUSY;
cv_wakeup(&b->cv);
}
// rcu_begin_read() happens in bread
gc_end_epoch();
}
void
......
......@@ -39,7 +39,7 @@ long (*syscalls[])(u64, u64, u64, u64, u64) = {
SYSCALL(getpid),
SYSCALL(kill),
SYSCALL(link),
SYSCALL(mkdir),
SYSCALL(mkdirat),
SYSCALL(mknod),
SYSCALL(openat),
SYSCALL(pipe),
......@@ -61,5 +61,6 @@ long (*syscalls[])(u64, u64, u64, u64, u64) = {
SYSCALL(async),
SYSCALL(script),
SYSCALL(setfs),
SYSCALL(wqwait),
SYSCALL(setaffinity),
};
......@@ -7,7 +7,7 @@
void *
operator new[](unsigned long nbytes)
{
u64 *x = (u64*) kmalloc(nbytes + sizeof(u64));
u64 *x = (u64*) kmalloc(nbytes + sizeof(u64), "array");
*x = nbytes + sizeof(u64);
return x+1;
}
......
......@@ -15,7 +15,6 @@
#include "wq.hh"
#include "cilk.hh"
#define USTACKPAGES 2
#define BRK (USERTOP >> 1)
struct eargs {
......@@ -26,6 +25,36 @@ struct eargs {
char **argv;
};
static int
donotes(struct inode *ip, uwq *uwq, u64 off)
{
struct proghdr ph;
struct elfnote note;
if (readi(ip, (char*)&ph, off, sizeof(ph)) != sizeof(ph))
return -1;
if (readi(ip, (char*)&note, ph.offset, sizeof(note)) != sizeof(note))
return -1;
if (note.type == ELF_NOTE_XV6_ADDR) {
struct xv6_addrdesc desc;
if (note.descsz != sizeof(desc))
return -1;
if (readi(ip, (char*)&desc,
ph.offset+__offsetof(struct xv6_addrnote, desc),
sizeof(desc)) != sizeof(desc))
return -1;
if (desc.id == XV6_ADDR_ID_WQ) {
uwq->setuentry(desc.vaddr);
return 0;
}
}
return -1;
}
static void
dosegment(struct eargs *args, u64 off)
{
......@@ -149,15 +178,19 @@ exec(const char *path, char **argv)
{
struct inode *ip = nullptr;
struct vmap *vmp = nullptr;
uwq* uwq = nullptr;
struct elfhdr elf;
struct proghdr ph;
u64 off;
int i;
struct vmap *oldvmap;
if((ip = namei(myproc()->cwd, path)) == 0)
return -1;
if(myproc()->worker != nullptr)
return -1;
gc_begin_epoch();
// Check ELF header
......@@ -171,6 +204,9 @@ exec(const char *path, char **argv)
if((vmp = vmap::alloc()) == 0)
goto bad;
if((uwq = uwq::alloc(vmp, myproc()->ftable)) == 0)
goto bad;
// Arguments for work queue
struct eargs args;
args.proc = myproc();
......@@ -186,7 +222,12 @@ exec(const char *path, char **argv)
off+__offsetof(struct proghdr, type),
sizeof(type)) != sizeof(type))
goto bad;
if(type != ELF_PROG_LOAD)
if (type == ELF_PROG_NOTE) {
if (donotes(ip, uwq, off) < 0) {
cilk_abort(-1);
break;
}
}
if(type != ELF_PROG_LOAD)
continue;
cilk_call(dosegment, &args, off);
}
......@@ -203,7 +244,10 @@ exec(const char *path, char **argv)
// Commit to the user image.
oldvmap = myproc()->vmap;
myproc()->vmap = vmp;
myproc()->tf->rip = elf.entry; // main
if (myproc()->uwq != nullptr)
myproc()->uwq->dec();
myproc()->uwq = uwq;
myproc()->tf->rip = elf.entry;
switchvm(myproc());
oldvmap->decref();
......@@ -215,7 +259,8 @@ exec(const char *path, char **argv)
cprintf("exec failed\n");
if(vmp)
vmp->decref();
if(uwq)
uwq->dec();
gc_end_epoch();
return 0;
}
......@@ -10,6 +10,22 @@
// routines. The (higher-level) system call implementations
// are in sysfile.c.
/*
* inode cache will be RCU-managed:
*
* - to evict, mark inode as a victim
* - lookups that encounter a victim inode must return an error (-E_RETRY)
* - E_RETRY rolls back to the beginning of syscall/pagefault and retries
* - out-of-memory error should be treated like -E_RETRY
* - once an inode is marked as victim, it can be gc_delayed()
* - the do_gc() method should remove inode from the namespace & free it
*
* - inodes have a refcount that lasts beyond a GC epoch
* - to bump refcount, first bump, then check victim flag
* - if victim flag is set, reduce the refcount and -E_RETRY
*
*/
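/*
 * A sketch of that refcount rule (illustrative; it is the same pattern
 * the old iget() below used with I_FREE):
 *
 *   ip->ref++;                 // 1. bump first
 *   if (ip->flags & I_FREE) {  // 2. then check the victim flag
 *     ip->ref--;               //    lost the race: back off
 *     return -E_RETRY;         //    and let the caller retry
 *   }
 */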
#include "types.h"
#include "stat.h"
#include "mmu.h"
......@@ -22,6 +38,7 @@
#include "buf.hh"
#include "file.hh"
#include "cpu.hh"
#include "kmtrace.hh"
#define min(a, b) ((a) < (b) ? (a) : (b))
static void itrunc(struct inode*);
......@@ -184,7 +201,8 @@ ialloc(u32 dev, short type)
//cprintf("ialloc oops %d\n", inum); // XXX harmless
}
}
panic("ialloc: no inodes");
cprintf("ialloc: 0/%u inodes\n", sb.ninodes);
return nullptr;
}
// Copy inode, which has changed, from memory to disk.
......@@ -237,72 +255,36 @@ inode::~inode()
struct inode*
iget(u32 dev, u32 inum)
{
struct inode *ip;
struct inode *ip = igetnoref(dev, inum);
if (ip)
idup(ip);
return ip;
}
struct inode*
igetnoref(u32 dev, u32 inum)
{
retry:
// Try for cached inode.
gc_begin_epoch();
ip = ins->lookup(mkpair(dev, inum));
if (ip) {
// tricky: first bump ref, then check free flag
ip->ref++;
if (ip->flags & I_FREE) {
gc_end_epoch();
ip->ref--;
goto retry;
}
gc_end_epoch();
if (!(ip->flags & I_VALID)) {
acquire(&ip->lock);
while((ip->flags & I_VALID) == 0)
cv_sleep(&ip->cv, &ip->lock);
release(&ip->lock);
}
{
scoped_gc_epoch e;
struct inode *ip = ins->lookup(mkpair(dev, inum));
if (ip) {
if (!(ip->flags & I_VALID)) {
acquire(&ip->lock);
while((ip->flags & I_VALID) == 0)
cv_sleep(&ip->cv, &ip->lock);
release(&ip->lock);
}
return ip;
}
gc_end_epoch();
// Allocate fresh inode cache slot.
retry_evict:
(void) 0;
u32 cur_free = icache_free[mycpu()->id].x;
if (cur_free == 0) {
struct inode *victim = 0;
ins->enumerate([&victim](const pair<u32, u32>&, inode* eip)->bool{
if (eip->ref || eip->type == T_DIR)
return false;
acquire(&eip->lock);
if (eip->ref == 0 && eip->type != T_DIR &&
!(eip->flags & (I_FREE | I_BUSYR | I_BUSYW))) {
victim = eip;
return true;
}
release(&eip->lock);
return false;
});
if (!victim)
panic("iget out of space");
// tricky: first flag as free, then check refcnt, then remove from ns
victim->flags |= I_FREE;
if (victim->ref > 0) {
victim->flags &= ~(I_FREE);
release(&victim->lock);
goto retry_evict;
}
release(&victim->lock);
ins->remove(mkpair(victim->dev, victim->inum), &victim);
gc_delayed(victim);
} else {
if (!cmpxch(&icache_free[mycpu()->id].x, cur_free, cur_free-1))
goto retry_evict;
}
ip = new inode();
// Allocate fresh inode cache slot.
struct inode *ip = new inode();
ip->dev = dev;
ip->inum = inum;
ip->ref = 1;
ip->ref = 0;
ip->flags = I_BUSYR | I_BUSYW;
ip->readbusy = 1;
snprintf(ip->lockname, sizeof(ip->lockname), "cv:ino:%d", ip->inum);
......@@ -364,7 +346,7 @@ ilock(struct inode *ip, int writer)
void
iunlock(struct inode *ip)
{
if(ip == 0 || !(ip->flags & (I_BUSYR | I_BUSYW)) || ip->ref < 1)
if(ip == 0 || !(ip->flags & (I_BUSYR | I_BUSYW)))
panic("iunlock");
acquire(&ip->lock);
......@@ -405,6 +387,9 @@ iput(struct inode *ip)
ip->flags |= (I_BUSYR | I_BUSYW);
ip->readbusy++;
// XXX: use gc_delayed() to truncate the inode later.
// flag it as a victim in the meantime.
release(&ip->lock);
itrunc(ip);
......@@ -617,7 +602,10 @@ namecmp(const char *s, const char *t)
u64
namehash(const strbuf<DIRSIZ> &n)
{
return n._buf[0]; /* XXX */
u64 h = 0;
for (int i = 0; i < DIRSIZ && n._buf[i]; i++)
h = ((h << 8) ^ n._buf[i]) % 0xdeadbeef;
return h;
}
void
......@@ -749,43 +737,52 @@ namex(inode *cwd, const char *path, int nameiparent, char *name)
{
struct inode *ip, *next;
int r;
scoped_gc_epoch e;
gc_begin_epoch();
if(*path == '/')
ip = iget(ROOTDEV, ROOTINO);
ip = igetnoref(ROOTDEV, ROOTINO);
else
ip = idup(cwd);
ip = cwd;
while((r = skipelem(&path, name)) == 1){
// XXX Doing this here requires some annoying reasoning about all
// of the callers of namei/nameiparent. Also, since the abstract
// scope is implicit, it might be wrong (or non-existent) and
// documenting the abstract object sets of each scope becomes
// difficult and probably unmaintainable. We have to compute this
// information here because it's the only place that's canonical.
// Maybe this should return the set of inodes traversed and let
// the caller declare the variables? Would it help for the caller
// to pass in an abstract scope?
mtreadavar("inode:%x.%x", ip->dev, ip->inum);
next = 0;
if(next == 0){
if(ip->type == 0)
panic("namex");
if(ip->type != T_DIR){
iput(ip);
gc_end_epoch();
if(ip->type != T_DIR)
return 0;
}
if(nameiparent && *path == '\0'){
// Stop one level early.
gc_end_epoch();
idup(ip);
return ip;
}
if((next = dirlookup(ip, name)) == 0){
iput(ip);
gc_end_epoch();
if((next = dirlookup(ip, name)) == 0)
return 0;
}
iput(ip);
}
ip = next;
}
if(r == -1 || nameiparent){
iput(ip);
gc_end_epoch();
if(r == -1 || nameiparent)
return 0;
}
gc_end_epoch();
// XXX write is necessary because of idup. not logically required,
// so we should replace this with mtreadavar() eventually, perhaps
// once we implement sloppy counters for long-term inode refs.
// mtreadavar("inode:%x.%x", ip->dev, ip->inum);
mtwriteavar("inode:%x.%x", ip->dev, ip->inum);
idup(ip);
return ip;
}
......
......@@ -10,9 +10,14 @@
#include "condvar.h"
#include "proc.hh"
#include "vm.hh"
#include "wq.hh"
using namespace std;
static const char *levelnames[] = {
"PT", "PD", "PDP", "PML4"
};
static pgmap*
descend(pgmap *dir, u64 va, u64 flags, int create, int level)
{
......@@ -28,7 +33,7 @@ retry:
} else {
if (!create)
return nullptr;
next = (pgmap*) kalloc();
next = (pgmap*) kalloc(levelnames[level-1]);
if (!next)
return nullptr;
memset(next, 0, PGSIZE);
......@@ -83,7 +88,7 @@ setupkvm(void)
pgmap *pml4;
int k;
if((pml4 = (pgmap*)kalloc()) == 0)
if((pml4 = (pgmap*)kalloc("PML4")) == 0)
return 0;
k = PX(3, KBASE);
memset(&pml4->e[0], 0, 8*k);
......@@ -92,13 +97,36 @@ setupkvm(void)
}
int
setupkshared(pgmap *pml4, char *kshared)
mapkva(pgmap *pml4, char* kva, uptr uva, size_t size)
{
for (u64 off = 0; off < KSHAREDSIZE; off+=4096) {
atomic<pme_t> *pte = walkpgdir(pml4, (u64) (KSHARED+off), 1);
for (u64 off = 0; off < size; off+=4096) {
atomic<pme_t> *pte = walkpgdir(pml4, (u64) (uva+off), 1);
if (pte == nullptr)
panic("setupkshared: oops");
*pte = v2p(kshared+off) | PTE_P | PTE_U | PTE_W;
return -1;
*pte = v2p(kva+off) | PTE_P | PTE_U | PTE_W;
}
return 0;
}
int
setupuvm(pgmap *pml4, char *kshared, char *uwq)
{
struct todo {
char *kvm;
char *uvm;
size_t size;
} todo[] = {
{ kshared, (char*)KSHARED, KSHAREDSIZE },
{ uwq, (char*)USERWQ, USERWQSIZE }
};
for (int i = 0; i < NELEM(todo); i++) {
for (u64 off = 0; off < todo[i].size; off+=4096) {
atomic<pme_t> *pte = walkpgdir(pml4, (u64) (todo[i].uvm+off), 1);
if (pte == nullptr)
return -1;
*pte = v2p(todo[i].kvm+off) | PTE_P | PTE_U | PTE_W;
}
}
return 0;
}
......
......@@ -8,6 +8,7 @@
#include "sched.hh"
#include "percpu.hh"
#include "wq.hh"
#include "uwq.hh"
#include "kmtrace.hh"
struct idle {
......@@ -107,7 +108,7 @@ idleloop(void)
// If we don't have an heir, try to allocate one
if (idlem->heir == nullptr) {
struct proc *p;
p = allocproc();
p = proc::alloc();
if (p == nullptr)
break;
snprintf(p->name, sizeof(p->name), "idleh_%u", mycpu()->id);
......@@ -118,6 +119,9 @@ idleloop(void)
idlem->heir = p;
}
if (uwq_trywork())
break;
worked = wq_trywork();
// If we are no longer the idle thread, exit
if (worked && idlem->cur != myproc())
......@@ -131,9 +135,9 @@ idleloop(void)
void
initidle(void)
{
struct proc *p = allocproc();
struct proc *p = proc::alloc();
if (!p)
panic("initidle allocproc");
panic("initidle proc::alloc");
SLIST_INIT(&idlem[cpunum()].zombies);
initlock(&idlem[cpunum()].lock, "idle_lock", LOCKSTAT_IDLE);
......
......@@ -126,6 +126,9 @@ kfree_pool(struct kmem *m, char *v)
if (ALLOC_MEMSET && kinited && m->size <= 16384)
memset(v, 1, m->size);
if (kinited)
mtunlabel(mtrace_label_block, v);
r = (struct run*)v;
for (;;) {
auto headval = m->freelist.load();
......@@ -135,8 +138,6 @@ kfree_pool(struct kmem *m, char *v)
}
m->nfree++;
if (kinited)
mtunlabel(mtrace_label_block, r);
}
static void
......@@ -160,7 +161,7 @@ kmemprint()
}
static char*
kalloc_pool(struct kmem *km)
kalloc_pool(struct kmem *km, const char *name)
{
struct run *r = 0;
struct kmem *m;
......@@ -196,7 +197,8 @@ kalloc_pool(struct kmem *km)
return 0;
}
mtlabel(mtrace_label_block, r, m->size, "kalloc", sizeof("kalloc"));
if (name)
mtlabel(mtrace_label_block, r, m->size, name, strlen(name));
if (ALLOC_MEMSET && m->size <= 16384)
memset(r, 2, m->size);
......@@ -207,17 +209,17 @@ kalloc_pool(struct kmem *km)
// Returns a pointer that the kernel can use.
// Returns 0 if the memory cannot be allocated.
char*
kalloc(void)
kalloc(const char *name)
{
if (!kinited)
return pgalloc();
return kalloc_pool(kmems);
return kalloc_pool(kmems, name);
}
void *
ksalloc(int slab)
{
return kalloc_pool(slabmem[slab]);
return kalloc_pool(slabmem[slab], slabmem[slab]->name);
}
void
......@@ -278,6 +280,10 @@ initkalloc(u64 mbaddr)
slabmem[slab_wq][c].size = PGROUNDUP(wq_size());
slabmem[slab_wq][c].ninit = NCPU;
strncpy(slabmem[slab_userwq][c].name, " uwq", MAXNAME);
slabmem[slab_userwq][c].size = USERWQSIZE;
slabmem[slab_userwq][c].ninit = CPUKSTACKS;
for (int i = 0; i < slab_type_max; i++) {
slabmem[i][c].name[0] = (char) c + '0';
slabinit(&slabmem[i][c], &p, &k);
......
......@@ -34,10 +34,10 @@ kminit(void)
}
// get more space for freelists[c].buckets[b]
int
static int
morecore(int c, int b)
{
char *p = kalloc();
char *p = kalloc(nullptr);
if(p == 0)
return -1;
......@@ -78,7 +78,7 @@ bucket(u64 nbytes)
}
void *
kmalloc(u64 nbytes)
kmalloc(u64 nbytes, const char *name)
{
int b = bucket(nbytes);
......@@ -103,10 +103,11 @@ kmalloc(u64 nbytes)
}
}
mtlabel(mtrace_label_heap, (void*) h, nbytes, name, strlen(name));
if (ALLOC_MEMSET)
memset(h, 4, (1<<b));
mtlabel(mtrace_label_heap, (void*) h, nbytes, "kmalloc'ed", sizeof("kmalloc'ed"));
return h;
}
......@@ -132,9 +133,9 @@ kmfree(void *ap, u64 nbytes)
}
int
kmalign(void **p, int align, u64 size)
kmalign(void **p, int align, u64 size, const char *name)
{
void *mem = kmalloc(size + (align-1) + sizeof(void*));
void *mem = kmalloc(size + (align-1) + sizeof(void*), name);
char *amem = ((char*)mem) + sizeof(void*);
amem += align - ((uptr)amem & (align - 1));
((void**)amem)[-1] = mem;
......
......@@ -142,16 +142,10 @@ cpunum(void)
{
// Cannot call cpu when interrupts are enabled:
// result not guaranteed to last long enough to be used!
// Would prefer to panic but even printing is chancy here:
// almost everything, including cprintf and panic, calls cpu,
// often indirectly through acquire and release.
if(readrflags()&FL_IF){
static int n __mpalign__;
if(n == 0) {
n++;
cprintf("cpu called from %p with interrupts enabled\n",
__builtin_return_address(0));
}
cli();
panic("cpunum() called from %p with interrupts enabled\n",
__builtin_return_address(0));
}
if(lapic)
......
......@@ -32,7 +32,7 @@ netfree(void *va)
void *
netalloc(void)
{
return kalloc();
return kalloc("(netalloc)");
}
int
......@@ -278,7 +278,7 @@ netbind(int sock, void *xaddr, int xaddrlen)
void *addr;
long r;
addr = kmalloc(xaddrlen);
addr = kmalloc(xaddrlen, "sockaddr");
if (addr == nullptr)
return -1;
......@@ -314,7 +314,7 @@ netaccept(int sock, void *xaddr, void *xaddrlen)
if (umemcpy(&len, lenptr, sizeof(*lenptr)))
return -1;
addr = kmalloc(len);
addr = kmalloc(len, "sockaddr");
if (addr == nullptr)
return -1;
......@@ -352,7 +352,7 @@ netwrite(int sock, char *ubuf, int len)
int cc;
int r;
kbuf = kalloc();
kbuf = kalloc("(netwrite)");
if (kbuf == nullptr)
return -1;
......@@ -375,7 +375,7 @@ netread(int sock, char *ubuf, int len)
int cc;
int r;
kbuf = kalloc();
kbuf = kalloc("(netread)");
if (kbuf == nullptr)
return -1;
......
......@@ -30,7 +30,7 @@ pipealloc(struct file **f0, struct file **f1)
*f0 = *f1 = 0;
if((*f0 = file::alloc()) == 0 || (*f1 = file::alloc()) == 0)
goto bad;
if((p = (pipe*)kmalloc(sizeof(*p))) == 0)
if((p = (pipe*)kmalloc(sizeof(*p), "pipe")) == 0)
goto bad;
p->readopen = 1;
p->writeopen = 1;
......
......@@ -13,6 +13,7 @@
#include "kalloc.hh"
#include "vm.hh"
#include "ns.hh"
#include "fcntl.h"
u64
proc_hash(const u32 &p)
......@@ -27,7 +28,7 @@ mycpuid(void)
}
xns<u32, proc*, proc_hash> *xnspid __mpalign__;
static struct proc *bootproc __mpalign__;
struct proc *bootproc __mpalign__;
#if MTRACE
struct kstack_tag kstack_tag[NCPU];
......@@ -36,10 +37,10 @@ struct kstack_tag kstack_tag[NCPU];
enum { sched_debug = 0 };
proc::proc(int npid) :
rcu_freed("proc"), vmap(0), kstack(0),
rcu_freed("proc"), vmap(0), uwq(0), worker(0), kstack(0),
pid(npid), parent(0), tf(0), context(0), killed(0),
ftable(0), cwd(0), tsc(0), curcycles(0), cpuid(0), epoch(0),
on_runq(-1), cpu_pin(0), runq(0), oncv(0), cv_wakeup(0),
cpu_pin(0), runq(0), oncv(0), cv_wakeup(0),
user_fs_(0), state_(EMBRYO)
{
snprintf(lockname, sizeof(lockname), "cv:proc:%d", pid);
......@@ -85,6 +86,30 @@ proc::set_state(enum procstate s)
state_ = s;
}
int
proc::set_cpu_pin(int cpu)
{
if (cpu < -1 || cpu >= ncpu)
return -1;
acquire(&lock);
if (myproc() != this)
panic("set_cpu_pin not implemented for non-current proc");
if (cpu == -1) {
cpu_pin = 0;
release(&lock);
return 0;
}
// Since we're the current proc, there's no runq to get off.
// post_swtch will put us on the new runq.
cpuid = cpu;
cpu_pin = 1;
myproc()->set_state(RUNNABLE);
sched();
assert(mycpu()->id == cpu);
return 0;
}
// Give up the CPU for one scheduling round.
void
yield(void)
......@@ -174,18 +199,15 @@ freeproc(struct proc *p)
gc_delayed(p);
}
// Look in the process table for an UNUSED proc.
// If found, change state to EMBRYO and initialize
// state required to run in the kernel.
// Otherwise return 0.
struct proc*
allocproc(void)
proc*
proc::alloc(void)
{
struct proc *p;
char *sp;
proc* p;
p = new proc(xnspid->allockey());
if (p == 0) return 0;
if (p == nullptr)
return nullptr;
p->cpuid = mycpu()->id;
initprocgc(p);
......@@ -230,43 +252,6 @@ allocproc(void)
return p;
}
// Set up first user process.
void
inituser(void)
{
struct proc *p;
extern u8 _initcode_start[];
extern u64 _initcode_size;
p = allocproc();
p->ftable = new filetable();
if (p->ftable == nullptr)
panic("userinit: new filetable");
bootproc = p;
if((p->vmap = vmap::alloc()) == 0)
panic("userinit: out of vmaps?");
vmnode *vmn = new vmnode(PGROUNDUP(_initcode_size) / PGSIZE);
if(vmn == 0)
panic("userinit: vmn_allocpg");
if(p->vmap->insert(vmn, 0, 1) < 0)
panic("userinit: vmap_insert");
if(p->vmap->copyout(0, _initcode_start, _initcode_size) < 0)
panic("userinit: copyout");
memset(p->tf, 0, sizeof(*p->tf));
p->tf->cs = UCSEG | 0x3;
p->tf->ds = UDSEG | 0x3;
p->tf->ss = p->tf->ds;
p->tf->rflags = FL_IF;
p->tf->rsp = PGSIZE;
p->tf->rip = 0x0; // beginning of initcode.S
safestrcpy(p->name, "initcode", sizeof(p->name));
p->cwd = 0; // forkret will fix in the process's context
acquire(&p->lock);
addrun(p);
release(&p->lock);
}
void
initproc(void)
{
......@@ -279,18 +264,11 @@ initproc(void)
// Process won't exit until it returns
// to user space (see trap in trap.c).
int
kill(int pid)
proc::kill(void)
{
struct proc *p;
p = xnspid->lookup(pid);
if (p == 0) {
panic("kill");
return -1;
}
acquire(&p->lock);
p->killed = 1;
if(p->get_state() == SLEEPING){
acquire(&lock);
killed = 1;
if(get_state() == SLEEPING) {
// XXX
// we need to wake p up if it is cv_sleep()ing.
// can't change p from SLEEPING to RUNNABLE since that
......@@ -302,10 +280,23 @@ kill(int pid)
// cv might be deallocated while we're using it
// (pipes dynamically allocate condvars).
}
release(&p->lock);
release(&lock);
return 0;
}
int
proc::kill(int pid)
{
struct proc *p;
p = xnspid->lookup(pid);
if (p == 0) {
panic("kill");
return -1;
}
return p->kill();
}
// Print a process listing to console. For debugging.
// Runs when user types ^P on console.
// No lock to avoid wedging a stuck machine further.
......@@ -357,10 +348,13 @@ fork(int flags)
// cprintf("%d: fork\n", myproc()->pid);
// Allocate process.
if((np = allocproc()) == 0)
if((np = proc::alloc()) == 0)
return -1;
if(flags == 0) {
if(flags & FORK_SHARE_VMAP) {
np->vmap = myproc()->vmap;
np->vmap->ref++;
} else {
// Copy process state from p.
if((np->vmap = myproc()->vmap->copy(cow)) == 0){
ksfree(slab_stack, np->kstack);
......@@ -370,27 +364,25 @@ fork(int flags)
freeproc(np);
return -1;
}
} else {
np->vmap = myproc()->vmap;
np->vmap->ref++;
}
np->parent = myproc();
*np->tf = *myproc()->tf;
np->cpu_pin = myproc()->cpu_pin;
// Clear %eax so that fork returns 0 in the child.
np->tf->rax = 0;
if (flags == 0) {
if (flags & FORK_SHARE_FD) {
myproc()->ftable->incref();
np->ftable = myproc()->ftable;
} else {
np->ftable = new filetable(*myproc()->ftable);
if (np->ftable == nullptr) {
// XXX(sbw) leaking?
freeproc(np);
return -1;
}
} else {
myproc()->ftable->incref();
np->ftable = myproc()->ftable;
}
np->cwd = idup(myproc()->cwd);
......@@ -411,10 +403,12 @@ fork(int flags)
void
finishproc(struct proc *p)
{
ksfree(slab_stack, p->kstack);
p->kstack = 0;
if (p->vmap != nullptr)
p->vmap->decref();
if (p->uwq != nullptr)
p->uwq->dec();
ksfree(slab_stack, p->kstack);
p->kstack = 0;
if (!xnspid->remove(p->pid, &p))
panic("wait: ns_remove");
p->pid = 0;
......@@ -477,7 +471,7 @@ threadalloc(void (*fn)(void *), void *arg)
{
struct proc *p;
p = allocproc();
p = proc::alloc();
if (p == nullptr)
return 0;
......
#include "types.h"
#include "atomic.hh"
#include "spinlock.h"
#include "kernel.hh"
#include "cpputil.hh"
#include "crange_arch.hh"
#include "radix.hh"
// Returns the level we stopped at.
template<class CB>
void
descend(u64 key, markptr<void> *n, u32 level, CB cb)
u32
descend(u64 key, markptr<void> *n, u32 level, CB cb, bool create)
{
// for now, we only support exact multiples of bits_per_level
assert(key_bits == bits_per_level * radix_levels);
static_assert(key_bits == bits_per_level * radix_levels,
"for now, we only support exact multiples of bits_per_level");
assert(n);
void *v = n->ptr();
if (v == 0) {
if (v == 0 && create) {
radix_node *new_rn = new radix_node();
if (n->ptr().cmpxch_update(&v, (void*) new_rn))
v = new_rn;
else
delete new_rn;
}
// Node isn't there. Just return.
if (v == 0) {
return level+1;
}
radix_node *rn = (radix_node*) v;
u64 idx = key >> (bits_per_level * level);
idx &= (1<<bits_per_level)-1;
markptr<void> *vptr = &rn->ptr[idx];
if (level == 0)
if (level == 0) {
cb(vptr);
else
descend(key, vptr, level-1, cb);
return level;
} else {
return descend(key, vptr, level-1, cb, create);
}
}
radix_elem*
......@@ -39,7 +42,7 @@ radix::search(u64 key)
radix_elem *result = 0;
descend(key >> shift_, &root_, radix_levels-1, [&result](markptr<void> *v) {
result = (radix_elem*) v->ptr().load();
});
}, false);
return result;
}
......@@ -49,14 +52,37 @@ radix::search_lock(u64 start, u64 size)
return radix_range(this, start >> shift_, size >> shift_);
}
u64
radix::skip_empty(u64 k) const
{
u64 next_k = k;
while (next_k < (1UL<<key_bits)) {
// Does next_k exist?
// FIXME: evil evil const_cast
u32 level = descend(next_k, const_cast<markptr<void>*>(&root_),
radix_levels-1, [](markptr<void> *v){}, false);
if (level == 0) {
return next_k;
}
u64 mask = 1UL<<(bits_per_level * level);
// Skip past everything we know is missing.
next_k = (next_k & ~(mask-1)) + mask;
}
// Nope, no successor.
return ~0ULL;
}
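// (radix_iterator in radix.hh calls skip_empty() on construction and after
// each increment, so iteration jumps over missing subtrees instead of
// touching every key.)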
radix_range::radix_range(radix *r, u64 start, u64 size)
: r_(r), start_(start), size_(size)
{
for (u64 k = start_; k != start_ + size_; k++)
descend(k, &r_->root_, radix_levels-1, [](markptr<void> *v) {
while (!v->mark().xchg(true))
; // spin
});
for (u64 k = start_; k != start_ + size_; k++) {
if (descend(k, &r_->root_, radix_levels-1, [](markptr<void> *v) {
while (!v->mark().xchg(true))
; // spin
}, true) != 0) {
panic("radix_range");
}
}
}
radix_range::~radix_range()
......@@ -64,10 +90,13 @@ radix_range::~radix_range()
if (!r_)
return;
for (u64 k = start_; k != start_ + size_; k++)
descend(k, &r_->root_, radix_levels-1, [](markptr<void> *v) {
v->mark() = false;
});
for (u64 k = start_; k != start_ + size_; k++) {
if (descend(k, &r_->root_, radix_levels-1, [](markptr<void> *v) {
v->mark() = false;
}, true) != 0) {
panic("~radix_range");
}
}
}
void
......@@ -79,15 +108,19 @@ radix_range::replace(u64 start, u64 size, radix_elem *val)
assert(start >= start_);
assert(start + size <= start_ + size_);
for (u64 k = start; k != start + size; k++)
descend(k, &r_->root_, radix_levels-1, [val](markptr<void> *v) {
void* cur = v->ptr().load();
while (!v->ptr().cmpxch_update(&cur, val))
; // spin
val->incref();
if (cur)
((radix_elem*) cur)->decref();
});
for (u64 k = start; k != start + size; k++) {
if (descend(k, &r_->root_, radix_levels-1, [val](markptr<void> *v) {
void* cur = v->ptr().load();
while (!v->ptr().cmpxch_update(&cur, val))
; // spin
if (val)
val->incref();
if (cur)
((radix_elem*) cur)->decref();
}, true)) {
panic("radix_range::replace");
}
}
}
radix_elem*
......@@ -96,6 +129,6 @@ radix_iterator::operator*()
radix_elem *result = 0;
descend(k_, (markptr<void>*) &r_->root_, radix_levels-1, [&result](markptr<void> *v) {
result = (radix_elem*) v->ptr().load();
});
}, false);
return result;
}
......@@ -3,7 +3,7 @@
struct seed {
u64 v;
} __mapalign__;
} __mpalign__;
static struct seed seeds[NCPU] __mpalign__;
u64
......
......@@ -188,7 +188,7 @@ sampread(struct inode *ip, char *dst, u32 off, u32 n)
u64 len = LOGHEADER_SZ;
u64 cc;
hdr = (logheader*) kmalloc(len);
hdr = (logheader*) kmalloc(len, "logheader");
if (hdr == nullptr)
return -1;
hdr->ncpus = NCPU;
......
......@@ -60,7 +60,9 @@ sched(void)
struct proc *next = schednext();
if (next == nullptr) {
if (myproc()->get_state() != RUNNABLE) {
if (myproc()->get_state() != RUNNABLE ||
// proc changed its CPU pin?
myproc()->cpuid != mycpu()->id) {
next = idleproc();
} else {
myproc()->set_state(RUNNING);
......
......@@ -24,7 +24,7 @@ void*
klockstat::operator new(unsigned long nbytes)
{
assert(nbytes == sizeof(klockstat));
return kmalloc(sizeof(klockstat));
return kmalloc(sizeof(klockstat), "klockstat");
}
void
......
......@@ -101,6 +101,7 @@ syscall(u64 a0, u64 a1, u64 a2, u64 a3, u64 a4, u64 num)
{
u64 r;
mt_ascope ascope("syscall(%lx,%lx,%lx,%lx,%lx,%lx)", num, a0, a1, a2, a3, a4);
if(num < SYS_ncount && syscalls[num]) {
mtstart(syscalls[num], myproc());
mtrec();
......
......@@ -11,6 +11,7 @@
#include "fcntl.h"
#include "cpu.hh"
#include "net.hh"
#include "kmtrace.hh"
static bool
getfile(int fd, sref<file> *f)
......@@ -214,6 +215,9 @@ create(inode *cwd, const char *path, short type, short major, short minor)
{
struct inode *ip, *dp;
char name[DIRSIZ];
mt_ascope ascope("%s(%d.%d,%s,%d,%d,%d)",
__func__, cwd->dev, cwd->inum,
path, type, major, minor);
retry:
if((dp = nameiparent(cwd, path, name)) == 0)
......@@ -227,17 +231,19 @@ create(inode *cwd, const char *path, short type, short major, short minor)
if(type == T_FILE && ip->type == T_FILE)
return ip;
iunlockput(ip);
return 0;
return nullptr;
}
if((ip = ialloc(dp->dev, type)) == 0)
panic("create: ialloc");
if((ip = ialloc(dp->dev, type)) == nullptr)
return nullptr;
ip->major = major;
ip->minor = minor;
ip->nlink = 1;
iupdate(ip);
mtwriteavar("inode:%x.%x", ip->dev, ip->inum);
if(type == T_DIR){ // Create . and .. entries.
dp->nlink++; // for ".."
iupdate(dp);
......@@ -269,9 +275,8 @@ sys_openat(int dirfd, const char *path, int omode)
if (dirfd == AT_FDCWD) {
cwd = myproc()->cwd;
} else if (dirfd < 0 || dirfd >= NOFILE) {
return -1;
} else {
// XXX(sbw) do we need the sref while we touch fdir->ip?
sref<file> fdir;
if (!getfile(dirfd, &fdir) || fdir->type != file::FD_INODE)
return -1;
......@@ -280,6 +285,13 @@ sys_openat(int dirfd, const char *path, int omode)
if(argcheckstr(path) < 0)
return -1;
// Reads the dirfd FD, dirfd's inode, the inodes of all files in
// path; writes the returned FD
mt_ascope ascope("%s(%d,%s,%d)", __func__, dirfd, path, omode);
mtwriteavar("thread:%x", myproc()->pid);
mtreadavar("inode:%x.%x", cwd->dev, cwd->inum);
if(omode & O_CREATE){
if((ip = create(cwd, path, T_FILE, 0, 0)) == 0)
return -1;
......@@ -295,6 +307,9 @@ sys_openat(int dirfd, const char *path, int omode)
release(&pip->lock);
}
}
// XXX necessary because the mtwriteavar() to the same abstract variable
// does not propagate to our scope, since create() has its own inner scope.
mtwriteavar("inode:%x.%x", ip->dev, ip->inum);
} else {
retry:
if((ip = namei(cwd, path)) == 0){
......@@ -337,6 +352,7 @@ sys_openat(int dirfd, const char *path, int omode)
return -1;
}
iunlock(ip);
mtwriteavar("fd:%x.%x", myproc()->pid, fd);
f->type = file::FD_INODE;
f->ip = ip;
......@@ -347,11 +363,25 @@ sys_openat(int dirfd, const char *path, int omode)
}
long
sys_mkdir(const char *path)
sys_mkdirat(int dirfd, const char *path)
{
struct inode *cwd;
struct inode *ip;
if(argcheckstr(path) < 0 || (ip = create(myproc()->cwd, path, T_DIR, 0, 0)) == 0)
if (dirfd == AT_FDCWD) {
cwd = myproc()->cwd;
} else {
// XXX(sbw) do we need the sref while we touch fdir->ip?
sref<file> fdir;
if (!getfile(dirfd, &fdir) || fdir->type != file::FD_INODE)
return -1;
cwd = fdir->ip;
}
if (argcheckstr(path) < 0)
return -1;
ip = create(cwd, path, T_DIR, 0, 0);
if (ip == nullptr)
return -1;
iunlockput(ip);
return 0;
......
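sys_openat and sys_mkdirat now resolve dirfd the same way: AT_FDCWD means the process's cwd, anything else must name a valid inode-backed file descriptor. That shared logic could be read as a helper like the following sketch (hypothetical function; getfile, sref, file, and inode as in this tree):

// Sketch of the dirfd convention shared by sys_openat/sys_mkdirat above.
static inode*
resolve_dirfd(int dirfd)
{
  if (dirfd == AT_FDCWD)
    return myproc()->cwd;
  if (dirfd < 0 || dirfd >= NOFILE)
    return nullptr;
  sref<file> fdir;
  if (!getfile(dirfd, &fdir) || fdir->type != file::FD_INODE)
    return nullptr;
  return fdir->ip;   // XXX same sref lifetime question as the code above
}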
......@@ -9,6 +9,7 @@
#include "cpu.hh"
#include "vm.hh"
#include "sperf.hh"
#include "kmtrace.hh"
long
sys_fork(int flags)
......@@ -32,7 +33,7 @@ sys_wait(void)
long
sys_kill(int pid)
{
return kill(pid);
return proc::kill(pid);
}
long
......@@ -87,6 +88,13 @@ sys_map(uptr addr, u64 len)
{
ANON_REGION(__func__, &perfgroup);
#if MTRACE
mt_ascope ascope("%s(%p,%lx)", __func__, addr, len);
mtwriteavar("thread:%x", myproc()->pid);
for (uptr i = PGROUNDDOWN(addr); i < PGROUNDUP(addr + len); i += PGSIZE)
mtwriteavar("page:%016x", i);
#endif
vmnode *vmn = new vmnode(PGROUNDUP(len) / PGSIZE);
if (vmn == 0)
return -1;
......@@ -104,6 +112,13 @@ sys_unmap(uptr addr, u64 len)
{
ANON_REGION(__func__, &perfgroup);
#if MTRACE
mt_ascope ascope("%s(%p,%lx)", __func__, addr, len);
mtwriteavar("thread:%x", myproc()->pid);
for (uptr i = PGROUNDDOWN(addr); i < PGROUNDUP(addr + len); i += PGSIZE)
mtwriteavar("page:%016x", i);
#endif
uptr align_addr = PGROUNDDOWN(addr);
uptr align_len = PGROUNDUP(addr + len) - align_addr;
if (myproc()->vmap->remove(align_addr, align_len) < 0)
......@@ -131,3 +146,9 @@ sys_setfs(u64 base)
switchvm(p);
return 0;
}
long
sys_setaffinity(int cpu)
{
return myproc()->set_cpu_pin(cpu);
}
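sys_setaffinity pairs with the sched() change earlier in this diff: set_cpu_pin records the new pin, and the next pass through sched() notices that myproc()->cpuid no longer matches mycpu()->id and switches to idleproc() so the process can migrate. From user space the call is just (sketch; a negative return is assumed to mean failure):

// Pin the calling process to CPU 2; sketch, minimal error handling.
if (setaffinity(2) < 0)
  die("setaffinity(2) failed");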
......@@ -75,6 +75,7 @@ trap(struct trapframe *tf)
if (myproc()->mtrace_stacks.curr >= 0)
mtpause(myproc());
mtstart(trap, myproc());
// XXX mt_ascope ascope("trap:%d", tf->trapno);
#endif
switch(tf->trapno){
......@@ -158,6 +159,7 @@ trap(struct trapframe *tf)
#endif
return;
}
cprintf("pagefault: failed\n");
cli();
}
......
#include "types.h"
#include "kernel.hh"
#include "mmu.h"
#include "amd64.h"
#include "spinlock.h"
#include "condvar.h"
#include "queue.h"
#include "proc.hh"
#include "cpu.hh"
#include "bits.hh"
#include "vm.hh"
extern struct proc *bootproc;
// Set up first user process.
void
inituser(void)
{
struct proc *p;
extern u8 _initcode_start[];
extern u64 _initcode_size;
p = proc::alloc();
p->ftable = new filetable();
if (p->ftable == nullptr)
panic("userinit: new filetable");
bootproc = p;
if((p->vmap = vmap::alloc()) == 0)
panic("userinit: out of vmaps?");
vmnode *vmn = new vmnode(PGROUNDUP(_initcode_size) / PGSIZE);
if(vmn == 0)
panic("userinit: vmn_allocpg");
if(p->vmap->insert(vmn, 0, 1) < 0)
panic("userinit: vmap_insert");
if(p->vmap->copyout(0, _initcode_start, _initcode_size) < 0)
panic("userinit: copyout");
memset(p->tf, 0, sizeof(*p->tf));
p->tf->cs = UCSEG | 0x3;
p->tf->ds = UDSEG | 0x3;
p->tf->ss = p->tf->ds;
p->tf->rflags = FL_IF;
p->tf->rsp = PGSIZE;
p->tf->rip = 0x0; // beginning of initcode.S
safestrcpy(p->name, "initcode", sizeof(p->name));
p->cwd = 0; // forkret will fix in the process's context
acquire(&p->lock);
addrun(p);
release(&p->lock);
}
#include "types.h"
#include "amd64.h"
#include "kernel.hh"
#include "cpu.hh"
#include "gc.hh"
#include "percpu.hh"
#include "spinlock.h"
#include "condvar.h"
#include "proc.hh"
#include "uwq.hh"
#include "vm.hh"
#include "kalloc.hh"
#include "bits.hh"
extern "C" {
#include "kern_c.h"
}
bool
uwq_trywork(void)
{
// Returning true means uwq added a thread to the run queue
u64 i, k;
// A "random" victim CPU
k = rdtsc();
for (i = 0; i < NCPU; i++) {
u64 j = (i+k) % NCPU;
if (j == mycpuid())
continue;
struct cpu *c = &cpus[j];
// The gc_epoch is for p and uwq
scoped_gc_epoch xgc;  // note: "xgc()" would declare a function, not a guard
barrier();
struct proc *p = c->proc;
if (p == nullptr || p->uwq == nullptr)
continue;
uwq* uwq = p->uwq;
if (uwq->haswork()) {
if (uwq->tryworker())
return true;
break;
}
}
return false;
}
long
sys_wqwait(void)
{
uwq_worker* w = myproc()->worker;
if (w == nullptr)
return -1;
return w->wait();
}
//
// uwq_worker
//
uwq_worker::uwq_worker(uwq* u, proc* p)
: uwq_(u), proc_(p), running_(false)
{
initlock(&lock_, "worker_lock", 0);
initcondvar(&cv_, "worker_cv");
}
void
uwq_worker::exit(void)
{
if (--uwq_->uref_ == 0)
gc_delayed(uwq_);
release(&lock_);
delete this;
::exit();
}
long
uwq_worker::wait(void)
{
acquire(&lock_);
if (uwq_->ref() == 0)
this->exit();
running_ = false;
cv_sleep(&cv_, &lock_);
if (uwq_->ref() == 0)
this->exit();
release(&lock_);
return 0;
}
//
// uwq
//
uwq*
uwq::alloc(vmap* vmap, filetable *ftable)
{
padded_length* len;
uwq* u;
len = (padded_length*) ksalloc(slab_userwq);
if (len == nullptr)
return nullptr;
ftable->incref();
vmap->incref();
u = new uwq(vmap, ftable, len);
if (u == nullptr) {
ftable->decref();
vmap->decref();
ksfree(slab_userwq, len);
return nullptr;
}
u->inc();
if (mapkva(vmap->pml4, (char*)len, USERWQ, USERWQSIZE)) {
// Dropping the only reference runs ~uwq, which already frees len and
// decrefs vmap and ftable; releasing them here as well would be a
// double free / double decref.
u->dec();
return nullptr;
}
return u;
}
uwq::uwq(vmap* vmap, filetable *ftable, padded_length *len)
: rcu_freed("uwq"),
vmap_(vmap), ftable_(ftable), len_(len),
uentry_(0), ustack_(UWQSTACK), uref_(0)
{
for (int i = 0; i < NCPU; i++)
len_[i].v_ = 0;
initlock(&lock_, "uwq_lock", 0);
memset(worker_, 0, sizeof(worker_));
}
uwq::~uwq(void)
{
if (len_ != nullptr)
ksfree(slab_userwq, len_);
vmap_->decref();
ftable_->decref();
}
bool
uwq::haswork(void) const
{
if (len_ == nullptr)
return false;
for (int i = 0; i < NCPU; i++) {
if (len_[i].v_ > 0) {
return true;
}
}
return false;
}
bool
uwq::tryworker(void)
{
// Try to start a worker thread
scoped_acquire lock0(&lock_);
if (ref() == 0)
return false;
int slot = -1;
for (int i = 0; i < NWORKERS; i++) {
if (worker_[i] == nullptr) {
if (slot == -1)
slot = i;
continue;
}
uwq_worker *w = worker_[i];
if (w->running_)
continue;
else {
scoped_acquire lock1(&w->lock_);
proc* p = w->proc_;
acquire(&p->lock);
p->cpuid = mycpuid();
release(&p->lock);
w->running_ = true;
cv_wakeup(&w->cv_);
return true;
}
}
if (slot != -1) {
proc* p = allocworker();
if (p != nullptr) {
uwq_worker* w = new uwq_worker(this, p);
assert(w != nullptr);
++uref_;
p->worker = w;
w->running_ = true;
acquire(&p->lock);
p->cpuid = mycpuid();
addrun(p);
release(&p->lock);
worker_[slot] = w;
return true;
}
}
return false;
}
void
uwq::finish(void)
{
bool gcnow = true;
scoped_acquire lock0(&lock_);
for (int i = 0; i < NWORKERS; i++) {
if (worker_[i] != nullptr) {
uwq_worker* w = worker_[i];
gcnow = false;
acquire(&w->lock_);
cv_wakeup(&w->cv_);
release(&w->lock_);
}
}
if (gcnow)
gc_delayed(this);
}
void
uwq::onzero() const
{
uwq *u = (uwq*)this;
u->finish();
}
void
uwq::setuentry(uptr uentry)
{
uentry_ = uentry;
}
proc*
uwq::allocworker(void)
{
uptr uentry = uentry_;
if (uentry == 0)
return nullptr;
proc* p = proc::alloc();
if (p == nullptr)
return nullptr;
safestrcpy(p->name, "uwq_worker", sizeof(p->name));
// finishproc will drop these refs
vmap_->incref();
ftable_->incref();
p->vmap = vmap_;
p->ftable = ftable_;
struct vmnode *vmn;
if ((vmn = new vmnode(USTACKPAGES)) == nullptr) {
finishproc(p);
return nullptr;
}
uptr stacktop = ustack_ + (USTACKPAGES*PGSIZE);
if (vmap_->insert(vmn, ustack_, 1) < 0) {
delete vmn;
finishproc(p);
return nullptr;
}
// Include a bumper page
ustack_ += (USTACKPAGES*PGSIZE)+PGSIZE;
p->tf->rsp = stacktop - 8;
p->tf->rip = uentry;
p->tf->cs = UCSEG | 0x3;
p->tf->ds = UDSEG | 0x3;
p->tf->ss = p->tf->ds;
p->tf->rflags = FL_IF;
return p;
}
......@@ -14,6 +14,8 @@
#include "crange.hh"
#include "cpputil.hh"
#include "sperf.hh"
#include "uwq.hh"
#include "kmtrace.hh"
enum { vm_debug = 0 };
......@@ -22,7 +24,7 @@ enum { vm_debug = 0 };
*/
vmnode::vmnode(u64 npg, vmntype ntype, inode *i, u64 off, u64 s)
: npages(npg), ref(0), type(ntype), ip(i), offset(off), sz(s)
: npages(npg), type(ntype), ip(i), offset(off), sz(s), ref_(0)
{
if (npg > NELEM(page))
panic("vmnode too big\n");
......@@ -43,12 +45,24 @@ vmnode::~vmnode()
}
void
vmnode::decref()
vmnode::decref(void)
{
if(--ref == 0)
if(--ref_ == 0)
delete this;
}
void
vmnode::incref(void)
{
++ref_;
}
u64
vmnode::ref(void)
{
return ref_.load();
}
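vmnode's bare ref field becomes a private atomic behind incref/decref/ref() accessors, which is why vma's constructor and pagefault's debug print change below. The protocol is the usual atomic refcount; a minimal sketch, assuming std::atomic (not the real class):

#include <atomic>
// Minimal sketch of the vmnode refcount protocol.
struct refcounted {
  std::atomic<unsigned long> ref_{0};
  void incref() { ++ref_; }
  void decref() { if (--ref_ == 0) delete this; }  // last ref frees
  unsigned long ref() { return ref_.load(); }
  virtual ~refcounted() {}
};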
int
vmnode::allocpg()
{
......@@ -56,7 +70,7 @@ vmnode::allocpg()
if (page[i])
continue;
char *p = kalloc();
char *p = kalloc("(vmnode::allocpg)");
if (!p) {
cprintf("allocpg: out of memory, leaving half-filled vmnode\n");
return -1;
......@@ -96,6 +110,10 @@ vmnode::copy()
int
vmnode::demand_load()
{
#ifdef MTRACE
mtreadavar("inode:%x.%x", ip->dev, ip->inum);
mtwriteavar("vmnode:%016x", this);
#endif
for (u64 i = 0; i < sz; i += PGSIZE) {
char *p = page[i / PGSIZE];
s64 n;
......@@ -125,7 +143,7 @@ vma::vma(vmap *vmap, uptr start, uptr end, enum vmatype vtype, vmnode *vmn) :
vma_start(start), vma_end(end), va_type(vtype), n(vmn)
{
if (n)
n->ref++;
n->incref();
}
vma::~vma()
......@@ -144,15 +162,15 @@ vmap::alloc(void)
return new vmap();
}
vmap::vmap() :
vmap::vmap() :
#if VM_CRANGE
cr(10),
cr(10),
#endif
#if VM_RADIX
rx(PGSHIFT),
rx(PGSHIFT),
#endif
ref(1), pml4(setupkvm()), kshared((char*) ksalloc(slab_kshared)),
brk_(0)
ref(1), pml4(setupkvm()), kshared((char*) ksalloc(slab_kshared)),
brk_(0)
{
initlock(&brklock_, "brk_lock", LOCKSTAT_VM);
if (pml4 == 0) {
......@@ -165,8 +183,8 @@ vmap::vmap() :
goto err;
}
if (setupkshared(pml4, kshared)) {
cprintf("vmap::vmap: setupkshared out of memory\n");
if (mapkva(pml4, kshared, KSHARED, KSHAREDSIZE)) {
cprintf("vmap::vmap: mapkva out of memory\n");
goto err;
}
......@@ -195,6 +213,12 @@ vmap::decref()
delete this;
}
void
vmap::incref()
{
++ref;
}
bool
vmap::replace_vma(vma *a, vma *b)
{
......@@ -517,7 +541,7 @@ vmap::pagefault(uptr va, u32 err)
u64 npg = (PGROUNDDOWN(va) - m->vma_start) / PGSIZE;
if (vm_debug)
cprintf("pagefault: err 0x%x va 0x%lx type %d ref %lu pid %d\n",
err, va, m->va_type, m->n->ref.load(), myproc()->pid);
err, va, m->va_type, m->n->ref(), myproc()->pid);
if (m->n && !m->n->page[npg])
if (m->n->allocpg() < 0)
......@@ -546,16 +570,23 @@ vmap::pagefault(uptr va, u32 err)
if (m->va_type == COW) {
*pte = v2p(m->n->page[npg]) | PTE_P | PTE_U | PTE_COW;
} else {
assert(m->n->ref == 1);
*pte = v2p(m->n->page[npg]) | PTE_P | PTE_U | PTE_W;
}
mtreadavar("vmnode:%016x", m->n);
return 1;
}
int
pagefault(struct vmap *vmap, uptr va, u32 err)
{
#if MTRACE
mt_ascope ascope("%s(%p)", __func__, va);
mtwriteavar("thread:%x", myproc()->pid);
mtwriteavar("page:%p.%016x", vmap, PGROUNDDOWN(va));
#endif
return vmap->pagefault(va, err);
}
......@@ -615,7 +646,16 @@ vmap::sbrk(ssize_t n, uptr *addr)
#if VM_RADIX
auto span = rx.search_lock(newstart, newn + PGSIZE);
#endif
#if VM_CRANGE
for (auto r: span) {
#endif
#if VM_RADIX
void *last = 0;
for (auto r: span) {
if (!r || r == last)
continue;
last = r;
#endif
vma *e = (vma*) r;
if (e->vma_start <= newstart) {
......
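Under VM_RADIX a span yields one slot per page, so a vma covering several pages shows up once per page; the loop filters empty slots and immediate repeats so each vma is processed once. The idiom in isolation (assumes duplicates are contiguous, as they are for a range scan):

// Dedup idiom for radix span iteration (sketch).
void *last = nullptr;
for (auto r : span) {
  if (!r || r == last)   // skip empty slots and repeats of the same vma
    continue;
  last = r;
  vma *e = (vma*) r;     // handle e exactly once per contiguous run
}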
$(O)/lib/%.o: CFLAGS:=$(CFLAGS)
$(O)/lib/%.o: CXXFLAGS:=$(CXXFLAGS)
$(O)/lib/%.o: CFLAGS:=$(CFLAGS) -DXV6_USER
$(O)/lib/%.o: CXXFLAGS:=$(CXXFLAGS) -DXV6_USER
ULIB = ulib.o usys.o printf.o umalloc.o uthread.o fmt.o stream.o ipc.o \
threads.o crt.o wq.o perf.o
......
......@@ -3,33 +3,67 @@
#include "user.h"
#include <stdarg.h>
#include "fmt.hh"
#include "lib.h"
struct outbuf {
char b[128];
int n;
int fd;
};
static void
flushoutbuf(struct outbuf* b)
{
int i = 0;
int r;
while (b->n != 0) {
r = write(b->fd, &b->b[i], b->n);
if (r <= 0) {
// write error or no progress: give up and drop the rest of the buffer
b->n = 0;
} else {
b->n -= r;
i += r;
}
}
}
// Print to the given fd.
static void
writec(int c, void *arg)
writeoutbuf(int c, void *arg)
{
int fd = (int) (u64) arg;
write(fd, &c, 1);
struct outbuf* b = (struct outbuf*)arg;
if (b->n == NELEM(b->b))
flushoutbuf(b);
b->b[b->n] = c;
b->n++;
}
void
fprintf(int fd, const char *fmt, ...)
{
struct outbuf b;
va_list ap;
b.n = 0;
b.fd = fd;
va_start(ap, fmt);
vprintfmt(writec, (void*) (u64)fd, fmt, ap);
vprintfmt(writeoutbuf, (void*) &b, fmt, ap);
va_end(ap);
flushoutbuf(&b);
}
void
printf(const char *fmt, ...)
{
struct outbuf b;
va_list ap;
b.n = 0;
b.fd = 1;
va_start(ap, fmt);
vprintfmt(writec, (void*) 1, fmt, ap);
vprintfmt(writeoutbuf, (void*) &b, fmt, ap);
va_end(ap);
flushoutbuf(&b);
}
// Print to a buffer.
......@@ -69,11 +103,15 @@ snprintf(char *buf, u32 n, const char *fmt, ...)
void __attribute__((noreturn))
die(const char* errstr, ...)
{
struct outbuf b;
va_list ap;
b.n = 0;
b.fd = 2;
va_start(ap, errstr);
vprintfmt(writec, (void*) (u64)1, errstr, ap);
vprintfmt(writeoutbuf, (void*)&b, errstr, ap);
va_end(ap);
flushoutbuf(&b);
fprintf(2, "\n");
exit();
}
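fprintf, printf, and die now share an on-stack 128-byte outbuf, so a formatted message costs one write() per 128 bytes plus a final flush instead of one syscall per character. All three rely on the same sink contract, reconstructed here from the call sites above:

// vprintfmt's sink contract as used above: one char at a time to an
// opaque argument; outbuf, fd, and console sinks all fit this shape.
typedef void (*putc_fn)(int c, void *arg);
void vprintfmt(putc_fn putc, void *arg, const char *fmt, va_list ap);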
......@@ -2,6 +2,7 @@
#include "pthread.h"
#include "user.h"
#include "atomic.hh"
#include "fcntl.h"
enum { stack_size = 8192 };
static std::atomic<int> nextkey;
......@@ -22,7 +23,7 @@ pthread_create(pthread_t* tid, const pthread_attr_t* attr,
void* (*start)(void*), void* arg)
{
char* base = (char*) sbrk(stack_size);
int t = forkt(base + stack_size, (void*) start, arg);
int t = forkt(base + stack_size, (void*) start, arg, FORK_SHARE_VMAP | FORK_SHARE_FD);
if (t < 0)
return t;
......@@ -55,7 +56,7 @@ pthread_getspecific(pthread_key_t key)
int
pthread_setspecific(pthread_key_t key, void* value)
{
__asm volatile("movq %0, %%fs:(%1)" : : "r" (value), "r" ((u64) key * 8));
__asm volatile("movq %0, %%fs:(%1)" : : "r" (value), "r" ((u64) key * 8) : "memory");
return 0;
}
......
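The added "memory" clobber matters: the asm names no memory outputs, so without it the compiler is free to cache values in registers across the store or sink the %fs-relative write below later loads. Each key owns an 8-byte slot at %fs:(key*8). The same store, annotated (sketch):

// Sketch of the TLS slot store with an explicit compiler barrier.
static inline void
tls_set(u64 key, void *value)
{
  __asm volatile("movq %0, %%fs:(%1)"
                 : /* no outputs */
                 : "r" (value), "r" (key * 8)
                 : "memory");  // forbids reordering memory ops around the store
}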
......@@ -151,6 +151,12 @@ open(const char *path, int omode)
return openat(AT_FDCWD, path, omode);
}
int
mkdir(const char *path)
{
return mkdirat(AT_FDCWD, path);
}
extern void __cxa_pure_virtual(void);
void __cxa_pure_virtual(void)
{
......
......@@ -30,7 +30,7 @@ SYSCALL(mknod)
SYSCALL(unlink)
SYSCALL(fstat)
SYSCALL(link)
SYSCALL(mkdir)
SYSCALL(mkdirat)
SYSCALL(chdir)
SYSCALL(dup)
SYSCALL(getpid)
......@@ -48,3 +48,5 @@ SYSCALL(pread)
SYSCALL(async)
SYSCALL(script)
SYSCALL(setfs)
SYSCALL(wqwait)
SYSCALL(setaffinity)
......@@ -12,7 +12,7 @@ forkt:
movq %rdx, 0x00(%r12) # arg
movq %rsi, 0x08(%r12) # function ptr
movq $1, %rdi # flag for sys_fork
movq %rcx, %rdi # flag for sys_fork
movq $SYS_fork, %rax
syscall
......
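forkt previously hardwired the fork flag to 1; it now forwards the caller's fourth argument, which arrives in %rcx under the SysV calling convention, into %rdi as the sys_fork argument. That is what lets pthread_create above pass FORK_SHARE_VMAP | FORK_SHARE_FD. The implied C prototype (sketch):

// Fourth argument = sys_fork flags word (e.g. FORK_SHARE_VMAP|FORK_SHARE_FD).
extern "C" int forkt(void *stack_top, void *fn, void *arg, int flags);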
......@@ -21,6 +21,8 @@ public:
private:
work *steal(int c);
work *pop(int c);
void inclen(int c);
void declen(int c);
struct wqueue {
work *w[NSLOTS];
......@@ -38,6 +40,10 @@ private:
percpu<wqueue> q_;
percpu<stat> stat_;
#if defined(XV6_USER)
padded_length* len_;
#endif
};
static wq *wq_;
......@@ -73,12 +79,6 @@ initwq(void)
wqarch_init();
}
void
exitwq(void)
{
wqarch_exit();
}
//
// wq
//
......@@ -95,6 +95,10 @@ wq::wq(void)
for (i = 0; i < NCPU; i++)
wqlock_init(&q_[i].lock);
#if defined(XV6_USER)
len_ = allocklen(NCPU*sizeof(padded_length));
#endif
}
void
......@@ -107,6 +111,22 @@ wq::dump(void)
stat_[i].pop, stat_[i].steal);
}
inline void
wq::inclen(int c)
{
#if defined(XV6_USER)
__sync_fetch_and_add(&len_[c].v_, 1);
#endif
}
inline void
wq::declen(int c)
{
#if defined(XV6_USER)
__sync_fetch_and_sub(&len_[c].v_, 1);
#endif
}
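inclen/declen keep a per-CPU queue-depth counter in a shared, cache-line-padded array; in the user build, len_ is what the kernel's uwq::haswork() polls to decide whether waking a worker is worthwhile. The idiom in isolation (padded_length assumed to wrap a single u64 v_):

// Per-CPU padded counter updated with GCC atomic builtins (sketch).
struct padded_counter {
  volatile u64 v_;
  char pad_[64 - sizeof(u64)];  // keep each counter on its own cache line
};
static inline void inc(padded_counter *c) { __sync_fetch_and_add(&c->v_, 1); }
static inline void dec(padded_counter *c) { __sync_fetch_and_sub(&c->v_, 1); }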
int
wq::push(work *w)
{
......@@ -123,6 +143,7 @@ wq::push(work *w)
q_->w[i] = w;
barrier();
q_->head++;
inclen(mycpuid());
stat_->push++;
popcli();
return 0;
......@@ -148,6 +169,7 @@ wq::pop(int c)
i = (i-1) & (NSLOTS-1);
w = q->w[i];
q->head--;
declen(c);
wqlock_release(&q->lock);
stat_->pop++;
......@@ -171,6 +193,7 @@ wq::steal(int c)
i = i & (NSLOTS-1);
w = q->w[i];
q->tail++;
declen(c);
wqlock_release(&q->lock);
stat_->steal++;
......
......@@ -208,7 +208,7 @@ sys_thread_new(const char *name, lwip_thread_fn thread, void *arg,
struct lwip_thread *lt;
struct proc *p;
lt = (struct lwip_thread*) kmalloc(sizeof(*lt));
lt = (struct lwip_thread*) kmalloc(sizeof(*lt), "lwip_thread");
if (lt == nullptr)
return 0;
lt->thread = thread;
......
#pragma once
#define DEBUG 1
#define DEBUG 0
#define NPROC 64 // maximum number of processes
#define KSTACKSIZE 8192 // size of per-process kernel stack
#define NOFILE 16 // open files per process
#define NOFILE 64 // open files per process
#define NFILE 100 // open files per system
#define NBUF 10000 // size of disk block cache
#define NINODE 5000 // maximum number of active i-nodes
......@@ -23,6 +23,8 @@
#define VERIFYFREE 0 // Unreliable, e.g. vma's vmnode pointer gets reused
#define ALLOC_MEMSET DEBUG
#define KSHAREDSIZE (32 << 10)
#define USERWQSIZE (1 << 14)
#define USTACKPAGES 4
#define WQSHIFT 7
#define CILKENABLE 0
#if defined(HW_josmp)
......@@ -30,8 +32,8 @@
#define MTRACE 0
#define PERFSIZE (1<<20ull)
#elif defined(HW_qemu)
#define NCPU 4 // maximum number of CPUs
#define MTRACE 0
#define NCPU 8 // maximum number of CPUs
#define MTRACE 1
#define PERFSIZE (16<<20ull)
#elif defined(HW_ud0)
#define NCPU 4 // maximum number of CPUs
......
......@@ -11,8 +11,8 @@
#include "include/stat.h"
int nblocks = 4067;
int ninodes = 200;
int size = 4096;
int ninodes = 800;
int size = 4172;
int fsfd;
struct superblock sb;
......
CXXFLAGS := -iquote user $(filter-out -nostdinc++ -Istdinc, $(CXXFLAGS)) -msse
CXXFLAGS := -iquote user $(filter-out -nostdinc++ -Istdinc -Inet, $(CXXFLAGS)) -msse
$(O)/utest: $(O)/kernel/crange.o \
$(O)/kernel/gc.o \
$(O)/kernel/rnd.o \
$(O)/kernel/radix.o \
$(O)/user/umain.o
@echo " LD $@"
$(Q)mkdir -p $(@D)
......
......@@ -133,6 +133,12 @@ mycpu()
return (cpu*) &cpus[myproc()->cpuid];
}
static inline int
mycpuid()
{
return mycpu()->id;
}
static inline void pushcli() {}
static inline void popcli() {}
......
#include <unistd.h>
#include <signal.h>
#include <getopt.h>
#include <string.h>
#include "crange_arch.hh"
#include "gc.hh"
#include "crange.hh"
#include "radix.hh"
#include "atomic_util.hh"
#include "ns.hh"
#include "uscopedperf.hh"
......@@ -80,8 +82,13 @@ threadpin(void (*fn)(void*), void *arg, const char *name, int cpu)
makeproc(p);
}
struct my_range : public range {
my_range(crange *cr, u64 k, u64 sz) : range(cr, k, sz) {}
struct my_crange_range : public range {
my_crange_range(crange *cr, u64 k, u64 sz) : range(cr, k, sz) {}
virtual void do_gc() { delete this; }
};
struct my_radix_range : public radix_elem {
my_radix_range(radix *cr, u64 k, u64 sz) {}
virtual void do_gc() { delete this; }
};
......@@ -92,7 +99,7 @@ enum { crange_items = 1024 };
enum { random_keys = 0 };
static void
worker(void *arg)
worker_crange(void *arg)
{
crange *cr = (crange*) arg;
......@@ -106,7 +113,7 @@ worker(void *arg)
span.replace(0);
} else {
ANON_REGION("worker add", &perfgroup);
span.replace(new my_range(cr, k, 1));
span.replace(new my_crange_range(cr, k, 1));
}
}
......@@ -114,16 +121,48 @@ worker(void *arg)
}
static void
populate(void *arg)
populate_crange(void *arg)
{
crange *cr = (crange*) arg;
for (u32 i = 0; i < crange_items; i++)
cr->search_lock(1 + 2*i, 1).replace(new my_range(cr, 1+2*i, 1));
cr->search_lock(1 + 2*i, 1).replace(new my_crange_range(cr, 1+2*i, 1));
pthread_barrier_wait(&populate_b);
}
static void
worker_radix(void *arg)
{
radix *cr = (radix*) arg;
for (u32 i = 0; i < iter_total / ncpu; i++) {
ANON_REGION("worker op", &perfgroup);
u64 rval = random_keys ? rnd<u32>() : myproc()->cpuid;
u64 k = 1 + rval % (crange_items * 2);
auto span = cr->search_lock(k, 1);
if (rnd<u8>() & 1) {
ANON_REGION("worker del", &perfgroup);
span.replace(k, 1, 0);
} else {
ANON_REGION("worker add", &perfgroup);
span.replace(k, 1, new my_radix_range(cr, k, 1));
}
}
pthread_barrier_wait(&worker_b);
}
static void
populate_radix(void *arg)
{
radix *cr = (radix*) arg;
for (u32 i = 0; i < crange_items; i++)
cr->search_lock(1 + 2*i, 1).replace(1+2*i, 1, new my_radix_range(cr, 1+2*i, 1));
pthread_barrier_wait(&populate_b);
}
static const struct option long_opts[] = {
{ "ncpu", required_argument, 0, 'n' },
{ "tree-type", required_argument, 0, 't' },
{ 0, no_argument, 0, 0 }
};
......@@ -140,14 +179,17 @@ l2(u64 v)
return l;
}
enum { type_crange, type_radix };
int
main(int ac, char **av)
{
ncpu = NCPU;
int treetype = type_crange;
for (;;) {
int long_idx;
int opt = getopt_long(ac, av, "n:", long_opts, &long_idx);
int opt = getopt_long(ac, av, "n:t:", long_opts, &long_idx);
if (opt == -1)
break;
......@@ -157,6 +199,15 @@ main(int ac, char **av)
assert(ncpu <= NCPU);
break;
case 't':
if (!strcmp(optarg, "crange"))
treetype = type_crange;
else if (!strcmp(optarg, "radix"))
treetype = type_radix;
else
assert(0);
break;
case '?':
printf("Options:\n");
for (u32 i = 0; long_opts[i].name; i++)
......@@ -178,15 +229,25 @@ main(int ac, char **av)
initgc();
pthread_barrier_init(&populate_b, 0, 2);
crange cr(l2(crange_items));
threadpin(populate, &cr, "populate", 0);
radix rr(0);
if (treetype == type_crange)
threadpin(populate_crange, &cr, "populate", 0);
else if (treetype == type_radix)
threadpin(populate_radix, &rr, "populate", 0);
pthread_barrier_wait(&populate_b);
pthread_barrier_init(&worker_b, 0, ncpu+1);
for (u32 i = 0; i < ncpu; i++) {
char buf[32];
sprintf(buf, "worker%d", i);
threadpin(worker, &cr, buf, i);
if (treetype == type_crange)
threadpin(worker_crange, &cr, buf, i);
else if (treetype == type_radix)
threadpin(worker_radix, &rr, buf, i);
}
pthread_barrier_wait(&worker_b);
......
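With the new --tree-type switch the same benchmark drives either concurrent structure; for example (utest binary built under $(O), per the Makefile rule above):

$ ./utest --ncpu 8 --tree-type radix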
......@@ -83,11 +83,6 @@ wqarch_init(void)
}
}
static inline void
wqarch_exit(void)
{
}
#define xprintf printf
#define pushcli()
#define popcli()