提交 359dab4a 创建 作者: Frans Kaashoek

Merge branch 'scale-amd64' of ssh://amsterdam.csail.mit.edu/home/am0/6.828/xv6 into scale-amd64

......@@ -3,7 +3,7 @@
Q ?= @
TOOLPREFIX ?= x86_64-jos-elf-
QEMU ?= qemu-system-x86_64
QEMUSMP ?= 4
QEMUSMP ?= 8
QEMUSRC ?= ../mtrace
MTRACE ?= $(QEMU)
HW ?= qemu
......
......@@ -78,6 +78,9 @@ main(int ac, char **av)
pthread_t tid;
pthread_create(&tid, 0, op, (void*) i);
}
for (u64 i = 0; i < ncore; i++)
wait();
mtdisable("xv6-asharing");
}
}
......@@ -3,12 +3,19 @@
#include "user.h"
#include "lib.h"
#include "fcntl.h"
#include "wq.hh"
static int branch;
static void
dolevel(int fd, int branch, int depth)
dolevel(int fd, int depth)
{
if (depth > 0) {
for (int i = 0; i < branch; i++) {
int it = 0;
wq_for_serial<int>(it,
[](int &it)->bool { return it < branch; },
[&fd, &depth](int i)->void
{
char name[] = "a";
*name += i;
if (mkdirat(fd, name) < 0)
......@@ -16,9 +23,9 @@ dolevel(int fd, int branch, int depth)
int nfd = openat(fd, name, O_RDONLY);
if (nfd < 0)
die("openat");
dolevel(nfd, branch, depth-1);
}
die("openat: %s at %u", name, depth);
dolevel(nfd, depth-1);
});
}
close(fd);
......@@ -30,8 +37,10 @@ main(int ac, char **av)
if (ac < 4)
die("usage: %s dir branch depth", av[0]);
initwq();
const char *dir = av[1];
int branch = atoi(av[2]);
branch = atoi(av[2]);
int depth = atoi(av[3]);
if (mkdir(dir))
......@@ -41,5 +50,5 @@ main(int ac, char **av)
if (fd < 0)
die("open");
dolevel(fd, branch, depth);
dolevel(fd, depth);
}
......@@ -127,6 +127,5 @@ main(int ac, char **av)
test0();
testfork();
execwork::test();
exitwq();
return 0;
}
......@@ -51,15 +51,12 @@ du(int fd)
[](dirit &i)->bool { return !i.end(); },
[&size, &fd](const char *name)->void
{
if (!strcmp(name, ".") || !strcmp(name, "..")) {
free((void*)name);
if (!strcmp(name, ".") || !strcmp(name, ".."))
return;
}
int nfd = openat(fd, name, 0);
if (nfd >= 0)
size += du(nfd); // should go into work queue
free((void*)name);
});
} else {
close(fd);
......@@ -79,6 +76,5 @@ main(int ac, char **av)
perf_stop();
printf("%ld\n", s);
wq_dump();
exitwq();
return 0;
}
......@@ -67,14 +67,12 @@ ls(const char *path)
struct stat st;
if (xfstatat(fd, name, &st) < 0){
printf("ls: cannot stat %s\n", name);
free((void*)name);
return;
}
if (!silent)
printf("%u %10lu %10lu %s\n",
ST_TYPE(st), ST_INO(st), ST_SIZE(st), name);
free((void*)name);
});
} else {
close(fd);
......@@ -99,6 +97,5 @@ main(int argc, char *argv[])
perf_stop();
wq_dump();
exitwq();
return 0;
}
......@@ -13,14 +13,6 @@ public:
return *this;
}
const char * copy_value() {
char *buf = (char*)malloc(256);
return name(buf, 256);
}
bool end() const { return end_; }
private:
char *name(char *buf, size_t n) const {
n = MIN(DIRSIZ+1, n);
memmove(buf, de_.name, n-1);
......@@ -28,6 +20,9 @@ private:
return buf;
}
bool end() const { return end_; }
private:
void refill(void) {
int r;
......@@ -45,3 +40,16 @@ private:
bool end_;
struct dirent de_;
};
// Snapshot the name of the directory entry the iterator currently
// points at.  Returns a heap-allocated copy that stays valid after
// the iterator advances; release it with free_value(dirit&, ...).
static inline const char*
copy_value(dirit &it)
{
char *buf = (char*)malloc(256);
// NOTE(review): malloc result is unchecked; it.name() would write
// through a null buf on allocation failure -- confirm that is an
// acceptable failure mode for this user program.
return it.name(buf, 256);
}
// Release a name obtained from copy_value(dirit&).  Only the heap
// copy is owned by the caller; the iterator keeps no reference to it.
static inline void
free_value(dirit &it, const char *name)
{
  (void) it;  // buffer is independent of the iterator's state
  free(const_cast<char*>(name));
}
......@@ -4,3 +4,7 @@
#define O_CREATE 0x200
#define AT_FDCWD -100
#define FORK_SHARE_VMAP (1<<0)
#define FORK_SHARE_FD (1<<1)
......@@ -91,6 +91,7 @@ struct inode* ialloc(u32, short);
struct inode* namei(inode *cwd, const char*);
void iput(struct inode*);
struct inode* iget(u32 dev, u32 inum);
struct inode* igetnoref(u32 dev, u32 inum);
void ilock(struct inode*, int writer);
void iunlockput(struct inode*);
void iupdate(struct inode*);
......
......@@ -52,6 +52,7 @@ struct klockstat;
#define LOCKSTAT_KALLOC 1
#define LOCKSTAT_KMALLOC 1
#define LOCKSTAT_NET 1
#define LOCKSTAT_NS 1
#define LOCKSTAT_PIPE 1
#define LOCKSTAT_PROC 1
#define LOCKSTAT_SCHED 1
......
#pragma once
#include "gc.hh"
#include "percpu.hh"
// name spaces
// XXX maybe use open hash table, no chain, better cache locality
......@@ -15,11 +16,19 @@ template<class K, class V>
class xelem : public rcu_freed {
public:
V val;
std::atomic<int> next_lock;
std::atomic<xelem<K, V>*> volatile next;
K key;
xelem(const K &k, const V &v) : rcu_freed("xelem"), val(v), next_lock(0), next(0), key(k) {}
std::atomic<int> next_lock;
std::atomic<xelem<K, V>*> next;
int percore_c;
std::atomic<xelem<K, V>*> percore_next;
std::atomic<xelem<K, V>*>* percore_pprev;
xelem(const K &k, const V &v)
: rcu_freed("xelem"), val(v), key(k),
next_lock(0), next(0),
percore_next(0), percore_pprev(0) {}
virtual void do_gc() {
delete this;
}
......@@ -39,6 +48,8 @@ class xns : public rcu_freed {
bool allowdup;
std::atomic<u64> nextkey;
xbucket<K, V> table[NHASH];
std::atomic<xelem<K, V>*> percore[NCPU];
spinlock percore_lock[NCPU];
public:
xns(bool dup) : rcu_freed("xns") {
......@@ -46,6 +57,10 @@ class xns : public rcu_freed {
nextkey = 1;
for (int i = 0; i < NHASH; i++)
table[i].chain = 0;
for (int i = 0; i < NCPU; i++) {
percore[i] = nullptr;
initlock(&percore_lock[i], "xns_lock", LOCKSTAT_NS);
}
}
~xns() {
......@@ -86,10 +101,20 @@ class xns : public rcu_freed {
}
e->next = root.load();
if (cmpxch(&table[i].chain, e->next.load(), e))
if (cmpxch(&table[i].chain, e->next.load(), e)) {
int c = mycpuid();
acquire(&percore_lock[c]);
e->percore_c = c;
e->percore_next = percore[c].load();
if (percore[c])
percore[c].load()->percore_pprev = &e->percore_next;
e->percore_pprev = &percore[c];
percore[c] = e;
release(&percore_lock[c]);
return 0;
}
}
}
V lookup(const K &key) {
u64 i = h(key);
......@@ -134,6 +159,13 @@ class xns : public rcu_freed {
break;
}
int c = e->percore_c;
acquire(&percore_lock[c]);
*e->percore_pprev = e->percore_next.load();
if (e->percore_next)
e->percore_next.load()->percore_pprev = e->percore_pprev;
release(&percore_lock[c]);
*pelock = 0;
gc_delayed(e);
return true;
......@@ -148,12 +180,13 @@ class xns : public rcu_freed {
template<class CB>
void enumerate(CB cb) {
scoped_gc_epoch gc;
for (int i = 0; i < NHASH; i++) {
auto e = table[i].chain.load();
int cpuoffset = mycpuid();
for (int i = 0; i < NCPU; i++) {
auto e = percore[(i + cpuoffset) % NCPU].load();
while (e) {
if (cb(e->key, e->val))
return;
e = e->next;
e = e->percore_next;
}
}
}
......
......@@ -4,6 +4,7 @@
* A page-table-like structure for mapping fixed-length keys to void* ptrs.
*/
#include "gc.hh"
#include "markptr.hh"
enum { bits_per_level = 9 };
......
......@@ -54,7 +54,7 @@ void free(void*);
int atoi(const char*);
// uthread.S
int forkt(void *sp, void *pc, void *arg);
int forkt(void *sp, void *pc, void *arg, int forkflags);
void forkt_setup(u64 pid);
// printf.c
......
......@@ -13,7 +13,7 @@ struct forwork : public work {
: it_(it), cond_(cond), body_(body), frame_(frame) {}
virtual void run() {
decltype(it_.copy_value()) v = it_.copy_value();
decltype(copy_value(it_)) v = copy_value(it_);
++it_;
if (cond_(it_)) {
forwork<IT, BODY> *w = new forwork<IT, BODY>(it_, cond_, body_, frame_);
......@@ -21,6 +21,7 @@ struct forwork : public work {
wq_push(w);
}
body_(v);
free_value(it_, v);
frame_.dec();
delete this;
}
......@@ -48,15 +49,48 @@ wq_for(IT &init, bool (*cond)(IT &it), BODY body)
// XXX(sbw) should be able to coarsen loop
decltype(init.copy_value()) v = init.copy_value();
if (!cond(init))
return;
decltype(copy_value(init)) v = copy_value(init);
++init;
if (cond(init)) {
forwork<IT, BODY> *w = new forwork<IT, BODY>(init, cond, body, frame);
frame.inc();
wq_push(w);
}
body(v);
free_value(init, v);
while (!frame.zero())
wq_trywork();
}
// Debugging aid: presents the same interface as wq_for(), but executes
// every iteration serially on the calling thread instead of pushing
// work items onto the work queue.
template <typename IT, typename BODY>
static inline void
wq_for_serial(IT &init, bool (*cond)(IT &it), BODY body)
{
  while (cond(init)) {
    auto v = copy_value(init);   // snapshot before body may block/advance
    body(v);
    free_value(init, v);
    ++init;                      // advance after the body, as wq_for does
  }
}
// Fallback copy_value: for plain value types the "snapshot" of an
// iterator is simply a copy of the iterator itself.
template <typename T>
static inline T
copy_value(T &it)
{
  T snapshot(it);
  return snapshot;
}
// Fallback free_value: a value produced by the fallback copy_value
// owns no resources, so there is nothing to release.
template <typename T>
static inline void
free_value(T &it, T &v)
{
  (void) it;
  (void) v;
}
......@@ -43,9 +43,4 @@ wqarch_init(void)
{
}
static inline void
wqarch_exit(void)
{
}
#define xprintf cprintf
......@@ -92,12 +92,6 @@ wqarch_init(void)
pthread_setspecific(idkey, (void*)(u64)id);
}
static inline void
wqarch_exit(void)
{
exiting = 1;
}
#define xprintf printf
#define pushcli()
#define popcli()
......@@ -46,11 +46,11 @@ static struct buf*
bget(u32 dev, u64 sector, int *writer)
{
struct buf *b;
scoped_gc_epoch e;
loop:
// Try for cached block.
// XXX ignore dev
gc_begin_epoch();
b = bufns->lookup(mkpair(dev, sector));
if (b) {
if (b->dev != dev || b->sector != sector)
......@@ -60,7 +60,6 @@ bget(u32 dev, u64 sector, int *writer)
if (b->flags & B_BUSY) {
cv_sleep(&b->cv, &b->lock);
release(&b->lock);
gc_end_epoch();
goto loop;
}
......@@ -72,45 +71,15 @@ bget(u32 dev, u64 sector, int *writer)
// rcu_end_read() happens in brelse
return b;
}
gc_end_epoch();
// Allocate fresh block.
struct buf *victim = 0;
bufns->enumerate([&victim](const pair<u32, u64>&, buf *eb)->bool {
acquire(&eb->lock);
if ((eb->flags & (B_BUSY | B_DIRTY | B_VALID)) == 0) {
victim = eb;
return true;
}
release(&eb->lock);
return false;
});
if (victim == 0)
bufns->enumerate([&victim](const pair<u32, u64>&, buf *eb)->bool {
acquire(&eb->lock);
if ((eb->flags & (B_BUSY | B_DIRTY)) == 0) {
victim = eb;
return true;
}
release(&eb->lock);
return false;
});
if (victim == 0)
panic("bget all busy");
victim->flags |= B_BUSY;
bufns->remove(mkpair(victim->dev, victim->sector), &victim);
release(&victim->lock);
gc_delayed(victim);
b = new buf(dev, sector);
b->flags = B_BUSY;
*writer = 1;
gc_begin_epoch();
if (bufns->insert(mkpair(b->dev, b->sector), b) < 0) {
gc_delayed(b);
goto loop;
}
// rcu_end_read() happens in brelse
return b;
}
......@@ -152,8 +121,6 @@ brelse(struct buf *b, int writer)
b->flags &= ~B_BUSY;
cv_wakeup(&b->cv);
}
// rcu_begin_read() happens in bread
gc_end_epoch();
}
void
......
......@@ -10,6 +10,22 @@
// routines. The (higher-level) system call implementations
// are in sysfile.c.
/*
* inode cache will be RCU-managed:
*
* - to evict, mark inode as a victim
* - lookups that encounter a victim inode must return an error (-E_RETRY)
* - E_RETRY rolls back to the beginning of syscall/pagefault and retries
* - out-of-memory error should be treated like -E_RETRY
* - once an inode is marked as victim, it can be gc_delayed()
* - the do_gc() method should remove inode from the namespace & free it
*
* - inodes have a refcount that lasts beyond a GC epoch
* - to bump refcount, first bump, then check victim flag
* - if victim flag is set, reduce the refcount and -E_RETRY
*
*/
#include "types.h"
#include "stat.h"
#include "mmu.h"
......@@ -185,7 +201,7 @@ ialloc(u32 dev, short type)
//cprintf("ialloc oops %d\n", inum); // XXX harmless
}
}
cprintf("ialloc: no inodes\n");
cprintf("ialloc: 0/%u inodes\n", sb.ninodes);
return nullptr;
}
......@@ -239,21 +255,21 @@ inode::~inode()
struct inode*
iget(u32 dev, u32 inum)
{
struct inode *ip;
struct inode *ip = igetnoref(dev, inum);
if (ip)
idup(ip);
return ip;
}
struct inode*
igetnoref(u32 dev, u32 inum)
{
retry:
// Try for cached inode.
gc_begin_epoch();
ip = ins->lookup(mkpair(dev, inum));
{
scoped_gc_epoch e;
struct inode *ip = ins->lookup(mkpair(dev, inum));
if (ip) {
// tricky: first bump ref, then check free flag
ip->ref++;
if (ip->flags & I_FREE) {
gc_end_epoch();
ip->ref--;
goto retry;
}
gc_end_epoch();
if (!(ip->flags & I_VALID)) {
acquire(&ip->lock);
while((ip->flags & I_VALID) == 0)
......@@ -262,49 +278,13 @@ iget(u32 dev, u32 inum)
}
return ip;
}
gc_end_epoch();
// Allocate fresh inode cache slot.
retry_evict:
(void) 0;
u32 cur_free = icache_free[mycpu()->id].x;
if (cur_free == 0) {
struct inode *victim = 0;
ins->enumerate([&victim](const pair<u32, u32>&, inode* eip)->bool{
if (eip->ref || eip->type == T_DIR)
return false;
acquire(&eip->lock);
if (eip->ref == 0 && eip->type != T_DIR &&
!(eip->flags & (I_FREE | I_BUSYR | I_BUSYW))) {
victim = eip;
return true;
}
release(&eip->lock);
return false;
});
if (!victim)
panic("iget out of space");
// tricky: first flag as free, then check refcnt, then remove from ns
victim->flags |= I_FREE;
if (victim->ref > 0) {
victim->flags &= ~(I_FREE);
release(&victim->lock);
goto retry_evict;
}
release(&victim->lock);
ins->remove(mkpair(victim->dev, victim->inum), &victim);
gc_delayed(victim);
} else {
if (!cmpxch(&icache_free[mycpu()->id].x, cur_free, cur_free-1))
goto retry_evict;
}
ip = new inode();
// Allocate fresh inode cache slot.
struct inode *ip = new inode();
ip->dev = dev;
ip->inum = inum;
ip->ref = 1;
ip->ref = 0;
ip->flags = I_BUSYR | I_BUSYW;
ip->readbusy = 1;
snprintf(ip->lockname, sizeof(ip->lockname), "cv:ino:%d", ip->inum);
......@@ -366,7 +346,7 @@ ilock(struct inode *ip, int writer)
void
iunlock(struct inode *ip)
{
if(ip == 0 || !(ip->flags & (I_BUSYR | I_BUSYW)) || ip->ref < 1)
if(ip == 0 || !(ip->flags & (I_BUSYR | I_BUSYW)))
panic("iunlock");
acquire(&ip->lock);
......@@ -407,6 +387,9 @@ iput(struct inode *ip)
ip->flags |= (I_BUSYR | I_BUSYW);
ip->readbusy++;
// XXX: use gc_delayed() to truncate the inode later.
// flag it as a victim in the meantime.
release(&ip->lock);
itrunc(ip);
......@@ -619,7 +602,10 @@ namecmp(const char *s, const char *t)
u64
namehash(const strbuf<DIRSIZ> &n)
{
return n._buf[0]; /* XXX */
u64 h = 0;
for (int i = 0; i < DIRSIZ && n._buf[i]; i++)
h = ((h << 8) ^ n._buf[i]) % 0xdeadbeef;
return h;
}
void
......@@ -751,12 +737,12 @@ namex(inode *cwd, const char *path, int nameiparent, char *name)
{
struct inode *ip, *next;
int r;
scoped_gc_epoch e;
gc_begin_epoch();
if(*path == '/')
ip = iget(ROOTDEV, ROOTINO);
ip = igetnoref(ROOTDEV, ROOTINO);
else
ip = idup(cwd);
ip = cwd;
while((r = skipelem(&path, name)) == 1){
// XXX Doing this here requires some annoying reasoning about all
......@@ -773,32 +759,30 @@ namex(inode *cwd, const char *path, int nameiparent, char *name)
if(next == 0){
if(ip->type == 0)
panic("namex");
if(ip->type != T_DIR){
iput(ip);
gc_end_epoch();
if(ip->type != T_DIR)
return 0;
}
if(nameiparent && *path == '\0'){
// Stop one level early.
gc_end_epoch();
idup(ip);
return ip;
}
if((next = dirlookup(ip, name)) == 0){
iput(ip);
gc_end_epoch();
if((next = dirlookup(ip, name)) == 0)
return 0;
}
iput(ip);
}
ip = next;
}
if(r == -1 || nameiparent){
iput(ip);
gc_end_epoch();
if(r == -1 || nameiparent)
return 0;
}
mtreadavar("inode:%x.%x", ip->dev, ip->inum);
gc_end_epoch();
// XXX write is necessary because of idup. not logically required,
// so we should replace this with mtreadavar() eventually, perhaps
// once we implement sloppy counters for long-term inode refs.
// mtreadavar("inode:%x.%x", ip->dev, ip->inum);
mtwriteavar("inode:%x.%x", ip->dev, ip->inum);
idup(ip);
return ip;
}
......
......@@ -142,17 +142,11 @@ cpunum(void)
{
// Cannot call cpu when interrupts are enabled:
// result not guaranteed to last long enough to be used!
// Would prefer to panic but even printing is chancy here:
// almost everything, including cprintf and panic, calls cpu,
// often indirectly through acquire and release.
if(readrflags()&FL_IF){
static int n __mpalign__;
if(n == 0) {
n++;
cprintf("cpu called from %p with interrupts enabled\n",
cli();
panic("cpunum() called from %p with interrupts enabled\n",
__builtin_return_address(0));
}
}
if(lapic)
return lapic[ID]>>24;
......
......@@ -13,6 +13,7 @@
#include "kalloc.hh"
#include "vm.hh"
#include "ns.hh"
#include "fcntl.h"
u64
proc_hash(const u32 &p)
......@@ -344,7 +345,10 @@ fork(int flags)
if((np = proc::alloc()) == 0)
return -1;
if(flags == 0) {
if(flags & FORK_SHARE_VMAP) {
np->vmap = myproc()->vmap;
np->vmap->ref++;
} else {
// Copy process state from p.
if((np->vmap = myproc()->vmap->copy(cow)) == 0){
ksfree(slab_stack, np->kstack);
......@@ -354,9 +358,6 @@ fork(int flags)
freeproc(np);
return -1;
}
} else {
np->vmap = myproc()->vmap;
np->vmap->ref++;
}
np->parent = myproc();
......@@ -366,16 +367,16 @@ fork(int flags)
// Clear %eax so that fork returns 0 in the child.
np->tf->rax = 0;
if (flags == 0) {
if (flags & FORK_SHARE_FD) {
myproc()->ftable->incref();
np->ftable = myproc()->ftable;
} else {
np->ftable = new filetable(*myproc()->ftable);
if (np->ftable == nullptr) {
// XXX(sbw) leaking?
freeproc(np);
return -1;
}
} else {
myproc()->ftable->incref();
np->ftable = myproc()->ftable;
}
np->cwd = idup(myproc()->cwd);
......
#include "types.h"
#include "atomic.hh"
#include "spinlock.h"
#include "kernel.hh"
#include "cpputil.hh"
#include "crange_arch.hh"
#include "radix.hh"
// Returns the level we stopped at.
......
......@@ -215,6 +215,9 @@ create(inode *cwd, const char *path, short type, short major, short minor)
{
struct inode *ip, *dp;
char name[DIRSIZ];
mt_ascope ascope("%s(%d.%d,%s,%d,%d,%d)",
__func__, cwd->dev, cwd->inum,
path, type, major, minor);
retry:
if((dp = nameiparent(cwd, path, name)) == 0)
......@@ -239,6 +242,8 @@ create(inode *cwd, const char *path, short type, short major, short minor)
ip->nlink = 1;
iupdate(ip);
mtwriteavar("inode:%x.%x", ip->dev, ip->inum);
if(type == T_DIR){ // Create . and .. entries.
dp->nlink++; // for ".."
iupdate(dp);
......@@ -291,6 +296,10 @@ sys_openat(int dirfd, const char *path, int omode)
if(omode & O_CREATE){
if((ip = create(cwd, path, T_FILE, 0, 0)) == 0)
return -1;
// XXX necessary because the mtwriteavar() to the same abstract variable
// does not propagate to our scope, since create() has its own inner scope.
mtwriteavar("inode:%x.%x", ip->dev, ip->inum);
} else {
retry:
if((ip = namei(cwd, path)) == 0)
......
......@@ -2,6 +2,7 @@
#include "pthread.h"
#include "user.h"
#include "atomic.hh"
#include "fcntl.h"
enum { stack_size = 8192 };
static std::atomic<int> nextkey;
......@@ -22,7 +23,7 @@ pthread_create(pthread_t* tid, const pthread_attr_t* attr,
void* (*start)(void*), void* arg)
{
char* base = (char*) sbrk(stack_size);
int t = forkt(base + stack_size, (void*) start, arg);
int t = forkt(base + stack_size, (void*) start, arg, FORK_SHARE_VMAP | FORK_SHARE_FD);
if (t < 0)
return t;
......
......@@ -12,7 +12,7 @@ forkt:
movq %rdx, 0x00(%r12) # arg
movq %rsi, 0x08(%r12) # function ptr
movq $1, %rdi # flag for sys_fork
movq %rcx, %rdi # flag for sys_fork
movq $SYS_fork, %rax
syscall
......
......@@ -79,12 +79,6 @@ initwq(void)
wqarch_init();
}
void
exitwq(void)
{
wqarch_exit();
}
//
// wq
//
......
......@@ -11,8 +11,8 @@
#include "include/stat.h"
int nblocks = 4067;
int ninodes = 200;
int size = 4096;
int ninodes = 800;
int size = 4172;
int fsfd;
struct superblock sb;
......
CXXFLAGS := -iquote user $(filter-out -nostdinc++ -Istdinc, $(CXXFLAGS)) -msse
CXXFLAGS := -iquote user $(filter-out -nostdinc++ -Istdinc -Inet, $(CXXFLAGS)) -msse
$(O)/utest: $(O)/kernel/crange.o \
$(O)/kernel/gc.o \
$(O)/kernel/rnd.o \
$(O)/kernel/radix.o \
$(O)/user/umain.o
@echo " LD $@"
$(Q)mkdir -p $(@D)
......
......@@ -133,6 +133,12 @@ mycpu()
return (cpu*) &cpus[myproc()->cpuid];
}
// Return the numeric id of the current CPU, read from the cpu
// struct that mycpu() resolves for the running thread.
static inline int
mycpuid()
{
return mycpu()->id;
}
// No-op stubs for the kernel's pushcli/popcli interrupt guards --
// presumably this is a user-space/test build with no interrupts to
// mask; confirm against the kernel definitions.
static inline void pushcli() {}
static inline void popcli() {}
......
#include <unistd.h>
#include <signal.h>
#include <getopt.h>
#include <string.h>
#include "crange_arch.hh"
#include "gc.hh"
#include "crange.hh"
#include "radix.hh"
#include "atomic_util.hh"
#include "ns.hh"
#include "uscopedperf.hh"
......@@ -80,8 +82,13 @@ threadpin(void (*fn)(void*), void *arg, const char *name, int cpu)
makeproc(p);
}
struct my_range : public range {
my_range(crange *cr, u64 k, u64 sz) : range(cr, k, sz) {}
// Benchmark element for the crange tree; keyed by (k, sz) via the
// range base class.
struct my_crange_range : public range {
my_crange_range(crange *cr, u64 k, u64 sz) : range(cr, k, sz) {}
// Deferred-free hook invoked by the garbage collector once no
// concurrent reader can still observe this node.
virtual void do_gc() { delete this; }
};
// Benchmark element for the radix tree.  The (cr, k, sz) parameters
// are unused (empty body, base default-constructed); they exist only
// to mirror my_crange_range's constructor so the two benchmarks share
// call sites.
struct my_radix_range : public radix_elem {
my_radix_range(radix *cr, u64 k, u64 sz) {}
// Deferred-free hook invoked by the garbage collector.
virtual void do_gc() { delete this; }
};
......@@ -92,7 +99,7 @@ enum { crange_items = 1024 };
enum { random_keys = 0 };
static void
worker(void *arg)
worker_crange(void *arg)
{
crange *cr = (crange*) arg;
......@@ -106,7 +113,7 @@ worker(void *arg)
span.replace(0);
} else {
ANON_REGION("worker add", &perfgroup);
span.replace(new my_range(cr, k, 1));
span.replace(new my_crange_range(cr, k, 1));
}
}
......@@ -114,16 +121,48 @@ worker(void *arg)
}
static void
populate(void *arg)
populate_crange(void *arg)
{
crange *cr = (crange*) arg;
for (u32 i = 0; i < crange_items; i++)
cr->search_lock(1 + 2*i, 1).replace(new my_range(cr, 1+2*i, 1));
cr->search_lock(1 + 2*i, 1).replace(new my_crange_range(cr, 1+2*i, 1));
pthread_barrier_wait(&populate_b);
}
// Benchmark worker for the radix tree.  Each worker performs
// iter_total/ncpu randomized insert/delete operations on single-slot
// spans, then rendezvous on worker_b with the other workers and the
// driver.  arg is the shared radix tree.
static void
worker_radix(void *arg)
{
radix *cr = (radix*) arg;
for (u32 i = 0; i < iter_total / ncpu; i++) {
ANON_REGION("worker op", &perfgroup);
// Key selection: fully random, or fixed per-CPU, depending on the
// random_keys compile-time switch; folded into the populated range.
u64 rval = random_keys ? rnd<u32>() : myproc()->cpuid;
u64 k = 1 + rval % (crange_items * 2);
// span holds the tree lock for [k, k+1) until end of iteration.
auto span = cr->search_lock(k, 1);
if (rnd<u8>() & 1) {
ANON_REGION("worker del", &perfgroup);
// Replacing with null removes whatever element covers the span.
span.replace(k, 1, 0);
} else {
ANON_REGION("worker add", &perfgroup);
span.replace(k, 1, new my_radix_range(cr, k, 1));
}
}
pthread_barrier_wait(&worker_b);
}
static void
populate_radix(void *arg)
{
radix *cr = (radix*) arg;
for (u32 i = 0; i < crange_items; i++)
cr->search_lock(1 + 2*i, 1).replace(1+2*i, 1, new my_radix_range(cr, 1+2*i, 1));
pthread_barrier_wait(&populate_b);
}
// Long command-line options for getopt_long (see main's switch):
//   --ncpu <n>       number of worker CPUs to spawn
//   --tree-type <t>  which tree to benchmark: "crange" or "radix"
// The all-zero entry terminates the table, as getopt_long requires.
static const struct option long_opts[] = {
{ "ncpu", required_argument, 0, 'n' },
{ "tree-type", required_argument, 0, 't' },
{ 0, no_argument, 0, 0 }
};
......@@ -140,14 +179,17 @@ l2(u64 v)
return l;
}
enum { type_crange, type_radix };
int
main(int ac, char **av)
{
ncpu = NCPU;
int treetype = type_crange;
for (;;) {
int long_idx;
int opt = getopt_long(ac, av, "n:", long_opts, &long_idx);
int opt = getopt_long(ac, av, "n:t:", long_opts, &long_idx);
if (opt == -1)
break;
......@@ -157,6 +199,15 @@ main(int ac, char **av)
assert(ncpu <= NCPU);
break;
case 't':
if (!strcmp(optarg, "crange"))
treetype = type_crange;
else if (!strcmp(optarg, "radix"))
treetype = type_radix;
else
assert(0);
break;
case '?':
printf("Options:\n");
for (u32 i = 0; long_opts[i].name; i++)
......@@ -178,15 +229,25 @@ main(int ac, char **av)
initgc();
pthread_barrier_init(&populate_b, 0, 2);
crange cr(l2(crange_items));
threadpin(populate, &cr, "populate", 0);
radix rr(0);
if (treetype == type_crange)
threadpin(populate_crange, &cr, "populate", 0);
else if (treetype == type_radix)
threadpin(populate_radix, &rr, "populate", 0);
pthread_barrier_wait(&populate_b);
pthread_barrier_init(&worker_b, 0, ncpu+1);
for (u32 i = 0; i < ncpu; i++) {
char buf[32];
sprintf(buf, "worker%d", i);
threadpin(worker, &cr, buf, i);
if (treetype == type_crange)
threadpin(worker_crange, &cr, buf, i);
else if (treetype == type_radix)
threadpin(worker_radix, &rr, buf, i);
}
pthread_barrier_wait(&worker_b);
......
......@@ -83,11 +83,6 @@ wqarch_init(void)
}
}
static inline void
wqarch_exit(void)
{
}
#define xprintf printf
#define pushcli()
#define popcli()
您向此讨论添加了 0 人。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论