Commit cc82ee1f authored by Frans Kaashoek

Merge branch 'scale-amd64' of ssh://amsterdam.csail.mit.edu/home/am0/6.828/xv6 into scale-amd64

...@@ -46,6 +46,7 @@ include lib/Makefrag ...@@ -46,6 +46,7 @@ include lib/Makefrag
include bin/Makefrag include bin/Makefrag
include kernel/Makefrag include kernel/Makefrag
include tools/Makefrag include tools/Makefrag
-include user/Makefrag.$(HW)
$(O)/%.o: %.c $(O)/%.o: %.c
@echo " CC $@" @echo " CC $@"
...@@ -57,6 +58,11 @@ $(O)/%.o: %.cc ...@@ -57,6 +58,11 @@ $(O)/%.o: %.cc
$(Q)mkdir -p $(@D) $(Q)mkdir -p $(@D)
$(Q)$(CXX) $(CXXFLAGS) $(XXFLAGS) -c -o $@ $< $(Q)$(CXX) $(CXXFLAGS) $(XXFLAGS) -c -o $@ $<
$(O)/%.o: %.S
@echo " CC $@"
$(Q)mkdir -p $(@D)
$(Q)$(CC) $(ASFLAGS) -c -o $@ $<
xv6memfs.img: bootblock kernelmemfs xv6memfs.img: bootblock kernelmemfs
dd if=/dev/zero of=xv6memfs.img count=10000 dd if=/dev/zero of=xv6memfs.img count=10000
dd if=bootblock of=xv6memfs.img conv=notrunc dd if=bootblock of=xv6memfs.img conv=notrunc
......
...@@ -50,3 +50,8 @@ ...@@ -50,3 +50,8 @@
$ CC=gcc CXX=g++ ./configure --prefix=[PREFIX] \ $ CC=gcc CXX=g++ ./configure --prefix=[PREFIX] \
--enable-targets=x86_64 --enable-optimized --enable-targets=x86_64 --enable-optimized
$ CC=gcc CXX=g++ make && make install $ CC=gcc CXX=g++ make && make install
* user-space version
$ make HW=user o.user/utest
extern "C" {
#include "types.h" #include "types.h"
#include "stat.h" #include "stat.h"
#include "user.h" #include "user.h"
}
int int
main(int argc, char **argv) main(int argc, char **argv)
......
extern "C" {
#include "types.h" #include "types.h"
#include "stat.h" #include "stat.h"
#include "user.h" #include "user.h"
}
int int
main(int argc, char *argv[]) main(int argc, char *argv[])
......
...@@ -18,6 +18,7 @@ UPROGS= \ ...@@ -18,6 +18,7 @@ UPROGS= \
usertests \ usertests \
lockstat \ lockstat \
preadtest \ preadtest \
ftest \
perf perf
ifeq ($(HAVE_LWIP),y) ifeq ($(HAVE_LWIP),y)
......
#include "types.h"
#include "stat.h"
#include "fcntl.h"
#include "user.h"
#include "lib.h"
#include "amd64.h"
#include "ipc.hh"
#include "stream.h"
#define FSIZE (64 << 10)
#define BSIZE 4096
static char wbuf[512];
static char rbuf[BSIZE];
static int check = 0;
// ftest: write a 64KB patterned file, then read it back through the
// user-level stdio-like FILE layer (fdopen/fread) to exercise it.
int
main(int ac, char **av)
{
  size_t count;
  off_t off;
  FILE *fp;
  int fd;
  int i;

  // The kernel/user shared IPC region backs the async-read path of
  // the FILE layer; start from a clean state.
  memset(ipcctl, 0, sizeof(*ipcctl));

  // Fill the write buffer with a repeating 0..15 pattern.  Because
  // sizeof(wbuf) (512) is a multiple of 16, byte k of the file is
  // always k%16, which is what the read-back check relies on.
  for (i = 0; i < sizeof(wbuf); i++)
    wbuf[i] = i % 16;

  unlink("ftest.x");
  fd = open("ftest.x", O_CREATE|O_RDWR);
  if (fd < 0)
    die("open failed");

  for (i = 0; i < FSIZE; ) {
    count = MIN(sizeof(wbuf), FSIZE-i);
    if (write(fd, wbuf, count) != count)
      die("write failed");
    i += count;
  }

  // fdopen keeps its own offset (starting at 0) and reads via pread,
  // so the fd's write position does not need to be rewound.
  fp = fdopen(fd, "r");
  if (fp == 0)
    die("fdopen");

  // Read back in BSIZE chunks; verify contents only when `check`
  // is enabled (off by default).
  off = 0;
  while ((count = fread(rbuf, 1, BSIZE, fp))) {
    if (check) {
      for (i = 0; i < count; i++)
        if (rbuf[i] != (i+off)%16)
          die("ftest %u: %u != %u", i, (int)(rbuf[i]), (off+i)%16);
      off += count;
    }
  }

  fclose(fp);
  exit();
}
extern "C" {
#include "types.h" #include "types.h"
#include "user.h" #include "user.h"
}
int int
main(int argc, char *argv[]) main(int argc, char *argv[])
......
...@@ -7,9 +7,6 @@ ...@@ -7,9 +7,6 @@
#include "amd64.h" #include "amd64.h"
#include "ipc.hh" #include "ipc.hh"
// XXX(sbw) add a memlayout.h?
#define KSHARED 0xFFFFF00000000000ull
#define FSIZE (64 << 10) #define FSIZE (64 << 10)
#define BSIZE 4096 #define BSIZE 4096
#define PSIZE (4*BSIZE) #define PSIZE (4*BSIZE)
...@@ -18,8 +15,6 @@ static int use_async; ...@@ -18,8 +15,6 @@ static int use_async;
static char buf[BSIZE]; static char buf[BSIZE];
struct ipcctl *ipcctl = (struct ipcctl*)KSHARED;
struct { struct {
u64 acount; u64 acount;
u64 atot; u64 atot;
...@@ -28,50 +23,6 @@ struct { ...@@ -28,50 +23,6 @@ struct {
u64 ptot; u64 ptot;
} stats; } stats;
static msgid_t
ipc_msg_alloc(void)
{
if (ipcctl->msghead - ipcctl->msgtail == IPC_NMSG)
return NULL_MSGID;
msgid_t i = ipcctl->msghead % IPC_NMSG;
ipcctl->msghead++;
return i;
}
static void
ipc_msg_free(int msgid)
{
msgid_t i;
i = ipcctl->msgtail % IPC_NMSG;
if (i != msgid)
die("ipc_free_msg: oops");
ipcctl->msgtail++;
}
static pageid_t
ipc_page_alloc(void)
{
if (ipcctl->pagehead - ipcctl->pagetail == IPC_NPAGE)
return NULL_PAGEID;
pageid_t i = ipcctl->pagehead % IPC_NPAGE;
ipcctl->pagehead++;
return i;
}
static void
ipc_page_free(pageid_t pageid)
{
pageid_t i;
i = ipcctl->pagetail % IPC_NPAGE;
if (i != pageid)
die("ipc_free_page: oops");
ipcctl->pagetail++;
}
static void static void
kernlet_pread(int fd, size_t count, off_t off) kernlet_pread(int fd, size_t count, off_t off)
{ {
......
class arc4 {
public:
arc4(const u8 *key, size_t nbytes) {
reset();
for (size_t n = 0; n < nbytes; n += 128)
addkey(&key[n], nbytes > n + 128 ? 128 : n + 128 - nbytes);
j = i;
}
u8 getbyte() {
uint8_t si, sj;
i = (i + 1) & 0xff;
si = s[i];
j = (j + si) & 0xff;
sj = s[j];
s[i] = sj;
s[j] = si;
return s[(si + sj) & 0xff];
}
template<class T> T rand() {
T v;
for (u32 i = 0; i < sizeof(v); i++)
*(u8*) &v = getbyte();
return v;
}
private:
void reset() {
i = 0xff;
j = 0;
for (u32 n = 0; n < 0x100; n++)
s[n] = n;
}
void addkey(const u8 *key, size_t nbytes) {
size_t n, keypos;
uint8_t si;
for (n = 0, keypos = 0; n < 256; n++, keypos++) {
if (keypos >= nbytes)
keypos = 0;
i = (i + 1) & 0xff;
si = s[i];
j = (j + si + key[keypos]) & 0xff;
s[i] = s[j];
s[j] = si;
}
}
u8 i;
u8 j;
u8 s[256];
};
...@@ -13,31 +13,4 @@ ...@@ -13,31 +13,4 @@
#define _GLIBCXX_ATOMIC_BUILTINS_8 1 #define _GLIBCXX_ATOMIC_BUILTINS_8 1
#include "atomic_std.h" #include "atomic_std.h"
#include "atomic_util.hh"
template<class T>
bool
cmpxch(std::atomic<T> *a, T expected, T desired)
{
return a->compare_exchange_weak(expected, desired);
}
template<class T>
bool
cmpxch(volatile std::atomic<T> *a, T expected, T desired)
{
return a->compare_exchange_weak(expected, desired);
}
template<class T>
bool
cmpxch_update(std::atomic<T> *a, T *expected, T desired)
{
return a->compare_exchange_weak(*expected, desired);
}
template<class T>
bool
cmpxch_update(volatile std::atomic<T> *a, T *expected, T desired)
{
return a->compare_exchange_weak(*expected, desired);
}
#pragma once
template<class T>
bool
cmpxch(std::atomic<T> *a, T expected, T desired)
{
return a->compare_exchange_weak(expected, desired);
}
template<class T>
bool
cmpxch(volatile std::atomic<T> *a, T expected, T desired)
{
return a->compare_exchange_weak(expected, desired);
}
template<class T>
bool
cmpxch_update(std::atomic<T> *a, T *expected, T desired)
{
return a->compare_exchange_weak(*expected, desired);
}
template<class T>
bool
cmpxch_update(volatile std::atomic<T> *a, T *expected, T desired)
{
return a->compare_exchange_weak(*expected, desired);
}
#pragma once #pragma once
#include "atomic.hh"
using std::atomic;
struct crange; struct crange;
struct crange_locked; struct crange_locked;
struct range; struct range;
...@@ -18,7 +14,7 @@ class markptr_mark; ...@@ -18,7 +14,7 @@ class markptr_mark;
template<class T> template<class T>
class markptr { class markptr {
protected: protected:
atomic<uptr> _p; std::atomic<uptr> _p;
public: public:
markptr() : _p(0) {} markptr() : _p(0) {}
...@@ -88,7 +84,7 @@ struct range : public rcu_freed { ...@@ -88,7 +84,7 @@ struct range : public rcu_freed {
private: private:
const u64 key; const u64 key;
const u64 size; const u64 size;
atomic<int> curlevel; // the current levels it appears on std::atomic<int> curlevel; // the current levels it appears on
const int nlevel; // the number of levels this range should appear const int nlevel; // the number of levels this range should appear
crange *const cr; // the crange this range is part of crange *const cr; // the crange this range is part of
markptr<range>* const next; // one next pointer per level markptr<range>* const next; // one next pointer per level
......
#include "types.h"
#include "kernel.hh"
#include "spinlock.h"
#include "condvar.h"
#include "cpputil.hh"
#include "atomic.hh"
#include "proc.hh"
#include "cpu.hh"
...@@ -7,7 +7,7 @@ u64 namehash(const strbuf<DIRSIZ>&); ...@@ -7,7 +7,7 @@ u64 namehash(const strbuf<DIRSIZ>&);
struct file { struct file {
enum { FD_NONE, FD_PIPE, FD_INODE, FD_SOCKET } type; enum { FD_NONE, FD_PIPE, FD_INODE, FD_SOCKET } type;
atomic<int> ref; // reference count std::atomic<int> ref; // reference count
char readable; char readable;
char writable; char writable;
...@@ -24,13 +24,13 @@ struct inode : public rcu_freed { ...@@ -24,13 +24,13 @@ struct inode : public rcu_freed {
u32 dev; // Device number u32 dev; // Device number
u32 inum; // Inode number u32 inum; // Inode number
u32 gen; // Generation number u32 gen; // Generation number
atomic<int> ref; // Reference count std::atomic<int> ref; // Reference count
int flags; // I_BUSY, I_VALID int flags; // I_BUSY, I_VALID
atomic<int> readbusy; std::atomic<int> readbusy;
struct condvar cv; struct condvar cv;
struct spinlock lock; struct spinlock lock;
char lockname[16]; char lockname[16];
atomic<xns<strbuf<DIRSIZ>, u32, namehash>*> dir; std::atomic<xns<strbuf<DIRSIZ>, u32, namehash>*> dir;
short type; // copy of disk inode short type; // copy of disk inode
short major; short major;
......
...@@ -37,3 +37,8 @@ class scoped_gc_epoch { ...@@ -37,3 +37,8 @@ class scoped_gc_epoch {
} }
}; };
void initgc(void);
void initprocgc(struct proc *);
void gc_start(void);
void gc_delayed(rcu_freed *);
#define IPC_NMSG 16 // XXX(sbw) add a memlayout.h?
typedef u32 msgid_t; #define KSHARED 0xFFFFF00000000000ull
#define NULL_MSGID (-1)
#define IPC_NPAGE ((KSHAREDSIZE/PGSIZE) - 1)
typedef u32 pageid_t; typedef u32 pageid_t;
typedef u32 msgid_t;
#define IPC_CTLSIZE 4096
#define IPC_PGSIZE 4096
#define IPC_NMSG 16
#define NULL_MSGID (-1)
#define NULL_PAGEID (-1) #define NULL_PAGEID (-1)
#define IPC_NPAGE ((KSHAREDSIZE/IPC_PGSIZE) - 1)
struct ipcmsg { struct ipcmsg {
volatile char done:1; volatile char done:1;
...@@ -16,10 +21,28 @@ struct ipcmsg { ...@@ -16,10 +21,28 @@ struct ipcmsg {
}; };
struct ipcctl { struct ipcctl {
int msghead; volatile int msghead;
int msgtail; volatile int msgtail;
struct ipcmsg msg[IPC_NMSG]; struct ipcmsg msg[IPC_NMSG];
int pagehead; volatile int pagehead;
int pagetail; volatile int pagetail;
}; };
extern struct ipcctl *ipcctl;
msgid_t ipc_msg_alloc(void);
void ipc_msg_free(int msgid);
pageid_t ipc_page_alloc(void);
void ipc_page_free(pageid_t pageid);
static inline struct ipcmsg*
getmsg(msgid_t id)
{
return &ipcctl->msg[id];
}
static inline char*
getpage(pageid_t id)
{
return (char*)(KSHARED+IPC_CTLSIZE+(id*IPC_PGSIZE));
}
...@@ -100,16 +100,6 @@ int dirlink(struct inode*, const char*, u32); ...@@ -100,16 +100,6 @@ int dirlink(struct inode*, const char*, u32);
void dir_init(struct inode *dp); void dir_init(struct inode *dp);
void dir_flush(struct inode *dp); void dir_flush(struct inode *dp);
// gc.c
void initgc(void);
void initprocgc(struct proc *);
void gc_start(void);
#ifdef __cplusplus
class rcu_freed;
void gc_delayed(rcu_freed *);
#endif
// hz.c // hz.c
void microdelay(u64); void microdelay(u64);
u64 nsectime(void); u64 nsectime(void);
...@@ -182,16 +172,13 @@ void userinit(void); ...@@ -182,16 +172,13 @@ void userinit(void);
int wait(void); int wait(void);
void yield(void); void yield(void);
struct proc* threadalloc(void (*fn)(void*), void *arg); struct proc* threadalloc(void (*fn)(void*), void *arg);
void threadpin(void (*fn)(void*), void *arg, const char *name, int cpu);
// prof.c // prof.c
extern int profenable; extern int profenable;
void profreset(void); void profreset(void);
void profdump(void); void profdump(void);
// rnd.c
u64 rnd();
// sampler.c // sampler.c
void sampstart(void); void sampstart(void);
int sampintr(struct trapframe*); int sampintr(struct trapframe*);
......
#pragma once #pragma once
#include "gc.hh" #include "gc.hh"
#include "atomic.hh"
using std::atomic;
// name spaces // name spaces
// XXX maybe use open hash table, no chain, better cache locality // XXX maybe use open hash table, no chain, better cache locality
...@@ -18,8 +15,8 @@ template<class K, class V> ...@@ -18,8 +15,8 @@ template<class K, class V>
class xelem : public rcu_freed { class xelem : public rcu_freed {
public: public:
V val; V val;
atomic<int> next_lock; std::atomic<int> next_lock;
atomic<xelem<K, V>*> volatile next; std::atomic<xelem<K, V>*> volatile next;
K key; K key;
xelem(const K &k, const V &v) : rcu_freed("xelem"), val(v), next_lock(0), next(0), key(k) {} xelem(const K &k, const V &v) : rcu_freed("xelem"), val(v), next_lock(0), next(0), key(k) {}
...@@ -28,14 +25,14 @@ class xelem : public rcu_freed { ...@@ -28,14 +25,14 @@ class xelem : public rcu_freed {
template<class K, class V> template<class K, class V>
struct xbucket { struct xbucket {
atomic<xelem<K, V>*> volatile chain; std::atomic<xelem<K, V>*> volatile chain;
} __attribute__((aligned (CACHELINE))); } __attribute__((aligned (CACHELINE)));
template<class K, class V, u64 (*HF)(const K&)> template<class K, class V, u64 (*HF)(const K&)>
class xns : public rcu_freed { class xns : public rcu_freed {
private: private:
bool allowdup; bool allowdup;
atomic<u64> nextkey; std::atomic<u64> nextkey;
xbucket<K, V> table[NHASH]; xbucket<K, V> table[NHASH];
public: public:
...@@ -109,8 +106,8 @@ class xns : public rcu_freed { ...@@ -109,8 +106,8 @@ class xns : public rcu_freed {
scoped_gc_epoch gc; scoped_gc_epoch gc;
for (;;) { for (;;) {
atomic<int> fakelock(0); std::atomic<int> fakelock(0);
atomic<int> *pelock = &fakelock; std::atomic<int> *pelock = &fakelock;
auto pe = &table[i].chain; auto pe = &table[i].chain;
for (;;) { for (;;) {
......
#pragma once
u64 rnd();
// Minimal user-level stdio-like stream interface (read-only).
typedef struct fstream {
int fd;            // underlying file descriptor
off_t off;         // current read offset (reads go through pread)
off_t poff;        // offset up to which async prefetch has been issued
struct stat stat;  // snapshot taken at fdopen time (used for file size)
// NOTE(review): plain `int` bitfields may be signed, so a set flag
// can read back as -1 -- treat feof()/ferror() results as booleans.
int err:1;         // sticky error flag
int eof:1;         // sticky end-of-file flag
int pfill:1;       // prefetch enabled (fdopen mode "rp")
} FILE;
FILE *fdopen(int fd, const char *mode);
int fclose(FILE *fp);
size_t fread(void *ptr, size_t size, size_t nmemb, FILE *fp);
int feof(FILE *fp);
int ferror(FILE *fp);
...@@ -67,10 +67,6 @@ $(O)/kernel/%.o: CXXFLAGS+=-mcmodel=large ...@@ -67,10 +67,6 @@ $(O)/kernel/%.o: CXXFLAGS+=-mcmodel=large
$(O)/kernel/incbin.o: ASFLAGS+=-DMAKE_OUT=$(O) $(O)/kernel/incbin.o: ASFLAGS+=-DMAKE_OUT=$(O)
$(O)/kernel/incbin.o: $(O)/kernel/initcode $(O)/kernel/bootother $(O)/fs.img $(O)/kernel/incbin.o: $(O)/kernel/initcode $(O)/kernel/bootother $(O)/fs.img
$(O)/kernel/%.o: kernel/%.S
@echo " CC $@"
$(Q)mkdir -p $(@D)
$(Q)$(CC) $(ASFLAGS) -c -o $@ $<
$(O)/kernel/initcode: TTEXT = 0x0 $(O)/kernel/initcode: TTEXT = 0x0
$(O)/kernel/bootother: TTEXT = 0x7000 $(O)/kernel/bootother: TTEXT = 0x7000
......
...@@ -27,6 +27,12 @@ operator delete(void *p) ...@@ -27,6 +27,12 @@ operator delete(void *p)
} }
void void
operator delete[](void *p)
{
kmfree(p);
}
void
__cxa_pure_virtual(void) __cxa_pure_virtual(void)
{ {
panic("__cxa_pure_virtual"); panic("__cxa_pure_virtual");
......
#include "types.h" #include "crange_arch.hh"
#include "kernel.hh"
#include "mmu.h"
#include "spinlock.h"
#include "condvar.h"
#include "queue.h"
#include "proc.hh"
#include "cpu.hh"
#include "gc.hh" #include "gc.hh"
#include "crange.hh" #include "crange.hh"
#include "cpputil.hh" #include "rnd.hh"
// //
// Concurrent atomic range operations using skip lists. An insert may split an // Concurrent atomic range operations using skip lists. An insert may split an
...@@ -92,14 +85,14 @@ range::print(int l) ...@@ -92,14 +85,14 @@ range::print(int l)
range::~range() range::~range()
{ {
dprintf("%d: range_free: 0x%lx 0x%lx-0x%lx(%ld)\n", myproc()->cpuid, (u64) this, key, key+size, size); //dprintf("%d: range_free: 0x%lx 0x%lx-0x%lx(%ld)\n", myproc()->cpuid, (u64) this, key, key+size, size);
cr->check(this); cr->check(this);
// assert(curlevel == -1); // assert(curlevel == -1);
for (int l = 0; l < nlevel; l++) { for (int l = 0; l < nlevel; l++) {
next[l] = (struct range *) 0xDEADBEEF; next[l] = (struct range *) 0xDEADBEEF;
} }
kmalignfree(lock); kmalignfree(lock);
kmfree(next); delete[] next;
} }
void void
...@@ -107,7 +100,7 @@ range::dec_ref(void) ...@@ -107,7 +100,7 @@ range::dec_ref(void)
{ {
int n = curlevel--; int n = curlevel--;
if (n == 0) { // now removed from all levels. if (n == 0) { // now removed from all levels.
dprintf("%d: free_delayed: 0x%lx 0x%lx-0x%lx(%lu) %lu\n", myproc()->pid, (long) this, key, key + size, size, myproc()->epoch); //dprintf("%d: free_delayed: 0x%lx 0x%lx-0x%lx(%lu) %lu\n", myproc()->pid, (long) this, key, key + size, size, myproc()->epoch);
cr->check(this); cr->check(this);
assert(curlevel == -1); assert(curlevel == -1);
gc_delayed(this); gc_delayed(this);
...@@ -206,7 +199,7 @@ crange::check(struct range *absent) ...@@ -206,7 +199,7 @@ crange::check(struct range *absent)
{ {
if (!crange_checking) if (!crange_checking)
return; return;
int t = mycpu()->id; int t = -1; //mycpu()->id;
struct range *e, *s; struct range *e, *s;
for (int l = 0; l < nlevel; l++) { for (int l = 0; l < nlevel; l++) {
for (e = crange_head->next[l].ptr(); e; e = s) { for (e = crange_head->next[l].ptr(); e; e = s) {
...@@ -288,7 +281,7 @@ crange::add_index(int l, range *e, range *p1, markptr<range> s1) ...@@ -288,7 +281,7 @@ crange::add_index(int l, range *e, range *p1, markptr<range> s1)
if (l >= e->nlevel-1) return; if (l >= e->nlevel-1) return;
if (e->next[l+1].mark()) return; if (e->next[l+1].mark()) return;
// crange_check(cr, NULL); // crange_check(cr, NULL);
if (cmpxch(&e->curlevel, l, l+1)) { if (std::atomic_compare_exchange_strong(&e->curlevel, &l, l+1)) {
assert(e->curlevel < e->nlevel); assert(e->curlevel < e->nlevel);
// this is the core inserting at level l+1, but some core may be deleting // this is the core inserting at level l+1, but some core may be deleting
struct range *s = s1.ptr(); // XXX losing the mark bit ??? struct range *s = s1.ptr(); // XXX losing the mark bit ???
......
#include "types.h" #include "crange_arch.hh"
#include "kernel.hh" #include "gc.hh"
#include "mmu.h" #include "atomic_util.hh"
#include "amd64.h"
#include "spinlock.h"
#include "condvar.h"
#include "queue.h"
#include "proc.hh"
#include "cpu.hh"
#include "ns.hh" #include "ns.hh"
#include "atomic.hh"
using std::atomic;
extern u64 proc_hash(const u32&); extern u64 proc_hash(const u32&);
extern xns<u32, proc*, proc_hash> *xnspid; extern xns<u32, proc*, proc_hash> *xnspid;
...@@ -79,7 +74,8 @@ gc_move_to_tofree_cpu(int c, u64 epoch) ...@@ -79,7 +74,8 @@ gc_move_to_tofree_cpu(int c, u64 epoch)
assert(gc_state[c].delayed[fe].epoch == epoch-(NEPOCH-2)); // XXX race with setting epoch = 0 assert(gc_state[c].delayed[fe].epoch == epoch-(NEPOCH-2)); // XXX race with setting epoch = 0
// unhook list for fe epoch atomically; this shouldn't fail // unhook list for fe epoch atomically; this shouldn't fail
head = gc_state[c].delayed[fe].head; head = gc_state[c].delayed[fe].head;
while (!cmpxch_update(&gc_state[c].delayed[fe].head, &head, (rcu_freed*) 0)) {} while (!std::atomic_compare_exchange_strong(&gc_state[c].delayed[fe].head,
&head, (rcu_freed*) 0)) {}
// insert list into tofree list so that each core can free in parallel and free its elements // insert list into tofree list so that each core can free in parallel and free its elements
if(gc_state[c].tofree[fe].epoch != gc_state[c].delayed[fe].epoch) { if(gc_state[c].tofree[fe].epoch != gc_state[c].delayed[fe].epoch) {
...@@ -256,18 +252,8 @@ initgc(void) ...@@ -256,18 +252,8 @@ initgc(void)
} }
for (int c = 0; c < ncpu; c++) { for (int c = 0; c < ncpu; c++) {
struct proc *gcp; char namebuf[32];
snprintf(namebuf, sizeof(namebuf), "gc_%u", c);
gcp = threadalloc(gc_worker, NULL); threadpin(gc_worker, 0, namebuf, c);
if (gcp == NULL)
panic("threadalloc: gc_worker");
snprintf(gcp->name, sizeof(gcp->name), "gc_%u", c);
gcp->cpuid = c;
gcp->cpu_pin = 1;
acquire(&gcp->lock);
gcp->state = RUNNABLE;
addrun(gcp);
release(&gcp->lock);
} }
} }
...@@ -674,3 +674,21 @@ threadalloc(void (*fn)(void *), void *arg) ...@@ -674,3 +674,21 @@ threadalloc(void (*fn)(void *), void *arg)
p->cwd = 0; p->cwd = 0;
return p; return p;
} }
// Create a kernel thread running fn(arg), give it the supplied name,
// and pin it to the given CPU before making it runnable.
void
threadpin(void (*fn)(void*), void *arg, const char *name, int cpu)
{
  struct proc *t = threadalloc(fn, arg);
  if (t == NULL)
    panic("threadpin: alloc");

  snprintf(t->name, sizeof(t->name), "%s", name);
  t->cpuid = cpu;
  t->cpu_pin = 1;

  acquire(&t->lock);
  t->state = RUNNABLE;
  addrun(t);
  release(&t->lock);
}
#include "types.h" #include "crange_arch.hh"
#include "kernel.hh" #include "rnd.hh"
#include "cpu.hh"
struct seed { struct seed {
u64 v; u64 v;
......
ULIB = ulib.o usys.o printf.o umalloc.o uthread.o fmt.o ULIB = ulib.o usys.o printf.o umalloc.o uthread.o fmt.o stream.o ipc.o
ULIB := $(addprefix $(O)/lib/, $(ULIB)) ULIB := $(addprefix $(O)/lib/, $(ULIB))
$(O)/lib/%.o: lib/%.S
@echo " CC $@"
$(Q)$(CC) $(ASFLAGS) -c -o $@ $<
.PRECIOUS: $(O)/lib/%.o .PRECIOUS: $(O)/lib/%.o
-include $(O)/lib/*.d -include $(O)/lib/*.d
#include "types.h"
#include "user.h"
#include "ipc.hh"
// The shared IPC control region lives at a fixed user virtual
// address (KSHARED), mapped by the kernel.
struct ipcctl *ipcctl = (struct ipcctl*)KSHARED;
// Allocate a message slot from the circular msg ring.
// Returns NULL_MSGID when all IPC_NMSG slots are in use.
msgid_t
ipc_msg_alloc(void)
{
if (ipcctl->msghead - ipcctl->msgtail == IPC_NMSG)
return NULL_MSGID;
msgid_t i = ipcctl->msghead % IPC_NMSG;
ipcctl->msghead++;
return i;
}
// Free a message slot.  Slots must be freed in allocation (FIFO)
// order: msgid must equal the current ring tail.
void
ipc_msg_free(int msgid)
{
msgid_t i;
i = ipcctl->msgtail % IPC_NMSG;
if (i != msgid)
die("ipc_free_msg: oops %u %u", i, msgid);
ipcctl->msgtail++;
}
// Allocate a shared data page from the circular page ring.
// Returns NULL_PAGEID when all IPC_NPAGE pages are in use.
pageid_t
ipc_page_alloc(void)
{
if (ipcctl->pagehead - ipcctl->pagetail == IPC_NPAGE)
return NULL_PAGEID;
pageid_t i = ipcctl->pagehead % IPC_NPAGE;
ipcctl->pagehead++;
return i;
}
// Free a shared page.  Like ipc_msg_free, frees must happen in FIFO
// order: pageid must equal the current ring tail.
void
ipc_page_free(pageid_t pageid)
{
pageid_t i;
i = ipcctl->pagetail % IPC_NPAGE;
if (i != pageid)
die("ipc_free_page: oops");
ipcctl->pagetail++;
}
#include "types.h"
#include "stat.h"
#include "user.h"
#include "stream.h"
#include "lib.h"
#include "amd64.h"
#include "ipc.hh"
static const size_t pstride = 4096*4;
// Submit one asynchronous read of `count` bytes at `off` into a
// freshly allocated shared IPC page, tracked by a new IPC message.
// Returns count on success, -1 on failure.
// NOTE(review): on the failure paths below, the already-allocated
// msg slot (and page, for the async() failure) is never released;
// since the rings only free in FIFO order this leaks the slot --
// confirm callers tolerate this.
static ssize_t
fasync(FILE *fp, size_t count, off_t off)
{
struct ipcmsg *msg;
msgid_t msgid;
pageid_t pageid;
msgid = ipc_msg_alloc();
if (msgid == NULL_MSGID) {
fprintf(2, "fasync: ipc_msg_alloc failed\n");
return -1;
}
pageid = ipc_page_alloc();
if (pageid == NULL_PAGEID) {
fprintf(2, "fasync: ipc_alloc_page failed\n");
return -1;
}
msg = &ipcctl->msg[msgid];
msg->done = 0;
msg->pageid = pageid;
if (async(fp->fd, count, off, msgid, pageid) != 0) {
fprintf(2, "fasync: async failed\n");
return -1;
}
return count;
}
// Issue async reads ahead of the current offset, one IPC page at a
// time, until prefetch covers pstride bytes past fp->off (or the end
// of the file).  No-op unless the stream was opened with prefetch.
static void
fprefill(FILE *fp)
{
size_t target;
if (!fp->pfill)
return;
target = MIN(fp->off + pstride, fp->stat.size);
while (target - fp->poff >= IPC_PGSIZE)
{
size_t count;
int r;
count = MIN(target - fp->poff, IPC_PGSIZE);
r = fasync(fp, count, fp->poff);
if (r < 0)
return;
fp->poff += r;
}
}
FILE*
fdopen(int fd, const char *mode)
{
FILE *fp;
if (mode[0] != 'r')
return 0;
fp = (FILE*)malloc(sizeof(*fp));
if (fp == 0)
return 0;
if (fstat(fd, &fp->stat))
return 0;
fp->fd = fd;
fp->off = 0;
fp->poff = 0;
fp->pfill = mode[1] == 'p';
fprefill(fp);
return fp;
}
// Close the stream's fd and release the FILE object.
// Returns the result of close().
int
fclose(FILE *fp)
{
  // XXX(sbw) free ipcmsgs
  int ret = close(fp->fd);
  free(fp);
  return ret;
}
// Try to satisfy a read of `count` bytes at fp->off from a completed
// (or in-flight) prefetch.  Returns the number of bytes copied, -1 if
// the async read failed, or -2 if the caller should fall back to a
// synchronous pread (prefetch disabled, or no usable message).
static ssize_t
fpostfill(void *ptr, size_t count, FILE*fp)
{
struct ipcmsg *msg;
msgid_t msgid;
if (!fp->pfill)
return -2;
again:
// The oldest outstanding message is always at the ring tail.
msgid = ipcctl->msgtail % IPC_NMSG;
msg = getmsg(msgid);
if (!msg->submitted)
return -2;
// Spin until the kernel marks the async read complete.
while (msg->done == 0)
nop_pause(); // XXX(sbw) yield somewhere?
if (msg->result == -1)
return -1;
if (msg->off > fp->off) {
// Prefetched data starts beyond the current offset; can't use it.
return -2;
} else if ((msg->off + msg->result) < fp->off) {
// Entirely behind the current offset: discard and try the next one.
msg->submitted = 0;
ipc_page_free(msg->pageid);
ipc_msg_free(msgid);
goto again;
}
// The message's page covers fp->off: copy out what we can.
char *buf = getpage(msg->pageid);
off_t boff = fp->off - msg->off;
size_t bcount = MIN(count, msg->result-boff);
memmove(ptr, buf+boff, bcount);
msg->submitted = 0;
ipc_page_free(msg->pageid);
ipc_msg_free(msgid);
return bcount;
}
// Read size*nmemb bytes at the stream offset.  First tries
// prefetched data (fpostfill); falls back to a synchronous pread.
// Sets the sticky err/eof flags and returns 0 on error/end-of-file.
// NOTE(review): unlike ISO C fread, this returns a byte count, not
// an element count -- callers in this tree rely on that.
size_t
fread(void *ptr, size_t size, size_t nmemb, FILE *fp)
{
ssize_t r;
r = fpostfill(ptr, size*nmemb, fp);
if (r == -2)
r = pread(fp->fd, ptr, size*nmemb, fp->off);
if (r < 0) {
fp->err = 1;
return 0;
} else if (r == 0) {
fp->eof = 1;
return 0;
}
fp->off += r;
// Keep the prefetch pipeline topped up past the new offset.
fprefill(fp);
return r;
}
// Nonzero once a read has hit end of file.
int
feof(FILE *fp)
{
return fp->eof;
}
// Nonzero once a read has failed.
int
ferror(FILE *fp)
{
return fp->err;
}
...@@ -14,8 +14,7 @@ LFLAGS += -llwip ...@@ -14,8 +14,7 @@ LFLAGS += -llwip
CFLAGS += -Ilwip/src/include -Inet -Ilwip/src/include/ipv4 -DLWIP CFLAGS += -Ilwip/src/include -Inet -Ilwip/src/include/ipv4 -DLWIP
CXXFLAGS += -Ilwip/src/include -Inet -Ilwip/src/include/ipv4 -DLWIP CXXFLAGS += -Ilwip/src/include -Inet -Ilwip/src/include/ipv4 -DLWIP
LWIP_CFLAGS = $(COMFLAGS) -std=c99 \ LWIP_CFLAGS = -Wno-attributes \
-Wno-attributes \
-Wno-address \ -Wno-address \
-Wno-char-subscripts \ -Wno-char-subscripts \
-Wno-unused-but-set-variable \ -Wno-unused-but-set-variable \
...@@ -26,8 +25,7 @@ LWIP_INCLUDES := \ ...@@ -26,8 +25,7 @@ LWIP_INCLUDES := \
-Ilwip/src/include \ -Ilwip/src/include \
-Ilwip/src/include/ipv4 \ -Ilwip/src/include/ipv4 \
-Inet \ -Inet \
-Inet/arch \ -Inet/arch
-I.
LWIP_SRCFILES += \ LWIP_SRCFILES += \
lwip/src/api/api_lib.c \ lwip/src/api/api_lib.c \
...@@ -59,23 +57,11 @@ LWIP_SRCFILES += \ ...@@ -59,23 +57,11 @@ LWIP_SRCFILES += \
lwip/src/core/udp.c \ lwip/src/core/udp.c \
lwip/src/netif/etharp.c \ lwip/src/netif/etharp.c \
net/sys_arch.c \ net/sys_arch.c \
net/if.c \ net/if.c
LWIP_OBJFILES := $(patsubst %.c, $(O)/%.o, $(LWIP_SRCFILES)) LWIP_OBJFILES := $(patsubst %.c, $(O)/%.o, $(LWIP_SRCFILES))
LWIP_OBJFILES := $(patsubst %.S, $(O)/%.o, $(LWIP_OBJFILES))
$(O)/net/%.o: CFLAGS+=-mcmodel=large
$(O)/net/%.o: CXXFLAGS+=-mcmodel=large $(O)/net/%.o: CXXFLAGS+=-mcmodel=large
$(O)/lwip/src/%.o: CFLAGS+=-mcmodel=large $(LWIP_CFLAGS) $(LWIP_INCLUDES)
$(O)/net/%.o: net/%.c
@echo " CC $@"
$(Q)mkdir -p $(@D)
$(Q)$(CC) $(LWIP_CFLAGS) $(LWIP_INCLUDES) -c -o $@ $<
$(O)/lwip/src/%.o: lwip/src/%.c
@echo " CC $@"
$(Q)mkdir -p $(@D)
$(Q)$(CC) $(LWIP_CFLAGS) $(LWIP_INCLUDES) -c -o $@ $<
$(O)/liblwip.a: $(LWIP_OBJFILES) $(O)/liblwip.a: $(LWIP_OBJFILES)
@echo " AR $@" @echo " AR $@"
......
...@@ -34,6 +34,10 @@ ...@@ -34,6 +34,10 @@
#define NCPU 4 // maximum number of CPUs #define NCPU 4 // maximum number of CPUs
#define MTRACE 0 #define MTRACE 0
#define PERFSIZE (512<<20ull) #define PERFSIZE (512<<20ull)
#elif defined(HW_user)
#define NCPU 256
#define MTRACE 0
#define PERFSIZE (16<<20ull)
#else #else
#error "Unknown HW" #error "Unknown HW"
#endif #endif
CXXFLAGS := -Iuser $(CXXFLAGS) -msse
$(O)/utest: $(O)/kernel/crange.o \
$(O)/kernel/gc.o \
$(O)/kernel/rnd.o \
$(O)/user/umain.o
@echo " LD $@"
$(Q)mkdir -p $(@D)
$(Q)$(CXX) -o $@ $^ -lpthread -lrt
#include <inttypes.h>
#include <stdio.h>
#include <assert.h>
#include <pthread.h>
#include <stdlib.h>
#include <atomic>
#include <utility>
extern "C" {
#include <string.h>
}
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
typedef uintptr_t uptr;
#define cprintf(...) printf(__VA_ARGS__)
#define panic(...) do { printf(__VA_ARGS__); assert(0); } while (0)
#define LOCKSTAT_CRANGE 0
#define LOCKSTAT_GC 0
struct spinlock {
pthread_mutex_t mu;
};
struct condvar {
pthread_cond_t cv;
};
static inline void
acquire(spinlock *s)
{
pthread_mutex_lock(&s->mu);
}
static inline void
release(spinlock *s)
{
pthread_mutex_unlock(&s->mu);
}
static inline int
tryacquire(spinlock *s)
{
return !pthread_mutex_trylock(&s->mu);
}
static inline void
initlock(spinlock *s, const char *m, int lockstat)
{
memset(s, 0, sizeof(*s));
}
static inline void
cv_wakeup(condvar *c)
{
pthread_cond_signal(&c->cv);
}
// Sleep on condvar c (with mutex s held) until wakeup or the
// deadline `ns` (nanoseconds) passes.  pthread_cond_timedwait takes
// an absolute CLOCK_REALTIME timespec, so `ns` is presumably an
// absolute time as produced by nsectime() below -- TODO confirm at
// call sites.  The return value (timeout/error) is ignored; callers
// are expected to recheck their condition.
static inline void
cv_sleepto(condvar *c, spinlock *s, u64 ns)
{
timespec ts;
ts.tv_sec = ns / 1000000000;
ts.tv_nsec = ns % 1000000000;
pthread_cond_timedwait(&c->cv, &s->mu, &ts);
}
static inline u64
nsectime()
{
timespec ts;
clock_gettime(CLOCK_REALTIME, &ts);
return (((u64) 1000000000) * ts.tv_sec) + ts.tv_nsec;
}
static inline void
initcondvar(condvar *c, const char *m)
{
memset(c, 0, sizeof(*c));
}
static inline int
kmalign(void **ptr, size_t align, size_t size)
{
return posix_memalign(ptr, align, size);
}
static inline void
kmalignfree(void *ptr)
{
free(ptr);
}
struct proc {
spinlock gc_epoch_lock;
u64 epoch;
u64 epoch_depth;
u32 cpuid;
u32 pid;
char name[32];
void (*f) (void*);
void *farg;
};
struct cpu {
u32 id;
};
extern pthread_key_t myproc_key;
extern cpu cpus[];
extern proc procs[];
extern u32 ncpu;
extern u64 ticks;
static inline proc*
myproc()
{
return (proc*) pthread_getspecific(myproc_key);
}
static inline cpu*
mycpu()
{
return (cpu*) &cpus[myproc()->cpuid];
}
static inline void pushcli() {}
static inline void popcli() {}
void threadpin(void (*fn)(void*), void *arg, const char *name, int cpu);
namespace intelctr {
using scopedperf::tsc_ctr;
using scopedperf::pmc_setup;
static tsc_ctr tsc;
static pmc_setup<48> l2_ld_hit(0x00410124, "l2 ld hit");
static pmc_setup<48> l2_ld_miss(0x00410224, "l2 ld miss");
// rfo: request for ownership (~write)
static pmc_setup<48> l2_rfo_hit(0x00410424, "l2 rfo hit");
static pmc_setup<48> l2_rfo_miss(0x00410824, "l2 rfo miss");
static pmc_setup<48> l2_i_hit(0x00411024, "l2 i hit");
static pmc_setup<48> l2_i_miss(0x00412024, "l2 i miss");
static pmc_setup<48> l2_prefetch_hit(0x00414024, "l2 pref hit");
static pmc_setup<48> l2_prefetch_miss(0x00418024, "l2 pref miss");
static pmc_setup<48> l2_prefetch(0x0041c024, "l2 prefetch"); // ~zero
static pmc_setup<48> l2_miss(0x0041aa24, "l2 all miss");
static pmc_setup<48> l2_refs(0x0041ff24, "l2 all refs");
// ---
static pmc_setup<48> l2_ld_demand(0x00410f26, "l2 demand ld");
static pmc_setup<48> l2_ld_demand_i(0x00410126, "l2 dem ld i"); // ~l2_ld_miss
static pmc_setup<48> l2_ld_demand_s(0x00410226, "l2 dem ld s"); // ~l2_rfo_miss
static pmc_setup<48> l2_ld_demand_e(0x00410426, "l2 dem ld e");
static pmc_setup<48> l2_ld_demand_m(0x00410826, "l2 dem ld m");
static pmc_setup<48> l2_ld_prefetch(0x0041f026, "l2 prefetch ld"); // ~zero
// ---
static pmc_setup<48> l2_wr_i(0x00410127, "l2 write i");
static pmc_setup<48> l2_wr_s(0x00410227, "l2 write s");
static pmc_setup<48> l2_wr_m(0x00410827, "l2 write m");
static pmc_setup<48> l2_wr_sem(0x00410e27, "l2 write sem");
static pmc_setup<48> l2_wr(0x00410f27, "l2 write"); // l2_wr_i + l2_wr_sem
static pmc_setup<48> l2_wrlk(0x0041f027, "l2 wrlk"); // ??
// ---
// where do loads come from? interesting, but maybe inaccurate?
// doesn't add up to other l2 counters..
static pmc_setup<48> ld_l1hit(0x004101cb, "ld l1 hit");
static pmc_setup<48> ld_l2hit(0x004102cb, "ld l2 hit");
static pmc_setup<48> ld_l3hit_unsh(0x004104cb, "ld l3 unsh");
static pmc_setup<48> ld_l2other(0x004108cb, "ld l2 other");
static pmc_setup<48> ld_offdie(0x004110cb, "ld offdie");
static pmc_setup<48> ld_lfb(0x004140cb, "ld lfb");
static pmc_setup<48> ld_dtlbmiss(0x004180cb, "ld dtlb-miss");
// ---
static pmc_setup<48> uops(0x0041010e, "uops_issued");
static pmc_setup<48> mem_loads(0x0041010b, "mem load ins");
static pmc_setup<48> mem_stores(0x0041020b, "mem store ins");
static pmc_setup<48> dtlb_miss(0x00410149, "dtlb miss");
static pmc_setup<48> itlb_miss(0x00410185, "itlb miss");
}
#pragma once
/*
* Canonical location:
* git+ssh://amsterdam.csail.mit.edu/home/am1/prof/proftools.git
* under spmc/lib/scopedperf.hh
*/
#include <string>
#include <iostream>
#include <iomanip>
#include <sstream>
#include <vector>
#include <algorithm>
#include <assert.h>
#include <string.h>
#include <stdint.h>
#include <sys/time.h>
namespace scopedperf {
/*
* statically enable/disable most of the generated code for profiling.
*/
class default_enabler {
public:
bool enabled() const { return true; }
};
class always_enabled {
public:
bool enabled() const { return true; }
};
class always_disabled {
public:
bool enabled() const { return false; }
};
/*
* spinlock: mostly to avoid pthread mutex sleeping.
*/
/*
 * Busy-waiting lock built on a GCC atomic compare-and-swap;
 * avoids putting threads to sleep the way a pthread mutex would.
 */
class spinlock {
 public:
  spinlock() : locked(0) {}

  // Spin until the 0 -> 1 transition succeeds.
  void acquire() {
    for (;;) {
      if (__sync_bool_compare_and_swap(&locked, 0, 1))
        break;
    }
  }

  // Unlock by storing 0; `locked` is volatile so the store is emitted.
  void release() {
    locked = 0;
  }

 private:
  volatile uint locked;
};
/*
 * RAII guard for spinlock: acquires on construction, releases at
 * scope exit.  release() may also be called early; it is idempotent.
 */
class scoped_spinlock {
 public:
  scoped_spinlock(spinlock *larg) : l(larg), held(false) {
    l->acquire();
    held = true;
  }

  ~scoped_spinlock() { release(); }

  // Drop the lock before the end of scope; safe to call repeatedly.
  void release() {
    if (held) {
      l->release();
      held = false;
    }
  }

 private:
  spinlock *const l;
  bool held;
};
/*
* ctrgroup: a group of performance counters.
*/
// A group of counters sampled together, built as a recursive template
// chain; this is the forward declaration plus the empty-group base case.
template<typename... Counters>
class ctrgroup_chain;
template<>
class ctrgroup_chain<> {
public:
ctrgroup_chain() {}
// Recursion terminators: zero counters, no samples, no names.
static const uint nctr = 0;
void get_samples(uint64_t *v) const {}
void get_delta(uint64_t *delta, uint64_t *prev) const {}
std::vector<std::string> get_names() const { return {}; }
};
// Recursive case: one counter plus the chain of the remaining ones.
template<typename One, typename... Others>
class ctrgroup_chain<One, Others...> : ctrgroup_chain<Others...> {
public:
// Calls setup() on the head counter (e.g. pmc_setup allocates a slot).
ctrgroup_chain(One *x, Others*... y)
: ctrgroup_chain<Others...>(y...), ctr(x)
{
x->setup();
}
static const uint nctr = 1 + ctrgroup_chain<Others...>::nctr;
// Fill v[0..nctr-1] with current raw samples, head first.
void get_samples(uint64_t *v) const {
v[0] = ctr->sample();
ctrgroup_chain<Others...>::get_samples(v+1);
}
// delta[i] = (current - prev[i]) masked to the counter's width, so a
// counter that wrapped still yields the right difference; prev is
// updated to the current sample.
void get_delta(uint64_t *delta, uint64_t *prev) const {
uint64_t x = ctr->sample();
*delta = (x - *prev) & ctr->mask;
*prev = x;
ctrgroup_chain<Others...>::get_delta(delta+1, prev+1);
}
// Names in the same order as samples (head counter first).
std::vector<std::string> get_names() const {
std::vector<std::string> v = ctrgroup_chain<Others...>::get_names();
v.insert(v.begin(), ctr->name);
return v;
}
private:
const One *const ctr;
};
// Convenience factory: deduces the chain type from the counter pointers.
template<typename... Counters>
ctrgroup_chain<Counters...>
ctrgroup(Counters*... args)
{
return ctrgroup_chain<Counters...>(args...);
}
/*
* perfsum: aggregating counter deltas across multiple CPUs.
*/
// Registry and pretty-printer shared by all perfsum instances.
// NOTE(review): there is no unregistration path (no destructor removes
// entries); sums are expected to live for the whole run -- the macros at
// the bottom of this header create them as function-local statics.
class perfsum_base {
public:
enum display_opt { show, hide };
// Registers this sum in the global list under the registry lock.
perfsum_base(const std::string &n, display_opt d) : name(n), disp(d) {
scoped_spinlock x(get_sums_lock());
get_sums()->push_back(this);
}
// Print one table per registered, enabled, non-hidden sum, sorted by
// name: a header row of counter names, then avg / total / count rows.
// w0 and w are the column widths of the row label and the data cells.
static void printall(int w0 = 17, int w = 13) {
scoped_spinlock x(get_sums_lock());
auto sums = get_sums();
std::sort(sums->begin(), sums->end(),
[](perfsum_base *a, perfsum_base *b) { return a->name < b->name; });
for (perfsum_base *ps: *sums) {
if (ps->disp == hide || !ps->get_enabled())
continue;
// get_stats() yields (denominator, numerator) pairs per counter.
auto p = ps->get_stats();
print_row(ps->name, ps->get_names(), w0, w, [](const std::string &name)
{ return name; });
print_row("  avg", p, w0, w, [](const std::pair<uint64_t, uint64_t> &e)
{ return ((double) e.second) / (double) e.first; });
print_row("  total", p, w0, w, [](const std::pair<uint64_t, uint64_t> &e)
{ return e.second; });
print_row("  count", p, w0, w, [](const std::pair<uint64_t, uint64_t> &e)
{ return e.first; });
}
}
// Zero the accumulated statistics of every registered sum.
static void resetall() {
scoped_spinlock x(get_sums_lock());
for (perfsum_base *ps: *get_sums())
ps->reset();
}
virtual std::vector<std::pair<uint64_t, uint64_t> > get_stats() const = 0;
virtual std::vector<std::string> get_names() const = 0;
virtual bool get_enabled() const = 0;
virtual void reset() = 0;
private:
// Print one left-aligned row: label, then f(elem) for each element.
template<class Row, class Callback>
static void print_row(const std::string &rowname, const Row &r,
int w0, int w, Callback f)
{
std::cout << std::left << std::setw(w0) << rowname;
for (const auto &elem: r)
std::cout << std::left << std::setw(w) << f(elem) << " ";
std::cout << std::endl;
}
// Function-local statics: registry and its lock are constructed on
// first use, avoiding static-initialization-order problems.
static std::vector<perfsum_base*> *get_sums() {
static std::vector<perfsum_base*> v;
return &v;
}
static spinlock *get_sums_lock() {
static spinlock l;
return &l;
}
const std::string name;
const display_opt disp;
};
// Empty asm with a "memory" clobber: forbids the compiler from moving
// memory accesses across this point (no CPU fence is emitted).
static inline void
compiler_barrier()
{
/* Avoid compile-time reordering across performance counter reads */
__asm __volatile("" ::: "memory");
}
// A named statistic: accumulates per-CPU deltas of one counter group.
// The optional "base" sum makes printall() report this sum's averages
// relative to the base's per-counter totals (see perfsum_frac()).
template<typename Enabler, typename... Counters>
class perfsum_ctr : public perfsum_base, public Enabler {
public:
  perfsum_ctr(const ctrgroup_chain<Counters...> *c,
              const std::string &n, display_opt d)
    : perfsum_base(n, d), cg(c), base(0)
  {
    reset();
  }

  // Fraction variant: shares the base sum's counter group.
  perfsum_ctr(const std::string &n,
              const perfsum_ctr<Enabler, Counters...> *basesum, display_opt d)
    : perfsum_base(n, d), cg(basesum->cg), base(basesum)
  {
    reset();
  }

  // Snapshot every counter of the group into s[0 .. nctr-1].
  void get_samples(uint64_t *s) const {
    compiler_barrier();
    cg->get_samples(s);
    compiler_barrier();
  }

  // Fold the delta since snapshot s into this CPU's slot; s is advanced
  // to the current counter values so it can seed the next lap.
  void record(uint cpuid, uint64_t *s) {
    uint64_t delta[cg->nctr];
    // stat[] has exactly maxcpu slots; an unchecked cpuid (it comes from
    // sched_getcpu() in base_perf_region) would silently write out of
    // bounds on machines with more than maxcpu CPUs.
    assert(cpuid < maxcpu);
    compiler_barrier();
    cg->get_delta(delta, s);
    compiler_barrier();
    for (uint i = 0; i < cg->nctr; i++)
      stat[cpuid].sum[i] += delta[i];
    stat[cpuid].count++;
  }

  // One (denominator, numerator) pair per counter: the numerator is this
  // sum's total over all CPUs; the denominator is the base sum's matching
  // counter total, or (without a base) this sum's record() count.
  std::vector<std::pair<uint64_t, uint64_t> > get_stats() const /* override */ {
    std::vector<std::pair<uint64_t, uint64_t> > v;
    for (uint i = 0; i < cg->nctr; i++) {
      uint64_t b =
        base ? base->addcpus([i](const stats *s) { return s->sum[i]; })
             : addcpus([](const stats *s) { return s->count; });
      v.push_back(std::make_pair(b,
        addcpus([i](const stats *s) { return s->sum[i]; })));
    }
    return v;
  }

  std::vector<std::string> get_names() const /* override */ {
    return cg->get_names();
  }

  bool get_enabled() const /* override */ {
    return Enabler::enabled();
  }

  void reset() /* override */ {
    memset(stat, 0, sizeof(stat));
  }

private:
  enum { maxcpu = 256 };
  // Per-CPU accumulator, aligned to 128 bytes so slots of different CPUs
  // do not share a cache line.
  struct stats {
    uint64_t count;
    uint64_t sum[ctrgroup_chain<Counters...>::nctr];
  } __attribute__((aligned (128)));
  struct stats stat[maxcpu];
  const struct ctrgroup_chain<Counters...> *const cg;
  const struct perfsum_ctr<Enabler, Counters...> *const base;

  // Sum f(slot) over all per-CPU slots.
  template<class T>
  uint64_t addcpus(T f) const {
    uint64_t tot = 0;
    for (uint i = 0; i < maxcpu; i++)
      tot += f(&stat[i]);
    return tot;
  }
};
// A perfsum that owns its counter group inline: the object is both the
// ctrgroup_chain and the perfsum_ctr recording into it.
template<typename Enabler, typename... Counters>
class perfsum_ctr_inlinegroup :
public ctrgroup_chain<Counters...>,
public perfsum_ctr<Enabler, Counters...>
{
public:
perfsum_ctr_inlinegroup(const std::string &n, perfsum_base::display_opt d,
Counters*... ctrs)
: ctrgroup_chain<Counters...>(ctrs...),
perfsum_ctr<Enabler, Counters...>(this, n, d) {}
};
// Factory: a named sum recording deltas of an existing counter group.
template<typename Enabler = default_enabler, typename... Counters>
perfsum_ctr<Enabler, Counters...>
perfsum(const std::string &name, const ctrgroup_chain<Counters...> *c,
const perfsum_base::display_opt d = perfsum_base::show)
{
return perfsum_ctr<Enabler, Counters...>(c, name, d);
}
// Factory: a named sum that carries its own inline counter group.
template<typename Enabler = default_enabler, typename... Counters>
perfsum_ctr_inlinegroup<Enabler, Counters...>
perfsum_group(const std::string &name, Counters*... c)
{
return perfsum_ctr_inlinegroup<Enabler, Counters...>(name, perfsum_base::show, c...);
}
// Factory: a sum reported as a fraction of another sum's totals
// (shares the base's counter group; see perfsum_ctr::get_stats).
template<typename Enabler, typename... Counters>
perfsum_ctr<Enabler, Counters...>
perfsum_frac(const std::string &name,
const perfsum_ctr<Enabler, Counters...> *base)
{
return perfsum_ctr<Enabler, Counters...>(name, base, perfsum_base::show);
}
/*
* namedctr &c: actual counter implementations.
*/
// Base class for counters: carries the display name and the width mask
// used by ctrgroup_chain::get_delta to handle counter wrap-around.
template<uint64_t CounterWidth>
class namedctr {
public:
namedctr(const std::string &n) : name(n) {}
void setup() {}
const std::string name;
// Low CounterWidth bits set.  Built as ((1 << (W-1)) - 1) << 1 | 1
// rather than (1 << W) - 1 so that W == 64 does not shift by the full
// type width (which would be undefined behavior).
static const uint64_t mask =
((1ULL << (CounterWidth - 1)) - 1) << 1 | 1;
};
// Time-stamp counter via RDTSC (EDX:EAX combined into one 64-bit value).
class tsc_ctr : public namedctr<64> {
public:
tsc_ctr() : namedctr("tsc") {}
static uint64_t sample() {
uint64_t a, d;
__asm __volatile("rdtsc" : "=a" (a), "=d" (d));
return a | (d << 32);
}
};
// Time-stamp counter via RDTSCP; ECX (the processor id) is read to
// satisfy the instruction's outputs but discarded.
class tscp_ctr : public namedctr<64> {
public:
tscp_ctr() : namedctr("tscp") {}
static uint64_t sample() {
uint64_t a, d, c;
__asm __volatile("rdtscp" : "=a" (a), "=d" (d), "=c" (c));
return a | (d << 32);
}
};
// Hardware performance counter read with RDPMC.  cn is the counter
// number passed in ECX; cn == -1 means "not yet assigned" (pmc_setup
// fills it in during setup()).
template<uint64_t CounterWidth>
class pmc_ctr : public namedctr<CounterWidth> {
public:
pmc_ctr(int n) : namedctr<CounterWidth>(mkname(n)), cn(n) {}
pmc_ctr(const std::string &nm) : namedctr<CounterWidth>(nm), cn(-1) {}
uint64_t sample() const {
uint64_t a, d;
__asm __volatile("rdpmc" : "=a" (a), "=d" (d) : "c" (cn));
return a | (d << 32);
}
int cn;
private:
// Default display name: "pmc<n>".
static std::string mkname(int n) {
std::stringstream ss;
ss << "pmc" << n;
return ss.str();
}
};
// A pmc_ctr that also programs the event: setup() grabs a free counter
// slot and writes the raw event-select value pmc_v to every CPU through
// the /sys/kernel/spmc interface.
template<uint64_t CounterWidth = 64>
class pmc_setup : public pmc_ctr<CounterWidth> {
public:
pmc_setup(uint64_t v, const std::string &nm)
: pmc_ctr<CounterWidth>(nm), pmc_v(v) {}
void setup() {
// Already assigned a counter slot: nothing to do.
if (pmc_ctr<CounterWidth>::cn >= 0)
return;
/*
* XXX detect how many counters the hardware has
*/
// Claim the first free slot out of the 4 assumed counters, under a
// lock since setup() may race from multiple ctrgroup constructions.
static bool pmcuse[4];
static spinlock pmcuselock;
int n = 0;
scoped_spinlock x(&pmcuselock);
while (n < 4 && pmcuse[n])
n++;
assert(n < 4);
pmcuse[n] = true;
x.release();
// ugly but effective
std::stringstream ss;
ss << "for f in /sys/kernel/spmc/cpu*/" << n << "; do "
<< "echo " << std::hex << pmc_v << " > $f; done";
assert(0 == system(ss.str().c_str()));
pmc_ctr<CounterWidth>::cn = n;
}
private:
uint64_t pmc_v;
};
// Wall-clock "counter": gettimeofday() expressed as microseconds.
class tod_ctr : public namedctr<64> {
public:
  tod_ctr() : namedctr("tod-usec") {}

  uint64_t sample() const {
    struct timeval now;
    gettimeofday(&now, 0);
    uint64_t usec = (uint64_t) now.tv_sec * 1000000;
    usec += (uint64_t) now.tv_usec;
    return usec;
  }
};
// Constant counter: sample() always yields 0.
class zero_ctr : public namedctr<64> {
public:
  zero_ctr() : namedctr("zero") {}

  uint64_t sample() const {
    return 0;
  }
};
/*
* scoped performance-counting regions, which record samples into a perfsum.
*/
// Measurement region: samples the perfsum's counter group at construction
// and records the delta on each lap().
// NOTE(review): sched_getcpu() is declared in <sched.h>, which this header
// does not include itself -- presumably supplied by the including file;
// verify on non-glibc setups.
template<typename Enabler, typename... Counters>
class base_perf_region {
public:
base_perf_region(perfsum_ctr<Enabler, Counters...> *psarg)
: ps(psarg), enabled(ps->enabled()), cpuid(enabled ? sched_getcpu() : 0)
{
if (enabled)
ps->get_samples(s);
}
// invoke lap multiple times to precisely measure iterations
// (use same measurement for end of one & start of next round)
void lap() {
if (enabled)
ps->record(cpuid, s);
}
private:
perfsum_ctr<Enabler, Counters...> *const ps;
const bool enabled;
// CPU id captured once at region start; record() charges this slot.
const uint cpuid;
// Counter snapshot taken at region start / previous lap.
uint64_t s[ctrgroup_chain<Counters...>::nctr];
};
// Region measured once, from construction to destruction.
template<typename Enabler, typename... Counters>
class scoped_perf_region : public base_perf_region<Enabler, Counters...> {
public:
scoped_perf_region(perfsum_ctr<Enabler, Counters...> *psarg)
: base_perf_region<Enabler, Counters...>(psarg) {}
~scoped_perf_region() { base_perf_region<Enabler, Counters...>::lap(); }
};
// Like scoped_perf_region, but the measurement can be finished early
// (stop) or abandoned entirely (kill) before the destructor runs.
template<typename Enabler, typename... Counters>
class killable_perf_region : public base_perf_region<Enabler, Counters...> {
public:
killable_perf_region(perfsum_ctr<Enabler, Counters...> *psarg)
: base_perf_region<Enabler, Counters...>(psarg), active(true) {}
~killable_perf_region() { stop(); }
// perform a final measurement, if needed before destructor
void stop() {
if (active)
base_perf_region<Enabler, Counters...>::lap();
active = false;
}
// prevent destructor from performing a measurement
void kill() { active = false; }
private:
bool active;
};
// Factory helpers so callers can write "auto r = perf_region(&sum);"
// without spelling out the template arguments.
template<typename Enabler, typename... Counters>
scoped_perf_region<Enabler, Counters...>
perf_region(perfsum_ctr<Enabler, Counters...> *ps)
{
return scoped_perf_region<Enabler, Counters...>(ps);
}
template<typename Enabler, typename... Counters>
killable_perf_region<Enabler, Counters...>
killable_region(perfsum_ctr<Enabler, Counters...> *ps)
{
return killable_perf_region<Enabler, Counters...>(ps);
}
/*
* macros for the common case of putting in a scoped perf-counting region.
*/
// Token-pasting helpers to mint unique identifiers per macro expansion.
#define __PERF_CONCAT2(a, b) a ## b
#define __PERF_CONCAT(a, b) __PERF_CONCAT2(a, b)
#define __PERF_ANON __PERF_CONCAT(__anon_id_, __COUNTER__)
// Declares a function-local static perfsum (created once, lives for the
// whole run) and a region object measuring from here to end of scope.
#define __PERF_REGION(region_var, sum_var, region_type, text, group) \
static auto __PERF_CONCAT(sum_var, _sum) = scopedperf::perfsum(text, group); \
auto region_var = region_type(&__PERF_CONCAT(sum_var, _sum));
// Anonymous scoped region: measures the rest of the enclosing scope.
#define ANON_REGION(text, group) \
__PERF_REGION(__PERF_ANON, __PERF_ANON, scopedperf::perf_region, text, group)
// Named scoped region: var can be used to call lap() explicitly.
#define PERF_REGION(var, text, group) \
__PERF_REGION(var, __PERF_ANON, scopedperf::perf_region, text, group)
// Named killable region: var supports stop()/kill().
#define KILLABLE_REGION(var, text, group) \
__PERF_REGION(var, __PERF_ANON, scopedperf::killable_region, text, group)
} /* namespace scopedperf */
#include <unistd.h>
#include <signal.h>
#include <getopt.h>
#include "crange_arch.hh"
#include "gc.hh"
#include "crange.hh"
#include "atomic_util.hh"
#include "ns.hh"
#include "scopedperf.hh"
#include "intelctr.hh"
#include "arc4.hh"
#include "amd64.h"
// Counter group shared by all measurement regions in this benchmark;
// currently only the TSC (the cache-miss counters are commented out).
static auto perfgroup = ctrgroup(&intelctr::tsc
// ,&intelctr::l2_refs
// ,&intelctr::l2_miss
);
// Identity hash for the pid -> proc* namespace (xnspid below).
u64
proc_hash(const u32 &pid)
{
return pid;
}
// Per-thread slots for the current proc and its arc4 RNG state.
pthread_key_t myproc_key, arc4_key;
// Minimal user-space stand-ins for kernel globals used by crange/gc.
cpu cpus[NCPU];
u32 ncpu;
u64 ticks;
// pid -> proc* lookup table; see proc_hash above.
xns<u32, proc*, proc_hash> *xnspid;
static auto rnd_perfsum = scopedperf::perfsum("arc4 rnd", &perfgroup);
// Per-thread random value of type T from an arc4 stream; the stream is
// created lazily, seeded from rdtsc and the thread id, and stored in
// thread-local storage so threads never contend.
template<class T>
T rnd()
{
auto __PERF_ANON = scopedperf::perf_region(&rnd_perfsum);
arc4 *a = (arc4*) pthread_getspecific(arc4_key);
if (!a) {
struct seed { u64 a, b; } s = { rdtsc(), pthread_self() };
a = new arc4((u8*) &s, sizeof(s));
pthread_setspecific(arc4_key, a);
}
return a->rand<T>();
}
// pthread trampoline: installs the proc in TLS, registers it in xnspid
// under its pthread id, sets up GC for this thread, then runs the
// proc's function.
static void*
proc_start(void *arg)
{
proc *p = (proc *) arg;
pthread_setspecific(myproc_key, p);
p->pid = pthread_self();
initprocgc(p);
xnspid->insert(p->pid, p);
p->f(p->farg);
return 0;
}
// Launch p->f(p->farg) on a new pthread (entry point: proc_start).
// The thread id is not retained and the thread is never joined, so a
// failed pthread_create would previously go unnoticed and the proc
// would silently never run; fail loudly instead.
void
makeproc(proc *p)
{
  pthread_t tid;
  int r = pthread_create(&tid, 0, &proc_start, p);
  assert(r == 0);
}
// Allocate and start a named proc running fn(arg).
// NOTE(review): p->cpuid is recorded but no affinity call is visible
// here or in proc_start -- presumably the pinning happens elsewhere (or
// is a no-op in the user-space build); confirm before relying on it.
void
threadpin(void (*fn)(void*), void *arg, const char *name, int cpu)
{
proc *p = new proc();
memset(p, 0, sizeof(*p));
p->f = fn;
p->farg = arg;
snprintf(p->name, sizeof(p->name), "%s", name);
p->cpuid = cpu;
makeproc(p);
}
// Rendezvous points: populate_b gates on the initial fill, worker_b on
// all workers finishing.
static pthread_barrier_t worker_b, populate_b;
// Total operations across all workers, and initial range count.
enum { iter_total = 1000000 };
enum { crange_items = 1024 };
// Worker thread: performs this thread's share of iter_total random
// operations -- pick a key in [1, 2*crange_items], lock that span, then
// with equal probability delete it (replace with 0) or insert a fresh
// range -- and finally joins the worker barrier.
static void
worker(void *arg)
{
crange *cr = (crange*) arg;
for (u32 i = 0; i < iter_total / ncpu; i++) {
ANON_REGION("worker op", &perfgroup);
u64 k = 1 + rnd<u32>() % (crange_items * 2);
auto span = cr->search_lock(k, 1);
if (rnd<u8>() & 1) {
ANON_REGION("worker del", &perfgroup);
span.replace(0);
} else {
ANON_REGION("worker add", &perfgroup);
span.replace(new range(cr, k, 1));
}
}
pthread_barrier_wait(&worker_b);
}
// Seed the crange with crange_items length-1 ranges at odd keys
// (1, 3, 5, ...), then signal the populate barrier.
static void
populate(void *arg)
{
crange *cr = (crange*) arg;
for (u32 i = 0; i < crange_items; i++)
cr->search_lock(1 + 2*i, 1).replace(new range(cr, 1+2*i, 1));
pthread_barrier_wait(&populate_b);
}
// getopt_long option table; terminated by the all-zero sentinel entry.
static const struct option long_opts[] = {
{ "ncpu", required_argument, 0, 'n' },
{ 0, no_argument, 0, 0 }
};
// Number of significant bits in v: 0 when v == 0, floor(log2(v)) + 1
// otherwise (used to size the crange's skip-list-like levels).
static u32
l2(u64 v)
{
  u32 nbits;
  for (nbits = 0; v != 0; nbits++)
    v >>= 1;
  return nbits;
}
// Stand-alone (user-space) crange stress test: parse -n/--ncpu, fill a
// crange with crange_items entries, run ncpu workers doing random
// add/delete operations, then print the collected perf sums.
int
main(int ac, char **av)
{
  ncpu = NCPU;

  for (;;) {
    int long_idx;
    int opt = getopt_long(ac, av, "n:", long_opts, &long_idx);
    if (opt == -1)
      break;

    switch (opt) {
    case 'n':
      ncpu = atoi(optarg);
      assert(ncpu <= NCPU);
      break;

    case '?':
      printf("Options:\n");
      for (u32 i = 0; long_opts[i].name; i++)
        printf(" -%c / --%s%s\n",
               long_opts[i].val,
               long_opts[i].name,
               long_opts[i].has_arg == required_argument ? " ARG" :
               long_opts[i].has_arg == optional_argument ? " [ARG]" : "");
      exit(-1);
    }
  }

  assert(0 == pthread_key_create(&myproc_key, 0));
  assert(0 == pthread_key_create(&arc4_key, 0));
  for (u32 i = 0; i < NCPU; i++)
    cpus[i].id = i;
  xnspid = new xns<u32, proc*, proc_hash>(false);
  initgc();

  // Barrier of 2: the populate thread plus this thread.
  pthread_barrier_init(&populate_b, 0, 2);
  crange cr(l2(crange_items));
  threadpin(populate, &cr, "populate", 0);
  pthread_barrier_wait(&populate_b);

  pthread_barrier_init(&worker_b, 0, ncpu+1);
  for (u32 i = 0; i < ncpu; i++) {
    char buf[32];
    // i is a u32: the old sprintf(buf, "worker%d", i) passed an unsigned
    // argument to %d (format/type mismatch); use %u and a bounded write.
    snprintf(buf, sizeof(buf), "worker%u", i);
    threadpin(worker, &cr, buf, i);
  }
  pthread_barrier_wait(&worker_b);

  scopedperf::perfsum_base::printall();
  return 0;
}
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论