Commit eae2c654 authored by Silas Boyd-Wickizer

Merge branch 'scale-amd64' of git+ssh://amsterdam.csail.mit.edu/home/am0/6.828/xv6 into scale-amd64

@@ -24,9 +24,7 @@
// node for deletion by marking its next pointer.
//
#define assert(c) if (!(c)) { panic("assertion failure"); }
#define CRANGE_CHECKING 1
#define CRANGE_CHECKING 0
#define MINNLEVEL 10
#define MARKED(x) (((uintptr) (x)) & 0x1)
@@ -158,7 +156,7 @@ static void
clist_range_free(void *p)
{
struct clist_range *e = (struct clist_range *) p;
// cprintf("crange_free: %u(%u)\n", e->key, e->size);
if (crange_debug) cprintf("crange_free: %u(%u)\n", e->key, e->size);
crange_check(e->cr, 0, p);
for (int l = 0; l < e->nlevel; l++) {
e->next[l] = (struct clist_range *) 0xDEADBEEF;
......
@@ -9,10 +9,21 @@
#include "cpu.h"
#include "kmtrace.h"
#define NEPOCH 4
// GC scheme based on Fraser's:
// a machine has a global_epoch
// a process maintains an epoch (>= global_epoch)
// one gc thread and gc state (NEPOCH delayed lists and NEPOCH tofree lists) per core
// a process adds to its core's delayed list for its current epoch on delayed_free
// the gc performs two jobs:
// 1. one gc thread:
//    updates a process's epoch, when that process is not in an epoch
//    computes the min over all processes' epochs, and sets global_epoch to that min
//    moves each core's (global_epoch-2) delayed list to that core's tofree list
//    (cost is linear in the number of processes.)
// 2. in parallel, the per-core gc threads free the elements on their tofree lists (up to global_epoch)
//    (cost is linear in the number of elements to be freed)
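//
// Illustrative walk-through of the indexing above, assuming NEPOCH == 4 and
// global_epoch == 6 (numbers chosen only for the example):
//   a process that delays a free while in epoch 6 pushes the item onto its
//   core's delayed[6 % 4] == delayed[2] list; once every process has reached
//   epoch 6, the gc moves each core's delayed[(6-2) % 4] == delayed[0] list
//   (the items delayed during epoch 4 == global_epoch-2) onto that core's
//   tofree[0] list and advances global_epoch to 7; the per-core gc threads
//   then free tofree[0] in parallel.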
static struct { struct spinlock l __mpalign__; } rcu_lock[NCPU];
static struct { struct condvar cv __mpalign__; } rcu_cv[NCPU];
enum { gc_debug = 0 };
struct gc {
u64 epoch;
@@ -31,29 +42,43 @@ struct gc {
};
int type;
} __mpalign__;
struct gc gc_epoch[NEPOCH][NCPU] __mpalign__;
u64 global_epoch __mpalign__;
int ndelayed __mpalign__;
enum { rcu_debug = 0 };
static struct gc_state {
struct condvar cv;
struct gc delayed[NEPOCH];
struct gc tofree[NEPOCH];
int ndelayed;
int min_epoch;
} __mpalign__ gc_state[NCPU] __mpalign__;
static struct { struct spinlock l __mpalign__; } gc_lock;
u64 global_epoch __mpalign__;
struct gc *
gc_alloc()
{
return kmalloc(sizeof(struct gc));
struct gc *r = kmalloc(sizeof(struct gc));
assert(r);
gc_state[mycpu()->id].ndelayed++;
return r;
}
void *
static void *
gc_min(void *vkey, void *v, void *arg){
u64 *min_epoch_p = arg;
struct proc *p = (struct proc *) v;
acquire(&p->gc_epoch_lock);
if (p->epoch_depth == 0) {
p->epoch = global_epoch;
}
release(&p->gc_epoch_lock);
if (*min_epoch_p > p->epoch) {
*min_epoch_p = p->epoch;
}
return 0;
return NULL;
}
void
static void
gc_free_elem(struct gc *r)
{
switch (r->type) {
@@ -69,96 +94,111 @@ gc_free_elem(struct gc *r)
kmfree(r);
}
// Fraser's reclamation scheme: free all delayed-free items in global_epoch-2
static void
gc_free_epoch(u64 epoch)
static int
gc_free_tofreelist(struct gc **head, u64 epoch)
{
cprintf("free epoch %d\n", epoch);
for (int j = 0; j < NCPU; j++) {
if (__sync_bool_compare_and_swap(&global_epoch, epoch, epoch+1)) {
// only one core succeeds; that core in charge of freeing epoch
struct gc *head;
int nfree = 0;
struct gc *r, *nr;
uint32 fe = (epoch - (NEPOCH-2)) % NEPOCH;
int cas;
if (gc_epoch[fe][j].epoch != epoch - (NEPOCH-2))
panic("gc_free_epoch");
// unhook list for fe epoch atomically
head = gc_epoch[fe][j].next;
// this shouldn't fail, because no core is modifying it.
cas = __sync_bool_compare_and_swap(&gc_epoch[fe][j].next, head, 0);
if (!cas) panic("gc_free_epoch");
// free list items on the delayed list
for (r = head; r != NULL; r = nr) {
if (r->epoch > epoch-(NEPOCH-2)) {
cprintf("%lu %lu\n", r->epoch, epoch-(NEPOCH-2));
panic("gc_free_epoch");
for (r = *head; r != NULL; r = nr) {
if (r->epoch > epoch) {
cprintf("gc_free_tofreelist: r->epoch %ld > epoch %ld\n", r->epoch, epoch);
assert(0);
}
nr = r->next;
gc_free_elem(r);
int x = __sync_fetch_and_sub(&ndelayed, 1);
if (x < 0) panic("gc_free_epoch");
}
if (gc_epoch[fe][j].next != 0)
panic("gc_free_epoch");
gc_epoch[fe][j].epoch = gc_epoch[fe][j].epoch + NEPOCH;
}
nfree++;
}
*head = r;
return nfree;
}
void
gc(void)
// move a core's delayed list to its tofree list so that each core can do its own freeing
void *
gc_move_to_tofree_cpu(int c, u64 epoch)
{
u64 global = global_epoch;
u64 min = global;
ns_enumerate(nspid, gc_min, &min);
// cprintf("gc: global %lu min %lu ndelay %d\n", global_epoch, min, ndelayed);
if (min >= global) {
gc_free_epoch(min);
struct gc *head;
uint32 fe = (epoch - (NEPOCH-2)) % NEPOCH;
int cas;
assert(gc_state[c].delayed[fe].epoch == epoch-(NEPOCH-2)); // XXX race with setting epoch = 0
// unhook list for fe epoch atomically; this shouldn't fail
head = gc_state[c].delayed[fe].next;
cas = __sync_bool_compare_and_swap(&(gc_state[c].delayed[fe].next), head, 0);
assert(cas);
// insert the list into the tofree list so that each core can free its elements in parallel
if(gc_state[c].tofree[fe].epoch != gc_state[c].delayed[fe].epoch) {
cprintf("%d: tofree epoch %lu delayed epoch %lu\n", c, gc_state[c].tofree[fe].epoch,
gc_state[c].delayed[fe].epoch);
assert(0);
}
}
cas = __sync_bool_compare_and_swap(&(gc_state[c].tofree[fe].next), 0, head);
assert(cas);
// move this delayed slot NEPOCH epochs ahead
gc_state[c].delayed[fe].epoch += NEPOCH;
assert(gc_state[c].delayed[fe].next == 0);
return 0;
}
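// For example, assuming NEPOCH == 4, a call with epoch == 6 unhooks slot
// fe == (6-2) % 4 == 0: delayed[0] carries epoch 4 and holds the items delayed
// during epoch 4, and tofree[0] must already have been emptied by this core's
// gc thread (its epoch also advanced to 4), so the CAS splices the whole list
// over in one step; bumping delayed[0].epoch to 8 lets that slot collect the
// frees made during epoch 8.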
// only one thread should call this function
static void
gc_worker(void *x)
gc_move_to_tofree(u64 epoch)
{
struct spinlock wl;
initlock(&wl, "rcu_gc_worker"); // dummy lock
for (;;) {
gc();
acquire(&wl);
cv_sleep(&rcu_cv[mycpu()->id].cv, &wl);
release(&wl);
if (gc_debug)
cprintf("%d: free epoch %ld\n", mycpu()->id, epoch);
for (int c = 0; c < NCPU; c++) {
gc_move_to_tofree_cpu(c, epoch);
}
int ok = __sync_bool_compare_and_swap(&global_epoch, epoch, epoch+1);
assert(ok);
}
void
gc_start(void)
// If all threads have seen global_epoch, we can move elements in global_epoch-2 to tofreelist
static void
gc_delayfreelist(void)
{
cv_wakeup(&rcu_cv[mycpu()->id].cv);
int r = tryacquire(&gc_lock.l);
if (r == 0) return;
assert(r == 1);
u64 global = global_epoch;
u64 min = global;
// make sure global_epoch doesn't run into a core's min_epoch
for (int c = 0; c < NCPU; c++) {
int w = gc_state[c].min_epoch + NEPOCH-1;
if (w < min) {
min = w;
}
}
myproc()->epoch_depth++; // ensure ns_enumerate's call to gc_begin_epoch doesn't have side effects
ns_enumerate(nspid, gc_min, &min);
myproc()->epoch_depth--;
if (min >= global) {
gc_move_to_tofree(min);
}
release(&gc_lock.l);
}
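// Worked example of the guard above, assuming NEPOCH == 4: if one core's gc
// thread lags with min_epoch == 3, min is capped at 3 + (NEPOCH-1) == 6, so
// global_epoch stops advancing once it reaches 7; without the cap, a later
// move could target a tofree slot on the lagging core that still holds
// unfreed items from NEPOCH epochs earlier.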
static void
gc_delayed_int(struct gc *r)
{
pushcli();
int c = mycpu()->id;
u64 myepoch = myproc()->epoch;
u64 minepoch = gc_epoch[myepoch % NEPOCH][mycpu()->id].epoch;
// cprintf("%d: gc_delayed: %lu ndelayed %d\n", myproc()->pid, global_epoch, ndelayed);
u64 minepoch = gc_state[c].delayed[myepoch % NEPOCH].epoch;
if (gc_debug)
cprintf("(%d, %d): gc_delayed: %lu ndelayed %d\n", c, myproc()->pid, global_epoch, gc_state[c].ndelayed);
if (myepoch != minepoch) {
cprintf("%d: myepoch %lu minepoch %lu\n", myproc()->pid, myepoch, minepoch);
panic("gc_delayed_int");
}
r->epoch = myepoch;
do {
r->next = gc_epoch[myepoch % NEPOCH][mycpu()->id].next;
} while (!__sync_bool_compare_and_swap(&(gc_epoch[myepoch % NEPOCH][mycpu()->id].next), r->next, r));
r->next = gc_state[c].delayed[myepoch % NEPOCH].next;
} while (!__sync_bool_compare_and_swap(&(gc_state[c].delayed[myepoch % NEPOCH].next), r->next, r));
popcli();
}
@@ -188,31 +228,81 @@ gc_delayed2(int a1, u64 a2, void (*dofree)(int,u64))
}
void
gc_start(void)
{
cv_wakeup(&gc_state[mycpu()->id].cv);
}
void
gc_begin_epoch(void)
{
if (myproc() && myproc()->rcu_read_depth++ == 0)
myproc()->epoch = global_epoch;
__sync_synchronize();
if (myproc() == NULL) return;
acquire(&myproc()->gc_epoch_lock);
if (myproc()->epoch_depth++ > 0)
goto done;
myproc()->epoch = global_epoch; // not atomic, but it never goes backwards
// __sync_synchronize();
done:
release(&myproc()->gc_epoch_lock);
}
void
gc_end_epoch(void)
{
if (myproc() && myproc()->rcu_read_depth > 0)
myproc()->rcu_read_depth--;
if (myproc() == NULL) return;
acquire(&myproc()->gc_epoch_lock);
--myproc()->epoch_depth;
release(&myproc()->gc_epoch_lock);
}
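// Hypothetical usage sketch; "table", "lookup", "remove", "use" and "foo_free"
// are placeholder names, only gc_begin_epoch, gc_end_epoch and gc_delayed are
// the real entry points:
//
//   gc_begin_epoch();                  // pin this process to the current epoch
//   struct foo *f = lookup(table, key);
//   if (f) use(f);                     // f cannot be freed while we hold the epoch
//   gc_end_epoch();
//
//   // an updater that unlinks f defers the actual free:
//   remove(table, key);
//   gc_delayed(f, foo_free);           // void foo_free(void*); runs only after
//                                      // all processes have left f's epoch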
static void
gc_worker(void *x)
{
struct spinlock wl;
cprintf("gc_worker: %d\n", mycpu()->id);
initlock(&wl, "rcu_gc_worker dummy"); // dummy lock
for (;;) {
u64 i;
acquire(&wl);
cv_sleep(&gc_state[mycpu()->id].cv, &wl);
release(&wl);
u64 global = global_epoch;
for (i = gc_state[mycpu()->id].min_epoch; i < global-2; i++) {
int nfree = gc_free_tofreelist(&(gc_state[mycpu()->id].tofree[i%NEPOCH].next), i);
gc_state[mycpu()->id].tofree[i%NEPOCH].epoch += NEPOCH;
if (gc_debug && nfree > 0) {
cprintf("%d: epoch %d freed %d\n", mycpu()->id, i, nfree);
}
}
gc_state[mycpu()->id].min_epoch = i;
gc_delayfreelist();
}
}
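// For example, assuming NEPOCH == 4: if this core's min_epoch is 2 and
// global_epoch is 6 when the worker wakes up, the loop frees the tofree lists
// for epochs 2 and 3 (slots 2 and 3), bumps each slot's epoch by NEPOCH, and
// leaves min_epoch at 4 == global-2 before calling gc_delayfreelist() to try
// to advance the global epoch.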
void
initprocgc(struct proc *p)
{
p->epoch = global_epoch;
initlock(&p->gc_epoch_lock, "per process gc_lock");
}
void
initgc(void)
{
initlock(&gc_lock.l, "gc");
global_epoch = NEPOCH-2;
for (int i = 0; i < NCPU; i++) {
initlock(&rcu_lock[i].l, "rcu");
initcondvar(&rcu_cv[i].cv, "rcu_gc_cv");
for (int j = 0; j < NEPOCH; j++) {
gc_state[i].delayed[j].epoch = j;
gc_state[i].tofree[j].epoch = j;
}
initcondvar(&gc_state[i].cv, "gc_cv");
}
global_epoch = NEPOCH-2;
for (int i = 0; i < NEPOCH; i++)
for (int j = 0; j < NEPOCH; j++)
gc_epoch[i][j].epoch = i;
for (u32 c = 0; c < NCPU; c++) {
struct proc *gcp;
......
@@ -49,6 +49,7 @@ void panic(const char*) __attribute__((noreturn));
void snprintf(char *buf, u32 n, char *fmt, ...);
void consoleintr(int(*)(void));
#define assert(c) if (!(c)) { cprintf("%s:%d: ", __FILE__, __LINE__); panic("assertion failure"); }
// crange.c
@@ -113,6 +114,7 @@ void dir_flush(struct inode *dp);
// gc.c
void initgc(void);
void initprocgc(struct proc *);
void gc_begin_epoch();
void gc_end_epoch();
void gc_delayed(void*, void (*dofree)(void*));
......
@@ -9,7 +9,7 @@
#define ROOTDEV 1 // device number of file system root disk
#define MAXARG 32 // max exec arguments
#define MAXNAME 16 // max string names
#define INF (~0UL)
#define NEPOCH 4
#define CACHELINE 64 // cache line size
#define CPUKSTACKS (NPROC + NCPU)
#define QUANTUM 10 // scheduling time quantum and tick length (in msec)
......
@@ -189,10 +189,10 @@ allocproc(void)
p->state = EMBRYO;
p->pid = ns_allockey(nspid);
p->epoch = 0;
p->cpuid = mycpu()->id;
p->on_runq = -1;
p->cpu_pin = 0;
initprocgc(p);
#if MTRACE
p->mtrace_stacks.curr = -1;
#endif
......
@@ -54,7 +54,8 @@ struct proc {
SLIST_ENTRY(proc) child_next;
struct condvar cv;
u64 epoch;
u64 rcu_read_depth;
struct spinlock gc_epoch_lock;
u64 epoch_depth;
char lockname[16];
int on_runq;
int cpu_pin;
......