Commit 6fb270fb authored by Frans Kaashoek

Adopt high-performance gc plan from user-level phash impl

Kernel is trickier because there are processes that don't call begin/end_epoch
Premature anyway, since it won't help until we have per-core process lists
But, we do garbage collect memory now
Parent e65d0098
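
For readers unfamiliar with the scheme, below is a minimal user-level sketch of Fraser-style epoch-based reclamation, the plan this commit adopts for the kernel. It is illustrative only: the names (begin_epoch, gc_delayed, struct thread_gc, NTHREAD) and the single-threaded driver are assumptions made for the sketch, not the kernel's API, and the real code keeps the per-epoch lists in per-process state walked via ns_enumerate. Readers record the global epoch on entry, frees are deferred onto a per-thread, per-epoch list, and the collector reclaims the epoch-2 list once every active reader has observed the current epoch.

/*
 * Hedged sketch of Fraser-style epoch-based reclamation (standalone C11).
 * All names are illustrative, not the kernel's actual API.
 */
#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define NEPOCH   4          /* ring of delayed-free lists, as in the diff   */
#define NTHREAD  2          /* toy stand-in for the kernel's process list   */

struct delayed {            /* one delayed-free item                        */
  struct delayed *next;
  void *obj;
};

struct thread_gc {          /* per-thread state (per-process in the kernel) */
  unsigned long epoch;                  /* last global epoch this thread saw */
  int in_cs;                            /* inside a begin/end_epoch section? */
  struct delayed *delayed[NEPOCH];      /* items delayed in each epoch slot  */
};

static _Atomic unsigned long global_epoch = NEPOCH - 2;
static struct thread_gc threads[NTHREAD];

/* Reader side: observe the global epoch before touching shared data. */
static void begin_epoch(struct thread_gc *t) {
  t->epoch = atomic_load(&global_epoch);
  t->in_cs = 1;
}
static void end_epoch(struct thread_gc *t) { t->in_cs = 0; }

/* Defer freeing obj until no reader can still hold a reference to it. */
static void gc_delayed(struct thread_gc *t, void *obj) {
  struct delayed *d = malloc(sizeof *d);
  assert(d);
  d->obj = obj;
  d->next = t->delayed[t->epoch % NEPOCH];
  t->delayed[t->epoch % NEPOCH] = d;
}

/* Collector: once every active reader has seen epoch e, items delayed in
 * epoch e-2 can no longer be referenced, so free them and advance e. */
static void gc(void) {
  unsigned long e = atomic_load(&global_epoch);
  for (int i = 0; i < NTHREAD; i++)
    if (threads[i].in_cs && threads[i].epoch < e)
      return;                              /* some reader lags behind        */
  unsigned long victim = (e - (NEPOCH - 2)) % NEPOCH;   /* slot of epoch e-2 */
  for (int i = 0; i < NTHREAD; i++) {
    struct delayed *d = threads[i].delayed[victim], *nd;
    threads[i].delayed[victim] = NULL;
    for (; d; d = nd) { nd = d->next; free(d->obj); free(d); }
  }
  atomic_store(&global_epoch, e + 1);
}

int main(void) {
  struct thread_gc *t = &threads[0];
  begin_epoch(t);
  gc_delayed(t, malloc(16));               /* pretend we unlinked a node     */
  end_epoch(t);
  for (int i = 0; i < NEPOCH; i++) gc();   /* advance until it is reclaimed  */
  printf("global epoch now %lu\n", atomic_load(&global_epoch));
  return 0;
}

The kernel version in this diff has the same shape, but gc_move_to_free first moves each epoch's delayed list onto a per-process free list, so each process frees its own memory the next time it enters gc_begin_epoch.
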
......@@ -24,9 +24,7 @@
// node for deletion by marking its next pointer.
//
#define assert(c) if (!(c)) { panic("assertion failure"); }
#define CRANGE_CHECKING 1
#define CRANGE_CHECKING 0
#define MINNLEVEL 10
#define MARKED(x) (((uintptr) (x)) & 0x1)
......@@ -158,7 +156,7 @@ static void
clist_range_free(void *p)
{
struct clist_range *e = (struct clist_range *) p;
// cprintf("crange_free: %u(%u)\n", e->key, e->size);
if (crange_debug) cprintf("crange_free: %u(%u)\n", e->key, e->size);
crange_check(e->cr, 0, p);
for (int l = 0; l < e->nlevel; l++) {
e->next[l] = (struct clist_range *) 0xDEADBEEF;
......
......@@ -9,14 +9,20 @@
#include "cpu.h"
#include "kmtrace.h"
#define NEPOCH 4
#define NGC 1
static struct { struct spinlock l __mpalign__; } rcu_lock[NCPU];
static struct { struct condvar cv __mpalign__; } rcu_cv[NCPU];
// 1 worker in total. more workers makes sense, if we have per-core process
// lists.
#define NWORKER 1
static struct { struct condvar cv __mpalign__; } rcu_cv[NWORKER];
static struct { struct spinlock l __mpalign__; } gc_lock;
enum { gc_debug = 0 };
struct gc {
u64 epoch;
struct gc *next;
struct gc *free;
union {
struct {
void (*dofree)(void *);
......@@ -31,29 +37,31 @@ struct gc {
};
int type;
} __mpalign__;
struct gc gc_epoch[NEPOCH][NCPU] __mpalign__;
u64 global_epoch __mpalign__;
int ndelayed __mpalign__;
enum { rcu_debug = 0 };
struct gc *
gc_alloc()
{
return kmalloc(sizeof(struct gc));
struct gc *r = kmalloc(sizeof(struct gc));
assert(r);
myproc()->ndelayed++;
return r;
}
void *
static void *
gc_min(void *vkey, void *v, void *arg){
u64 *min_epoch_p = arg;
struct proc *p = (struct proc *) v;
if (*min_epoch_p > p->epoch) {
*min_epoch_p = p->epoch;
}
return 0;
return NULL;
}
void
static void
gc_free_elem(struct gc *r)
{
switch (r->type) {
......@@ -69,79 +77,77 @@ gc_free_elem(struct gc *r)
kmfree(r);
}
// Fraser's reclamation scheme: free all delayed-free items in global_epoch-2
static void
gc_free_epoch(u64 epoch)
static int
gc_free_list(struct gc *head, u64 epoch)
{
cprintf("free epoch %d\n", epoch);
for (int j = 0; j < NCPU; j++) {
if (__sync_bool_compare_and_swap(&global_epoch, epoch, epoch+1)) {
// only one core succeeds; that core in charge of freeing epoch
struct gc *head;
struct gc *r, *nr;
uint32 fe = (epoch - (NEPOCH-2)) % NEPOCH;
int cas;
if (gc_epoch[fe][j].epoch != epoch - (NEPOCH-2))
panic("gc_free_epoch");
// unhook list for fe epoch atomically
head = gc_epoch[fe][j].next;
// this shouldn't fail, because no core is modifying it.
cas = __sync_bool_compare_and_swap(&gc_epoch[fe][j].next, head, 0);
if (!cas) panic("gc_free_epoch");
// free list items on the delayed list
for (r = head; r != NULL; r = nr) {
if (r->epoch > epoch-(NEPOCH-2)) {
cprintf("%lu %lu\n", r->epoch, epoch-(NEPOCH-2));
panic("gc_free_epoch");
}
nr = r->next;
gc_free_elem(r);
int x = __sync_fetch_and_sub(&ndelayed, 1);
if (x < 0) panic("gc_free_epoch");
}
if (gc_epoch[fe][j].next != 0)
panic("gc_free_epoch");
gc_epoch[fe][j].epoch = gc_epoch[fe][j].epoch + NEPOCH;
int nfree = 0;
struct gc *r, *nr;
for (r = head; r != NULL; r = nr) {
if (r->epoch > epoch) {
cprintf("%lu %lu\n", r->epoch, epoch);
assert(0);
}
nr = r->next;
gc_free_elem(r);
nfree++;
}
return nfree;
}
void
gc(void)
{
u64 global = global_epoch;
u64 min = global;
ns_enumerate(nspid, gc_min, &min);
// cprintf("gc: global %lu min %lu ndelay %d\n", global_epoch, min, ndelayed);
if (min >= global) {
gc_free_epoch(min);
}
// move the delayed-free list to the free list so that a process can do its own freeing
void *
gc_move_to_free_proc(void *vkey, void *v, void *arg){
u64 *epoch = arg;
struct proc *p = (struct proc *) v;
struct gc *head;
uint32 fe = (*epoch - (NEPOCH-2)) % NEPOCH;
int cas;
assert(p->gc_epoch[fe].epoch == *epoch-(NEPOCH-2)); // XXX race with setting epoch = 0
// unhook list for fe epoch atomically
head = p->gc_epoch[fe].next;
// this shouldn't fail, because no core is modifying it.
cas = __sync_bool_compare_and_swap(&(p->gc_epoch[fe].next), head, 0);
assert(cas);
// insert list into local free list so that each core can do its own frees
assert (p->gc_epoch[fe].free == 0);
cas = __sync_bool_compare_and_swap(&(p->gc_epoch[fe].free), 0, head);
assert(cas);
assert(p->gc_epoch[fe].next == 0);
return 0;
}
// Fraser's reclamation scheme: free all delayed-free items in global_epoch-2
// only one thread should call this function
static void
gc_worker(void *x)
gc_move_to_free(u64 epoch)
{
struct spinlock wl;
initlock(&wl, "rcu_gc_worker"); // dummy lock
for (;;) {
gc();
acquire(&wl);
cv_sleep(&rcu_cv[mycpu()->id].cv, &wl);
release(&wl);
}
if (gc_debug)
cprintf("%d: free epoch %ld\n", myproc()->pid, epoch);
myproc()->rcu_read_depth++; // ensure ns_enumerate's call to gc_begin_epoch doesn't call gc()
ns_enumerate(nspid, gc_move_to_free_proc, &epoch);
myproc()->rcu_read_depth--;
int ok = __sync_bool_compare_and_swap(&global_epoch, epoch, epoch+1);
assert(ok);
}
void
gc_start(void)
// If all threads have seen global_epoch, we can free elements in global_epoch-2
static void
gc(void)
{
cv_wakeup(&rcu_cv[mycpu()->id].cv);
int r = tryacquire(&gc_lock.l);
if (r == 0) return;
assert(r == 1);
u64 global = global_epoch;
u64 min = global;
myproc()->rcu_read_depth++; // ensure ns_enumerate's call to gc_begin_epoch doesn't call gc()
ns_enumerate(nspid, gc_min, &min);
myproc()->rcu_read_depth--;
if (min >= global) {
gc_move_to_free(min);
}
release(&gc_lock.l);
}
static void
......@@ -149,17 +155,18 @@ gc_delayed_int(struct gc *r)
{
pushcli();
u64 myepoch = myproc()->epoch;
u64 minepoch = gc_epoch[myepoch % NEPOCH][mycpu()->id].epoch;
// cprintf("%d: gc_delayed: %lu ndelayed %d\n", myproc()->pid, global_epoch, ndelayed);
u64 minepoch = myproc()->gc_epoch[myepoch % NEPOCH].epoch;
if (gc_debug)
cprintf("%d: gc_delayed: %lu ndelayed %d\n", myproc()->pid, global_epoch, myproc()->ndelayed);
if (myepoch != minepoch) {
cprintf("%d: myepoch %lu minepoch %lu\n", myproc()->pid, myepoch, minepoch);
panic("gc_delayed_int");
}
r->epoch = myepoch;
do {
r->next = gc_epoch[myepoch % NEPOCH][mycpu()->id].next;
} while (!__sync_bool_compare_and_swap(&(gc_epoch[myepoch % NEPOCH][mycpu()->id].next), r->next, r));
popcli();
r->next = myproc()->gc_epoch[myepoch % NEPOCH].next;
} while (!__sync_bool_compare_and_swap(&(myproc()->gc_epoch[myepoch % NEPOCH].next), r->next, r));
popcli();
}
void
......@@ -187,34 +194,113 @@ gc_delayed2(int a1, u64 a2, void (*dofree)(int,u64))
gc_delayed_int(r);
}
static void*
gc_free(void *vkey, void *v, void *arg)
{
struct proc *p = (struct proc *) v;
acquire(&p->gc_lock);
u64 global = global_epoch;
for (u64 epoch = p->epoch; epoch < global; epoch++) {
int j = (epoch - (NEPOCH - 2)) % NEPOCH;
assert(p->gc_epoch[j].epoch == epoch-2);
struct gc *free = p->gc_epoch[j].free;
int ok = __sync_bool_compare_and_swap(&(p->gc_epoch[j].free), free, NULL);
assert(ok);
int nfree = gc_free_list(free, epoch - 2);
p->ndelayed -= nfree;
if (gc_debug && nfree > 0)
cprintf("%d: epoch %d freed %d\n", p->pid, epoch - 2, nfree);
p->gc_epoch[j].epoch = p->gc_epoch[j].epoch + NEPOCH;
}
p->epoch = global; // not atomic, but it never goes backwards
__sync_synchronize();
release(&p->gc_lock);
return NULL;
}
void
gc_start(void)
{
cv_wakeup(&rcu_cv[0].cv); // NWORKER = 1
// cv_wakeup(&rcu_cv[mycpu()->id].cv);
}
void
gc_begin_epoch(void)
{
if (myproc() && myproc()->rcu_read_depth++ == 0)
myproc()->epoch = global_epoch;
__sync_synchronize();
if (myproc() == NULL) return;
if (myproc()->rcu_read_depth++ > 0)
return;
gc_free(NULL, (void *) myproc(), NULL);
}
void
gc_end_epoch(void)
{
if (myproc() && myproc()->rcu_read_depth > 0)
if (myproc() == NULL) return;
if (--myproc()->rcu_read_depth > 0)
return;
#if 0
// kick gc early if under memory pressure
int free = 0;
for (int j = 0; j < NEPOCH; j++) {
if (myproc()->gc_epoch[j].free)
free = 1;
}
u64 nd = myproc()->ndelayed;
if (!free && nd > NGC) {
gc_start();
}
#endif
}
static void
gc_worker(void *x)
{
struct spinlock wl;
initlock(&wl, "rcu_gc_worker dummy"); // dummy lock
for (;;) {
acquire(&wl);
myproc()->rcu_read_depth++; // call gc_free once for gc_worker
ns_enumerate(nspid, gc_free, NULL);
myproc()->rcu_read_depth--;
gc();
cv_sleep(&rcu_cv[0].cv, &wl); // NWORKER = 1
release(&wl);
}
}
void
initprocgc(struct proc *p)
{
p->epoch = global_epoch;
p->gc_epoch = kmalloc(sizeof(struct gc) * NEPOCH);
initlock(&p->gc_lock, "per process gc_lock");
for (u64 i = global_epoch-2; i < global_epoch+2; i++) {
p->gc_epoch[i % NEPOCH].epoch = i;
p->gc_epoch[i % NEPOCH].free = NULL;
p->gc_epoch[i % NEPOCH].next = NULL;
}
}
void
initgc(void)
{
for (int i = 0; i < NCPU; i++) {
initlock(&rcu_lock[i].l, "rcu");
initlock(&gc_lock.l, "gc");
global_epoch = NEPOCH-2;
for (int i = 0; i < NWORKER; i++) {
initcondvar(&rcu_cv[i].cv, "rcu_gc_cv");
}
global_epoch = NEPOCH-2;
for (int i = 0; i < NEPOCH; i++)
for (int j = 0; j < NEPOCH; j++)
gc_epoch[i][j].epoch = i;
for (u32 c = 0; c < NCPU; c++) {
// one worker for now
for (u32 c = 0; c < NWORKER; c++) {
struct proc *gcp;
gcp = threadalloc(gc_worker, NULL);
......
......@@ -23,6 +23,7 @@ struct stat;
struct proc;
struct vmap;
struct pipe;
struct gc;
// bio.c
void binit(void);
......@@ -49,6 +50,7 @@ void panic(const char*) __attribute__((noreturn));
void snprintf(char *buf, u32 n, char *fmt, ...);
void consoleintr(int(*)(void));
#define assert(c) if (!(c)) { cprintf("%s:%d: ", __FILE__, __LINE__); panic("assertion failure"); }
// crange.c
......@@ -113,6 +115,7 @@ void dir_flush(struct inode *dp);
// gc.c
void initgc(void);
void initprocgc(struct proc *);
void gc_begin_epoch();
void gc_end_epoch();
void gc_delayed(void*, void (*dofree)(void*));
......
......@@ -9,7 +9,7 @@
#define ROOTDEV 1 // device number of file system root disk
#define MAXARG 32 // max exec arguments
#define MAXNAME 16 // max string names
#define INF (~0UL)
#define NEPOCH 4
#define CACHELINE 64 // cache line size
#define CPUKSTACKS (NPROC + NCPU)
#define QUANTUM 10 // scheduling time quantum and tick length (in msec)
......
......@@ -189,10 +189,10 @@ allocproc(void)
p->state = EMBRYO;
p->pid = ns_allockey(nspid);
p->epoch = 0;
p->cpuid = mycpu()->id;
p->on_runq = -1;
p->cpu_pin = 0;
initprocgc(p);
#if MTRACE
p->mtrace_stacks.curr = -1;
#endif
......
......@@ -54,6 +54,9 @@ struct proc {
SLIST_ENTRY(proc) child_next;
struct condvar cv;
u64 epoch;
u64 ndelayed;
struct gc *gc_epoch;
struct spinlock gc_lock;
u64 rcu_read_depth;
char lockname[16];
int on_runq;
......