More scalable futex implementation, plus some hacks to get acceptable performance.

The only locking is "point-to-point" between condvar sleep in one proc and condvar wakeup in another.
Parent commit: 39064a18
......@@ -77,7 +77,7 @@ struct proc : public rcu_freed, public sched_link {
u64 cv_wakeup; // Wakeup time for this process
LIST_ENTRY(proc) cv_waiters; // Linked list of processes waiting for oncv
LIST_ENTRY(proc) cv_sleep; // Linked list of processes sleeping on a cv
LIST_ENTRY(proc) futex_link;
struct spinlock futex_lock;
u64 user_fs_;
u64 unmap_tlbreq_;
int exec_cpuid_;
......@@ -97,6 +97,8 @@ struct proc : public rcu_freed, public sched_link {
static int kill(int pid);
int kill();
static u64 hash(const u32& p);
virtual void do_gc(void) { delete this; }
private:
......
......@@ -7,6 +7,7 @@
#include "condvar.h"
#include "proc.hh"
#include "cpu.hh"
#include "percpu.hh"
//
// futexkey
......@@ -45,11 +46,68 @@ futexkey(const u64* useraddr, vmap* vmap, futexkey_t* key)
}
//
// nscache
//
// nscache: a small lock-protected ring buffer that recycles
// pid-namespace objects (xns<u32, proc*, proc::hash>) so that
// futexaddr allocation can usually avoid constructing a fresh xns.
// head_ and tail_ increase monotonically; (head_ - tail_) is the
// current number of cached entries, bounded by NELEM(ns_) == 16.
// All fields are protected by lock_.
struct nscache {
struct spinlock lock_;   // guards head_, tail_, and ns_[]
volatile u64 head_;      // next ring slot to fill (used mod NELEM(ns_))
volatile u64 tail_;      // next ring slot to drain (used mod NELEM(ns_))
xns<u32, proc*, proc::hash>* ns_[16];
nscache();
// Pop one cached namespace; returns nullptr when the ring is empty.
xns<u32, proc*, proc::hash>* alloc();
// Push ns into the ring; returns false when the ring is full,
// in which case the caller keeps ownership of ns.
bool cache(xns<u32, proc*, proc::hash>* ns);
NEW_DELETE_OPS(nscache);
};
// One nscache instance per CPU; accessed via nscache_-> below.
percpu<nscache> nscache_;
// Construct an empty cache: equal ring indices mean no entries.
nscache::nscache(void)
  : head_(0), tail_(0)
{
  initlock(&lock_, "nscache::lock_", LOCKSTAT_FUTEX);
}
// Take one namespace out of the ring under lock_; returns nullptr
// when the ring is empty.  (head_ != tail_ is the unsigned
// equivalent of head_ - tail_ > 0.)
xns<u32, proc*, proc::hash>*
nscache::alloc(void)
{
  acquire(&lock_);
  xns<u32, proc*, proc::hash>* result = nullptr;
  if (head_ != tail_) {
    result = ns_[tail_ % NELEM(ns_)];
    ++tail_;
  }
  release(&lock_);
  return result;
}
// Insert ns into the ring if there is room.  Returns true when the
// cache took ownership; false when full, in which case the caller
// must dispose of ns itself.
bool
nscache::cache(xns<u32, proc*, proc::hash>* ns)
{
  acquire(&lock_);
  const bool room = (head_ - tail_ < NELEM(ns_));
  if (room) {
    ns_[head_ % NELEM(ns_)] = ns;
    ++head_;
  }
  release(&lock_);
  return room;
}
//
// futexaddr
//
struct futexaddr : public referenced, public rcu_freed
{
futexaddr(futexkey_t key);
static futexaddr* alloc(futexkey_t key);
virtual void do_gc();
virtual void onzero() const;
......@@ -57,18 +115,40 @@ struct futexaddr : public referenced, public rcu_freed
bool inserted_;
struct spinlock lock_;
LIST_HEAD(proclist, proc) list_;
xns<u32, proc*, proc::hash>* const nspid_;
private:
futexaddr(futexkey_t key, xns<u32, proc*, proc::hash>* nspid);
NEW_DELETE_OPS(futexaddr);
};
// Global namespace mapping futex keys to their futexaddr records.
xns<futexkey_t, futexaddr*, futexkey_hash> *nsfutex __mpalign__;
futexaddr::futexaddr(futexkey_t key)
: rcu_freed("futexaddr"), key_(key), inserted_(false)
// Build a futexaddr for key.  The embedded pid namespace is taken
// from the current CPU's cache when available, otherwise freshly
// allocated.  Returns nullptr on allocation failure; nothing leaks
// (a namespace allocated here is freed if the futexaddr itself
// cannot be allocated).
futexaddr*
futexaddr::alloc(futexkey_t key)
{
  xns<u32, proc*, proc::hash>* pidns = nscache_->alloc();
  if (pidns == nullptr) {
    pidns = new xns<u32, proc*, proc::hash>(false);
    if (pidns == nullptr)
      return nullptr;
  }

  futexaddr* addr = new futexaddr(key, pidns);
  if (addr == nullptr) {
    delete pidns;
    return nullptr;
  }
  return addr;
}
// Private constructor: callers go through futexaddr::alloc(), which
// supplies nspid (the per-futex pid namespace).
futexaddr::futexaddr(futexkey_t key, xns<u32, proc*, proc::hash>* nspid)
: rcu_freed("futexaddr"), key_(key), inserted_(false), nspid_(nspid)
{
initlock(&lock_, "futexaddr::lock_", LOCKSTAT_FUTEX);
LIST_INIT(&list_);
}
void
......@@ -82,6 +162,10 @@ futexaddr::onzero(void) const
{
if (inserted_)
assert(nsfutex->remove(key_, nullptr));
// Normally deallocate members in the destructor, but in this case
// we don't want to wait for the gc to fill the cache
if (!nscache_->cache(nspid_))
delete nspid_;
gc_delayed((futexaddr*)this);
}
......@@ -95,7 +179,11 @@ futexwait(futexkey_t key, u64 val, u64 timer)
again:
fa = nsfutex->lookup(key);
if (fa == nullptr) {
fa = new futexaddr(key);
fa = futexaddr::alloc(key);
if (fa == nullptr) {
cprintf("futexwait futexaddr::alloc failed\n");
return -1;
}
if (nsfutex->insert(key, fa) < 0) {
fa->dec();
goto again;
......@@ -109,20 +197,28 @@ futexwait(futexkey_t key, u64 val, u64 timer)
}
assert(fa->key_ == key);
acquire(&fa->lock_);
acquire(&myproc()->futex_lock);
auto cleanup = scoped_cleanup([&fa](){
release(&fa->lock_);
release(&myproc()->futex_lock);
fa->dec();
});
// This first check is an optimization
if (futexkey_val(fa->key_) != val)
return -EWOULDBLOCK;
LIST_INSERT_HEAD(&fa->list_, myproc(), futex_link);
if (fa->nspid_->insert(myproc()->pid, myproc()) < 0)
return -1;
if (futexkey_val(fa->key_) != val) {
fa->nspid_->remove(myproc()->pid, nullptr);
return -EWOULDBLOCK;
}
u64 nsecto = timer == 0 ? 0 : timer+nsectime();
cv_sleepto(&myproc()->cv, &fa->lock_, nsecto);
cv_sleepto(&myproc()->cv, &myproc()->futex_lock, nsecto);
LIST_REMOVE(myproc(), futex_link);
assert(fa->nspid_->remove(myproc()->pid, nullptr));
return 0;
}
......@@ -131,20 +227,29 @@ futexwake(futexkey_t key, u64 nwake)
{
futexaddr* fa;
u64 nwoke = 0;
proc* p;
if (nwake == 0)
return -1;
scoped_gc_epoch gc;
fa = nsfutex->lookup(key);
if (fa == nullptr)
if (fa == nullptr || !fa->tryinc())
return 0;
acquire(&fa->lock_);
LIST_FOREACH(p, &fa->list_, futex_link) {
if (nwoke >= nwake)
break;
auto cleanup = scoped_cleanup([&fa](){
fa->dec();
});
fa->nspid_->enumerate([&nwoke, &nwake](u32 pid, proc* p) {
acquire(&p->futex_lock);
cv_wakeup(&p->cv);
nwoke++;
}
release(&fa->lock_);
release(&p->futex_lock);
++nwoke;
if (nwoke >= nwake)
return 1;
return 0;
});
return 0;
}
......@@ -154,4 +259,7 @@ initfutex(void)
nsfutex = new xns<futexkey_t, futexaddr*, futexkey_hash>(false);
if (nsfutex == 0)
panic("initfutex");
for (int i = 0; i < NCPU; i++)
new (&nscache_[i]) nscache();
}
......@@ -16,7 +16,7 @@
#include "wq.hh"
u64
proc_hash(const u32 &p)
proc::hash(const u32 &p)
{
return p;
}
......@@ -27,7 +27,7 @@ mycpuid(void)
return mycpu()->id;
}
xns<u32, proc*, proc_hash> *xnspid __mpalign__;
xns<u32, proc*, proc::hash> *xnspid __mpalign__;
struct proc *bootproc __mpalign__;
#if MTRACE
......@@ -46,6 +46,7 @@ proc::proc(int npid) :
{
snprintf(lockname, sizeof(lockname), "cv:proc:%d", pid);
initlock(&lock, lockname+3, LOCKSTAT_PROC);
initlock(&futex_lock, "proc::futex_lock", LOCKSTAT_PROC);
initcondvar(&cv, lockname);
memset(&childq, 0, sizeof(childq));
......@@ -257,7 +258,7 @@ proc::alloc(void)
void
initproc(void)
{
xnspid = new xns<u32, proc*, proc_hash>(false);
xnspid = new xns<u32, proc*, proc::hash>(false);
if (xnspid == 0)
panic("pinit");
}
......
You added 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment.