Commit 481e807f authored by Silas Boyd-Wickizer

Merge branch 'scale-amd64' of git+ssh://amsterdam.csail.mit.edu/home/am0/6.828/xv6 into scale-amd64

@@ -8,11 +8,23 @@
#include <sys/mman.h>
#include <utility>
char buf[2048];
char name[3];
const char *echoargv[] = { "echo", "ALL", "TESTS", "PASSED", 0 };
int stdout = 1;
// Pseudo-random number generator for randomized tests (Knuth's MMIX LCG constants)
static u64 rseed;
u64
rnd(void)
{
rseed = rseed * 6364136223846793005 + 1442695040888963407;
return rseed;
}
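// Tests seed rseed explicitly before using rnd() (vmoverlap below
// resets it to 0) so that runs are reproducible.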
// simple file system tests
void
@@ -1741,6 +1753,91 @@ unmappedtest(void)
printf("unmappedtest ok\n");
}
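// Fork a child that writes to *p and reports back over a pipe.
// Returns true if the child faulted (and died) before it could
// report, i.e. if *p is not writable.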
bool
test_fault(char *p)
{
int fds[2], pid;
char buf = 0;
if (pipe(fds) != 0)
die("test_fault: pipe failed");
if ((pid = fork(0)) < 0)
die("test_fault: fork failed");
if (pid == 0) {
close(fds[0]);
*p = 0x42;
if (write(fds[1], &buf, 1) != 1)
die("test_fault: write failed");
exit();
}
close(fds[1]);
bool faulted = (read(fds[0], &buf, 1) < 1);
wait();
close(fds[0]);
return faulted;
}
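// Perform 100 random map and unmap operations over a 10-page window
// at 0x1000, checking that newly mapped pages read as zero, that
// pages outside each affected range keep their contents, and that
// unmapped pages fault.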
void
vmoverlap(void)
{
printf("vmoverlap\n");
char *base = (char*)0x1000;
char map[10] = {};
int mapn = 1;
rseed = 0;
for (int i = 0; i < 100; i++) {
int op = i % 20 >= 10; // batches of 10 maps followed by 10 unmaps
int lo = rnd() % 10, hi = rnd() % 10;
if (lo > hi)
std::swap(lo, hi);
if (lo == hi)
continue;
if (op == 0) {
// Map
void *res = mmap(base + lo * 4096, (hi-lo) * 4096, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0);
if (res == MAP_FAILED)
die("vmoverlap: mmap failed");
} else {
// Unmap
int res = munmap(base + lo * 4096, (hi-lo) * 4096);
if (res < 0)
die("vmoverlap: munmap failed");
}
for (int i = lo; i < hi; i++) {
if (op == 0) {
// Check that it zeroed the range
if (base[i*4096] != 0)
die("did not zero mapped-over region");
// Fill it in
base[i*4096] = mapn;
// Update the expected mapping
map[i] = mapn;
} else {
// Update the expected mapping
map[i] = 0;
}
}
// Check entire mapping
for (int i = 0; i < sizeof(map)/sizeof(map[0]); i++) {
if (map[i] && base[i*4096] != map[i])
die("page outside of mapped-over region changed");
else if (!map[i] && !test_fault(&base[i*4096]))
die("expected fault");
}
}
munmap(base, 10 * 4096);
printf("vmoverlap ok\n");
}
static int nenabled;
static char **enabled;
@@ -1782,6 +1879,7 @@ main(int argc, char *argv[])
// we should be able to grow a user process to consume all phys mem
TEST(unmappedtest);
TEST(vmoverlap);
TEST(validatetest);
...
@@ -231,13 +231,18 @@ struct radix_iterator {
if (k_ != key_limit_)
prime_path();
}
radix_iterator() = default;
radix_iterator(const radix_iterator &o) = default;
radix_iterator(radix_iterator &&o) = default;
// Move to the next non-null entry in the collection, or end.
radix_iterator &operator++() {
assert(k_ < key_limit_);
advance();
return *this;
}
radix_elem* operator*() const {
return path_[level_]->load().elem();
}
@@ -245,10 +250,28 @@
// If the current element is non-null, does nothing.
void skip_nulls()
{
if (!**this)
++(*this);
}
// Return the key of the iterator's current element.
u64 key() const
{
return k_ << r_->shift_;
}
// Return the span of the key space occupied by the iterator's
// current element.
u64 span() const
{
return (u64)1 << (bits_per_level * level_ + r_->shift_);
}
// Return an iterator that points to the next element that is not
// equal to the current element. If no such element exists, returns
// end. Note that this element may be null.
radix_iterator next_change() const;
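// Typical use (as in vmap::copy): walk maximal runs of equal
// elements with
//   for (it = begin(); it != end(); it = next, it.skip_nulls()) {
//     next = it.next_change();
//     // keys [it.key(), next.key()) all map to *it
//   }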
// Compare equality on just the key.
bool operator==(const radix_iterator &other) {
return r_ == other.r_ && k_ == other.k_; }
@@ -267,9 +290,9 @@ private:
// Prepare the initial path_ and level_ based on k_.
void prime_path();
// Advance to the next leaf. If skip_nulls is true, advances to the
// next non-null leaf. This assumes that k_ < key_limit_.
void advance(bool skip_nulls = true);
};
inline radix_iterator
...
@@ -231,6 +231,9 @@ radix_range::replace(u64 start, u64 size, radix_elem *val)
assert(start >= start_);
assert(start + size <= start_ + size_);
// XXX(austin) We will deadlock with ourselves if we try to replace
// a range and the replaced range is on a different level than the
// locked range (because this update_range will try to push_down).
dprintf("%p: replace: [%lx, %lx) with %p\n", r_, start, start + size, val);
update_range(r_->root_.load(), &r_->root_, [val](radix_entry cur, radix_ptr *ptr) -> radix_entry {
do {
@@ -246,6 +249,17 @@ radix_range::replace(u64 start, u64 size, radix_elem *val)
}, 0, 1L << key_bits, start, start + size);
}
radix_iterator
radix_iterator::next_change() const
{
radix_elem *cur = **this;
radix_iterator next(*this);
do {
next.advance(false);
} while (next.k_ < next.key_limit_ && *next == cur);
return next;
}
void
radix_iterator::prime_path()
{
@@ -268,7 +282,7 @@ radix_iterator::prime_path()
}
void
radix_iterator::advance(bool skip_nulls)
{
while (true) {
// As long as we haven't reached our limit or an element, advance
@@ -296,8 +310,8 @@ radix_iterator::advance()
level_--;
}
// Did we reach a non-null leaf? (Or do we not care?)
if (!skip_nulls || !entry.is_null())
return;
}
}
@@ -174,6 +174,7 @@ vma::vma(vmap *vmap, uptr start, uptr end, enum vmatype vtype, vmnode *vmn) :
{
assert(PGOFFSET(start) == 0);
assert(PGOFFSET(end) == 0);
assert(!vmn || end - start == vmn->npages << PGSHIFT);
if (n)
n->incref();
}
@@ -255,18 +256,23 @@ vmap::incref()
bool
vmap::replace_vma(vma *a, vma *b)
{
assert(a->vma_start == b->vma_start);
assert(a->vma_end == b->vma_end);
auto span = vmas.search_lock(a->vma_start, a->vma_end - a->vma_start);
if (a->deleted())
return false;
#if VM_CRANGE
for (auto e: span)
assert(a == e);
span.replace(b);
#endif
#if VM_RADIX
for (auto it = span.begin(); it != span.end(); ++it) {
if (static_cast<vma*>(*it) == a)
// XXX(austin) replace should take iterators to represent the
// span so we don't have to find the keys all over again.
span.replace(it.key(), it.span(), b);
}
#endif
return true;
}
@@ -277,28 +283,40 @@ vmap::copy(int share)
vmap *nm = new vmap();
#if VM_RADIX
radix::iterator next_it;
for (auto it = vmas.begin(); it != vmas.end(); it = next_it, it.skip_nulls()) {
next_it = it.next_change();
u64 range_start = it.key();
u64 range_end = next_it.key();
vma *e = static_cast<vma*>(*it);
#endif
#if 0
} // Ugh. Un-confuse IDE indentation.
#endif
#if VM_CRANGE
for (auto r: vmas) {
vma *e = static_cast<vma *>(r);
u64 range_start = e->vma_start;
u64 range_end = e->vma_end;
#endif
u64 range_size = range_end - range_start;
struct vma *ne;
if (share) {
// Because of the pages array, the new vma needs to have the
// same start and end, even if that's not where it ends up in
// the index.
ne = new vma(nm, e->vma_start, e->vma_end, COW, e->n);
// if the original vma wasn't COW, replace it with a COW vma
if (e->va_type != COW) {
vma *repl = new vma(this, e->vma_start, e->vma_end, COW, e->n);
#if VM_RADIX
vmas.search_lock(range_start, range_size).replace(range_start, range_size, repl);
#elif VM_CRANGE
replace_vma(e, repl);
#endif
updatepages(pml4, range_start, range_end, [](atomic<pme_t>* p) {
for (;;) {
pme_t v = p->load();
if (v & PTE_LOCK)
@@ -314,7 +332,7 @@ vmap::copy(int share)
ne = new vma(nm, e->vma_start, e->vma_end, PRIVATE, e->n->copy());
}
auto span = nm->vmas.search_lock(range_start, range_size);
for (auto x: span) {
#if VM_RADIX
if (!x)
@@ -328,7 +346,7 @@ vmap::copy(int share)
span.replace(ne);
#endif
#if VM_RADIX
span.replace(range_start, range_size, ne);
#endif
}
@@ -397,11 +415,9 @@ again:
// new scope to release the search lock before tlbflush
u64 len = n->npages * PGSIZE;
auto span = vmas.search_lock(vma_start, len);
#if VM_CRANGE
// XXX handle overlaps, set replaced=true
for (auto r: span) {
if (!fixed)
goto again;
@@ -410,8 +426,27 @@ again:
rvma, rvma->vma_start, rvma->vma_end);
return -1;
}
#endif
#if VM_RADIX
// XXX(austin) span.replace also has to do this scan. It would be
// nice if we could do just one scan.
for (auto r: span) {
if (!r)
continue;
if (!fixed)
goto again;
else {
// XXX(austin) I don't think anything prevents a page fault
// from reading the old VMA now and installing the new page
// for the old VMA after the updatepages. Certainly not
// PTE_LOCK, since we don't take that here. Why not just use
// the lock in the radix tree? (We can't do that with crange,
// though, since it can only lock complete ranges.)
replaced = true;
break;
}
}
#endif
e = new vma(this, vma_start, vma_start+len, PRIVATE, n);
if (e == 0) {
@@ -432,6 +467,11 @@ again:
updatepages(pml4, e->vma_start, e->vma_end, [&needtlb](atomic<pme_t> *p) {
for (;;) {
pme_t v = p->load();
// XXX(austin) Huh? Why is it okay to skip it if it's
// locked? The page fault could be faulting in a page from
// the old VMA, in which case we need to shoot it down
// (though if it's already faulting a page from the new VMA,
// we need to *not* shoot it down).
if (v & PTE_LOCK)
continue;
if (!(v & PTE_P))
@@ -458,9 +498,11 @@ vmap::remove(uptr vma_start, uptr len)
{
{
// new scope to release the search lock before tlbflush
auto span = vmas.search_lock(vma_start, len);
#if VM_CRANGE
// XXX handle partial unmap
uptr vma_end = vma_start + len;
for (auto r: span) {
vma *rvma = (vma*) r;
if (rvma->vma_start < vma_start || rvma->vma_end > vma_end) {
@@ -469,13 +511,14 @@ vmap::remove(uptr vma_start, uptr len)
return -1;
}
}
#endif
#if VM_CRANGE
span.replace(0);
#endif
#if VM_RADIX
// XXX(austin) If this could tell us that nothing was replaced, we
// could skip the updatepages.
span.replace(vma_start, len, 0);
#endif
}
@@ -518,8 +561,20 @@ vmap::pagefault_wcow(vma *m)
vma *repl = new vma(this, m->vma_start, m->vma_end, PRIVATE, nodecopy);
// XXX(austin) This will cause sharing on parts of this range that
// have since been unmapped or replaced. But in our current design
// where we need a new vmnode we have to replace all instances of it
// at once or we'll end up with a complete vmnode copy for each page
// we fault on. If we replace it all at once, this will waste time
// and space copying pages that are no longer mapped, but will only
// do that once. Fixing this requires getting rid of the vmnode.
replace_vma(m, repl);
updatepages(pml4, m->vma_start, m->vma_end, [](atomic<pme_t> *p) {
// XXX(austin) In radix, this may clear PTEs belonging to other
// VMAs that have replaced sub-ranges of the faulting VMA.
// That's unfortunate but okay because we'll just bring them
// back from the pages array. Yet another consequence of having
// to do a vmnode at a time.
for (;;) {
pme_t v = p->load();
if (v & PTE_LOCK)
...
@@ -12,6 +12,23 @@ namespace std {
return static_cast<typename remove_reference<T>::type&&>(a);
}
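// Minimal move-based std::swap, plus an overload for arrays; the new
// vmoverlap test relies on std::swap being available.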
template<class T>
void
swap(T& a, T& b)
{
T tmp = move(a);
a = move(b);
b = move(tmp);
}
template<class T, size_t N>
void
swap(T (&a)[N], T (&b)[N])
{
for (size_t n = 0; n < N; n++)
swap(a[n], b[n]);
}
template<class A, class B>
struct pair {
typedef A first_type;
...