Commit 044966c8 authored by Nickolai Zeldovich

use atomic<> in page table structures

more flexible updatepages() interface
PTE locking (using one of the free PTE bits)
guard against concurrent unmaps (except no way to check if vma is deleted yet..)
Parent 49d7b69c
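The reworked updatepages() itself is not visible in the hunks below, but the call sites in the vm code show its new shape: instead of a permission word, it now takes begin/end virtual addresses plus a per-PTE callback that receives an atomic<pme_t>*, so each caller runs its own cmpxch loop and can honor the new PTE_LOCK bit. A minimal sketch of what that interface might look like follows; the template callback type, loop bounds, and create flag are assumptions for illustration, not the committed code:

    // Hypothetical sketch of the callback-based updatepages(); not part of this diff.
    template<class CB>
    void
    updatepages(pgmap *pml4, u64 begin, u64 end, CB cb)
    {
      // Visit every page in [begin, end) and let the caller decide how to
      // rewrite the entry (clear it, mark it COW, drop PTE_P, ...).
      for (u64 a = PGROUNDDOWN(begin); a < end; a += PGSIZE) {
        atomic<pme_t> *pte = walkpgdir(pml4, a, 1);
        if (pte != 0)
          cb(pte);
      }
    }

Handing the atomic entry to the caller, instead of writing it inside updatepages(), is what makes the PTE_LOCK protocol workable: each callback can spin or retry on a locked entry rather than blindly overwriting it.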
@@ -12,75 +12,51 @@
 #include "vm.hh"
 #include <stddef.h>
 
-extern pml4e_t kpml4[];
+using namespace std;
 
-static pme_t*
-descend(pme_t *dir, const void *va, u64 flags, int create, int level)
+static pgmap*
+descend(pgmap *dir, u64 va, u64 flags, int create, int level)
 {
+  atomic<pme_t> *entryp;
   pme_t entry;
-  pme_t *next;
+  pgmap *next;
 
 retry:
-  dir = &dir[PX(level, va)];
-  entry = *dir;
+  entryp = &dir->e[PX(level, va)];
+  entry = entryp->load();
   if (entry & PTE_P) {
-    next = (pme_t*) p2v(PTE_ADDR(entry));
+    next = (pgmap*) p2v(PTE_ADDR(entry));
   } else {
     if (!create)
      return NULL;
-    next = (pme_t*) kalloc();
+    next = (pgmap*) kalloc();
     if (!next)
      return NULL;
     memset(next, 0, PGSIZE);
-    if (!cmpswap(dir, entry, v2p(next) | PTE_P | PTE_W | flags)) {
+    if (!cmpxch(entryp, entry, v2p(next) | PTE_P | PTE_W | flags)) {
      kfree((void*) next);
      goto retry;
     }
   }
   return next;
 }
 
 // Return the address of the PTE in page table pgdir
 // that corresponds to linear address va.  If create!=0,
 // create any required page table pages.
-pme_t *
-walkpgdir(pml4e_t *pml4, const void *va, int create)
+atomic<pme_t>*
+walkpgdir(pgmap *pml4, u64 va, int create)
 {
-  pme_t *pdp;
-  pme_t *pd;
-  pme_t *pt;
-
-  pdp = descend(pml4, va, PTE_U, create, 3);
+  auto pdp = descend(pml4, va, PTE_U, create, 3);
   if (pdp == NULL)
     return NULL;
-  pd = descend(pdp, va, PTE_U, create, 2);
+  auto pd = descend(pdp, va, PTE_U, create, 2);
   if (pd == NULL)
     return NULL;
-  pt = descend(pd, va, PTE_U, create, 1);
+  auto pt = descend(pd, va, PTE_U, create, 1);
   if (pt == NULL)
     return NULL;
-  return &pt[PX(0,va)];
+  return &pt->e[PX(0,va)];
 }
-
-void
-updatepages(pme_t *pml4, void *begin, void *end, int perm)
-{
-  char *a, *last;
-  pme_t *pte;
-
-  a = (char*) PGROUNDDOWN(begin);
-  last = (char*) PGROUNDDOWN(end);
-  for (;;) {
-    pte = walkpgdir(pml4, a, 1);
-    if(pte != 0) {
-      if (perm == 0) *pte = 0;
-      else *pte = PTE_ADDR(*pte) | perm | PTE_P;
-    }
-    if (a == last)
-      break;
-    a += PGSIZE;
-  }
-}
 
 // Map from 0 to 128Gbytes starting at KBASE.
@@ -88,43 +64,43 @@ void
 initpg(void)
 {
   extern char end[];
-  void *va = (void*)KBASE;
+  u64 va = KBASE;
   paddr pa = 0;
 
-  while (va < (void*)(KBASE+(128ull<<30))) {
-    pme_t *pdp = descend(kpml4, va, 0, 1, 3);
-    pme_t *pd = descend(pdp, va, 0, 1, 2);
-    pme_t *sp = &pd[PX(1,va)];
+  while (va < (KBASE+(128ull<<30))) {
+    auto pdp = descend(&kpml4, va, 0, 1, 3);
+    auto pd = descend(pdp, va, 0, 1, 2);
+    atomic<pme_t> *sp = &pd->e[PX(1,va)];
     u64 flags = PTE_W | PTE_P | PTE_PS;
     // Set NX for non-code pages
-    if (va >= (void*) end)
+    if (va >= (u64) end)
      flags |= PTE_NX;
     *sp = pa | flags;
-    va = (char*)va + PGSIZE*512;
+    va = va + PGSIZE*512;
     pa += PGSIZE*512;
   }
 }
 
 // Set up kernel part of a page table.
-pml4e_t*
+pgmap*
 setupkvm(void)
 {
-  pml4e_t *pml4;
+  pgmap *pml4;
   int k;
 
-  if((pml4 = (pml4e_t*)kalloc()) == 0)
+  if((pml4 = (pgmap*)kalloc()) == 0)
     return 0;
   k = PX(3, KBASE);
-  memset(&pml4[0], 0, 8*k);
-  memmove(&pml4[k], &kpml4[k], 8*(512-k));
+  memset(&pml4->e[0], 0, 8*k);
+  memmove(&pml4->e[k], &kpml4.e[k], 8*(512-k));
   return pml4;
 }
 
 int
-setupkshared(pml4e_t *pml4, char *kshared)
+setupkshared(pgmap *pml4, char *kshared)
 {
   for (u64 off = 0; off < KSHAREDSIZE; off+=4096) {
-    pme_t *pte = walkpgdir(pml4, (void*)(KSHARED+off), 1);
+    atomic<pme_t> *pte = walkpgdir(pml4, (u64) (KSHARED+off), 1);
     if (pte == NULL)
      panic("setupkshared: oops");
     *pte = v2p(kshared+off) | PTE_P | PTE_U | PTE_W;
@@ -137,7 +113,7 @@ setupkshared(pml4e_t *pml4, char *kshared)
 void
 switchkvm(void)
 {
-  lcr3(v2p(kpml4));  // switch to the kernel page table
+  lcr3(v2p(&kpml4));  // switch to the kernel page table
 }
 
 // Switch TSS and h/w page table to correspond to process p.
@@ -159,14 +135,15 @@ switchuvm(struct proc *p)
 }
 
 static void
-freepm(pme_t *pm, int level)
+freepm(pgmap *pm, int level)
 {
   int i;
 
   if (level != 0) {
     for (i = 0; i < 512; i++) {
-      if (pm[i] & PTE_P)
-        freepm((pme_t*) p2v(PTE_ADDR(pm[i])), level - 1);
+      pme_t entry = pm->e[i];
+      if (entry & PTE_P)
+        freepm((pgmap*) p2v(PTE_ADDR(entry)), level - 1);
     }
   }
@@ -176,7 +153,7 @@ freepm(pme_t *pm, int level)
 // Free a page table and all the physical memory pages
 // in the user part.
 void
-freevm(pml4e_t *pml4)
+freevm(pgmap *pml4)
 {
   int k;
   int i;
@@ -187,8 +164,9 @@ freevm(pml4e_t *pml4)
   // Don't free kernel portion of the pml4
   k = PX(3, KBASE);
   for (i = 0; i < k; i++) {
-    if (pml4[i] & PTE_P) {
-      freepm((pme_t*) p2v(PTE_ADDR(pml4[i])), 2);
+    pme_t entry = pml4->e[i];
+    if (entry & PTE_P) {
+      freepm((pgmap*) p2v(PTE_ADDR(entry)), 2);
     }
   }
...
@@ -4,6 +4,8 @@ extern "C" {
 #include "kern_c.h"
 }
 
+#include "atomic.hh"
+#include "hwvm.hh"
 #include <stdarg.h>
 
 #define KBASE 0xFFFFFF0000000000ull
@@ -109,13 +111,6 @@ class rcu_freed;
 void gc_delayed(rcu_freed *);
 #endif
 
-// hwvm.c
-void freevm(pml4e_t*);
-pml4e_t* setupkvm(void);
-int setupkshared(pml4e_t *pml4, char *kshared);
-pme_t * walkpgdir(pml4e_t*, const void*, int);
-void tlbflush(void);
-
 // hz.c
 void microdelay(u64);
 u64 nsectime(void);
@@ -251,7 +246,6 @@ void uartintr(void);
 void switchuvm(struct proc*);
 void switchkvm(void);
 int pagefault(struct vmap *, uptr, u32);
-void updatepages(pml4e_t*, void*, void*, int);
 
 // wq.c
 int wq_trywork(void);
...
@@ -5,6 +5,7 @@
 #include "kalloc.h"
 #include "cpu.hh"
 #include "amd64.h"
+#include "hwvm.hh"
 
 static volatile int bstate;
@@ -57,7 +58,6 @@ bootothers(void)
 void
 cmain(u64 mbmagic, u64 mbaddr)
 {
-  extern pml4e_t kpml4[];
   extern u64 cpuhz;
 
   initpg();
@@ -93,7 +93,7 @@ cmain(u64 mbmagic, u64 mbaddr)
   inituser();      // first user process
   bootothers();    // start other processors
-  kpml4[0] = 0;    // don't need 1 GB identity mapping anymore
+  kpml4.e[0] = 0;  // don't need 1 GB identity mapping anymore
   lcr3(rcr3());
   scheduler();
...
@@ -15,8 +15,11 @@
 #define PTE_A          0x020   // Accessed
 #define PTE_D          0x040   // Dirty
 #define PTE_PS         0x080   // Page Size
+#define PTE_G          0x100   // Global
 #define PTE_MBZ        0x180   // Bits must be zero
-#define PTE_COW        0x800   // copy-on-write
+#define PTE_LOCK       0x200   // xv6: lock
+#define PTE_UNUSED     0x400   // xv6: unused
+#define PTE_COW        0x800   // xv6: copy-on-write
 #define PTE_NX 0x8000000000000000ull // No-execute enable
 
 #define PGROUNDUP(sz)  (((sz)+PGSIZE-1) & ~(PGSIZE-1))
...
@@ -11,7 +11,6 @@ typedef uptr paddr;
 // Page Map Entry (refers to any entry in any level)
 typedef u64 pme_t;
-typedef pme_t pml4e_t;
 
 #ifdef XV6
 // POSIX types
...
@@ -213,7 +213,14 @@ vmap::copy(int share)
      scoped_acquire sae(&e->lock);
      e->va_type = COW;
-      updatepages(pml4, (void *) (e->vma_start), (void *) (e->vma_end), PTE_COW);
+      updatepages(pml4, e->vma_start, e->vma_end, [](atomic<pme_t>* p) {
+          for (;;) {
+            pme_t v = p->load();
+            if (!(v & PTE_P) || !(v & PTE_U) || !(v & PTE_W) ||
+                cmpxch(p, v, PTE_ADDR(v) | PTE_P | PTE_U | PTE_COW))
+              break;
+          }
+        });
     } else {
      ne->n = e->n->copy();
      ne->va_type = e->va_type;
@@ -292,7 +299,15 @@ vmap::insert(vmnode *n, uptr vma_start)
     span.replace(new range(&cr, vma_start, len, e, 0));
   }
 
-  updatepages(pml4, (void*) e->vma_start, (void*) (e->vma_end-1), 0);
+  updatepages(pml4, e->vma_start, e->vma_end, [](atomic<pme_t> *p) {
+      for (;;) {
+        pme_t v = p->load();
+        if (v & PTE_LOCK)
+          continue;
+        if (cmpxch(p, v, (pme_t) 0))
+          break;
+      }
+    });
   tlbflush();
   return 0;
 }
@@ -317,7 +332,15 @@ vmap::remove(uptr vma_start, uptr len)
     span.replace(0);
   }
 
-  updatepages(pml4, (void*) vma_start, (void*) (vma_start + len - 1), 0);
+  updatepages(pml4, vma_start, vma_start + len, [](atomic<pme_t> *p) {
+      for (;;) {
+        pme_t v = p->load();
+        if (v & PTE_LOCK)
+          continue;
+        if (cmpxch(p, v, (pme_t) 0))
+          break;
+      }
+    });
   tlbflush();
   return 0;
 }
@@ -327,7 +350,7 @@ vmap::remove(uptr vma_start, uptr len)
  */
 vma *
-vmap::pagefault_ondemand(uptr va, u32 err, vma *m, scoped_acquire *mlock)
+vmap::pagefault_ondemand(uptr va, vma *m, scoped_acquire *mlock)
 {
   if (m->n->allocpg() < 0)
     panic("pagefault: couldn't allocate pages");
@@ -342,7 +365,7 @@ vmap::pagefault_ondemand(uptr va, u32 err, vma *m, scoped_acquire *mlock)
 }
 
 int
-vmap::pagefault_wcow(uptr va, pme_t *pte, vma *m, u64 npg)
+vmap::pagefault_wcow(vma *m)
 {
   // Always make a copy of n, even if this process has the only ref,
   // because other processes may change ref count while this process
@@ -356,10 +379,16 @@ vmap::pagefault_wcow(uptr va, pme_t *pte, vma *m, u64 npg)
   c->ref = 1;
   m->va_type = PRIVATE;
   m->n = c;
 
   // Update the hardware page tables to reflect the change to the vma
-  updatepages(pml4, (void *) m->vma_start, (void *) m->vma_end, 0);
-  pte = walkpgdir(pml4, (const void *)va, 0);
-  *pte = v2p(m->n->page[npg]) | PTE_P | PTE_U | PTE_W;
+  updatepages(pml4, m->vma_start, m->vma_end, [](atomic<pme_t> *p) {
+      for (;;) {
+        pme_t v = p->load();
+        if (!(v & PTE_P) || cmpxch(p, v, v & ~PTE_P))
+          break;
+      }
+    });
 
   // drop my ref to vmnode
   n->decref();
   return 0;
@@ -368,11 +397,18 @@ vmap::pagefault_wcow(uptr va, pme_t *pte, vma *m, u64 npg)
 int
 vmap::pagefault(uptr va, u32 err)
 {
-  pme_t *pte = walkpgdir(pml4, (const void *)va, 1);
+  bool needflush = false;
+  atomic<pme_t> *pte = walkpgdir(pml4, va, 1);
+
+ retry:
+  (void) 0;
+  pme_t ptev = pte->load();
 
   // optimize checks of args to syscals
-  if((*pte & (PTE_P|PTE_U|PTE_W)) == (PTE_P|PTE_U|PTE_W))
+  if ((ptev & (PTE_P|PTE_U|PTE_W)) == (PTE_P|PTE_U|PTE_W)) {
+    // XXX using pagefault() as a security check in syscalls is prone to races
     return 0;
+  }
 
   scoped_gc_epoch gc;
   vma *m = lookup(va, 1);
@@ -383,25 +419,33 @@ vmap::pagefault(uptr va, u32 err)
   u64 npg = (PGROUNDDOWN(va) - m->vma_start) / PGSIZE;
   if (m->n && m->n->type == ONDEMAND && m->n->page[npg] == 0)
-    m = pagefault_ondemand(va, err, m, &mlock);
+    m = pagefault_ondemand(va, m, &mlock);
 
   if (vm_debug)
     cprintf("pagefault: err 0x%x va 0x%lx type %d ref %lu pid %d\n",
            err, va, m->va_type, m->n->ref.load(), myproc()->pid);
 
   if (m->va_type == COW && (err & FEC_WR)) {
-    if (pagefault_wcow(va, pte, m, npg) < 0)
+    if (pagefault_wcow(m) < 0)
      return -1;
-  } else if (m->va_type == COW) {
+    needflush = true;
+  }
+
+  if ((ptev & PTE_LOCK) || !cmpxch(pte, ptev, ptev | PTE_LOCK))
+    goto retry;
+
+  // XXX check if vma has been deleted, and if so, unlock & goto retry
+
+  if (m->va_type == COW) {
     *pte = v2p(m->n->page[npg]) | PTE_P | PTE_U | PTE_COW;
   } else {
-    if (m->n->ref != 1)
-      panic("pagefault");
+    assert(m->n->ref == 1);
     *pte = v2p(m->n->page[npg]) | PTE_P | PTE_U | PTE_W;
   }
 
-  // XXX(sbw) Why reload hardware page tables?
-  lcr3(v2p(pml4));  // Reload hardware page tables
+  mlock.release();
+  if (needflush)
+    tlbflush();
 
   return 1;
 }
...
@@ -2,6 +2,7 @@
 #include "atomic.hh"
 #include "crange.hh"
 #include "cpputil.hh"
+#include "hwvm.hh"
 
 using std::atomic;
@@ -52,7 +53,7 @@ struct vmap {
   struct spinlock lock;    // serialize map/lookup/unmap
   atomic<u64> ref;
   u64 alloc;
-  pml4e_t *pml4;           // Page table
+  pgmap *pml4;             // Page table
   char *kshared;
   char lockname[16];
@@ -69,6 +70,6 @@ struct vmap {
   int copyout(uptr va, void *p, u64 len);
 
  private:
-  vma* pagefault_ondemand(uptr va, u32 err, vma *m, scoped_acquire *mlock);
-  int pagefault_wcow(uptr va, pme_t *pte, vma *m, u64 npg);
+  vma* pagefault_ondemand(uptr va, vma *m, scoped_acquire *mlock);
+  int pagefault_wcow(vma *m);
 };