Commit 044966c8 authored by Nickolai Zeldovich

use atomic<> in page table structures

more flexible updatepages() interface
PTE locking (using one of the free PTE bits)
guard against concurrent unmaps (except no way to check if vma is deleted yet..)
Parent 49d7b69c
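The "PTE locking" mentioned above claims one of the software-available PTE bits (PTE_LOCK) with a compare-and-swap. Below is a minimal sketch of that pattern, assuming this tree's cmpxch(atomic<T>*, expected, desired) helper returns true on success, as the callers in this diff do; the names lock_pte/unlock_pte are illustrative and not part of the commit:

// Illustrative only (assumes atomic.hh and mmu.h from this tree, and
// using std::atomic). Take and release the per-PTE software lock bit.
static pme_t
lock_pte(atomic<pme_t> *pte)
{
  for (;;) {
    pme_t v = pte->load();
    if (v & PTE_LOCK)
      continue;                      // another CPU holds the lock; spin
    if (cmpxch(pte, v, v | PTE_LOCK))
      return v;                      // entry value as it was before locking
  }
}

static void
unlock_pte(atomic<pme_t> *pte, pme_t newval)
{
  pte->store(newval & ~PTE_LOCK);    // install the new entry, dropping the lock
}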
@@ -12,75 +12,51 @@
#include "vm.hh"
#include <stddef.h>
extern pml4e_t kpml4[];
using namespace std;
static pme_t*
descend(pme_t *dir, const void *va, u64 flags, int create, int level)
static pgmap*
descend(pgmap *dir, u64 va, u64 flags, int create, int level)
{
atomic<pme_t> *entryp;
pme_t entry;
pme_t *next;
pgmap *next;
retry:
dir = &dir[PX(level, va)];
entry = *dir;
entryp = &dir->e[PX(level, va)];
entry = entryp->load();
if (entry & PTE_P) {
next = (pme_t*) p2v(PTE_ADDR(entry));
next = (pgmap*) p2v(PTE_ADDR(entry));
} else {
if (!create)
return NULL;
next = (pme_t*) kalloc();
next = (pgmap*) kalloc();
if (!next)
return NULL;
memset(next, 0, PGSIZE);
if (!cmpswap(dir, entry, v2p(next) | PTE_P | PTE_W | flags)) {
if (!cmpxch(entryp, entry, v2p(next) | PTE_P | PTE_W | flags)) {
kfree((void*) next);
goto retry;
}
}
return next;
}
// Return the address of the PTE in page table pgdir
// that corresponds to linear address va. If create!=0,
// create any required page table pages.
pme_t *
walkpgdir(pml4e_t *pml4, const void *va, int create)
atomic<pme_t>*
walkpgdir(pgmap *pml4, u64 va, int create)
{
pme_t *pdp;
pme_t *pd;
pme_t *pt;
pdp = descend(pml4, va, PTE_U, create, 3);
auto pdp = descend(pml4, va, PTE_U, create, 3);
if (pdp == NULL)
return NULL;
pd = descend(pdp, va, PTE_U, create, 2);
auto pd = descend(pdp, va, PTE_U, create, 2);
if (pd == NULL)
return NULL;
pt = descend(pd, va, PTE_U, create, 1);
auto pt = descend(pd, va, PTE_U, create, 1);
if (pt == NULL)
return NULL;
return &pt[PX(0,va)];
}
void
updatepages(pme_t *pml4, void *begin, void *end, int perm)
{
char *a, *last;
pme_t *pte;
a = (char*) PGROUNDDOWN(begin);
last = (char*) PGROUNDDOWN(end);
for (;;) {
pte = walkpgdir(pml4, a, 1);
if(pte != 0) {
if (perm == 0) *pte = 0;
else *pte = PTE_ADDR(*pte) | perm | PTE_P;
}
if (a == last)
break;
a += PGSIZE;
}
return &pt->e[PX(0,va)];
}
// Map from 0 to 128Gbytes starting at KBASE.
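The new updatepages() implementation itself is not in the hunks shown here (its declaration presumably moves into hwvm.hh); the callers below now pass a lambda that receives each atomic<pme_t>*. A minimal sketch of such a callback-based interface, under that assumption:

// Sketch only; the flexible interface this commit's callers rely on is
// assumed to look roughly like this: visit every PTE covering [begin, end)
// and let the caller decide how to update each entry atomically.
template<class CB>
void
updatepages(pgmap *pml4, u64 begin, u64 end, CB fn)
{
  for (u64 a = PGROUNDDOWN(begin); a < PGROUNDUP(end); a += PGSIZE) {
    atomic<pme_t> *pte = walkpgdir(pml4, a, 1);
    if (pte != 0)
      fn(pte);
  }
}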
@@ -88,43 +64,43 @@ void
initpg(void)
{
extern char end[];
void *va = (void*)KBASE;
u64 va = KBASE;
paddr pa = 0;
while (va < (void*)(KBASE+(128ull<<30))) {
pme_t *pdp = descend(kpml4, va, 0, 1, 3);
pme_t *pd = descend(pdp, va, 0, 1, 2);
pme_t *sp = &pd[PX(1,va)];
while (va < (KBASE+(128ull<<30))) {
auto pdp = descend(&kpml4, va, 0, 1, 3);
auto pd = descend(pdp, va, 0, 1, 2);
atomic<pme_t> *sp = &pd->e[PX(1,va)];
u64 flags = PTE_W | PTE_P | PTE_PS;
// Set NX for non-code pages
if (va >= (void*) end)
if (va >= (u64) end)
flags |= PTE_NX;
*sp = pa | flags;
va = (char*)va + PGSIZE*512;
va = va + PGSIZE*512;
pa += PGSIZE*512;
}
}
// Set up kernel part of a page table.
pml4e_t*
pgmap*
setupkvm(void)
{
pml4e_t *pml4;
pgmap *pml4;
int k;
if((pml4 = (pml4e_t*)kalloc()) == 0)
if((pml4 = (pgmap*)kalloc()) == 0)
return 0;
k = PX(3, KBASE);
memset(&pml4[0], 0, 8*k);
memmove(&pml4[k], &kpml4[k], 8*(512-k));
memset(&pml4->e[0], 0, 8*k);
memmove(&pml4->e[k], &kpml4.e[k], 8*(512-k));
return pml4;
}
int
setupkshared(pml4e_t *pml4, char *kshared)
setupkshared(pgmap *pml4, char *kshared)
{
for (u64 off = 0; off < KSHAREDSIZE; off+=4096) {
pme_t *pte = walkpgdir(pml4, (void*)(KSHARED+off), 1);
atomic<pme_t> *pte = walkpgdir(pml4, (u64) (KSHARED+off), 1);
if (pte == NULL)
panic("setupkshared: oops");
*pte = v2p(kshared+off) | PTE_P | PTE_U | PTE_W;
@@ -137,7 +113,7 @@ setupkshared(pml4e_t *pml4, char *kshared)
void
switchkvm(void)
{
lcr3(v2p(kpml4)); // switch to the kernel page table
lcr3(v2p(&kpml4)); // switch to the kernel page table
}
// Switch TSS and h/w page table to correspond to process p.
@@ -159,14 +135,15 @@ switchuvm(struct proc *p)
}
static void
freepm(pme_t *pm, int level)
freepm(pgmap *pm, int level)
{
int i;
if (level != 0) {
for (i = 0; i < 512; i++) {
if (pm[i] & PTE_P)
freepm((pme_t*) p2v(PTE_ADDR(pm[i])), level - 1);
pme_t entry = pm->e[i];
if (entry & PTE_P)
freepm((pgmap*) p2v(PTE_ADDR(entry)), level - 1);
}
}
@@ -176,7 +153,7 @@ freepm(pme_t *pm, int level)
// Free a page table and all the physical memory pages
// in the user part.
void
freevm(pml4e_t *pml4)
freevm(pgmap *pml4)
{
int k;
int i;
@@ -187,8 +164,9 @@ freevm(pml4e_t *pml4)
// Don't free kernel portion of the pml4
k = PX(3, KBASE);
for (i = 0; i < k; i++) {
if (pml4[i] & PTE_P) {
freepm((pme_t*) p2v(PTE_ADDR(pml4[i])), 2);
pme_t entry = pml4->e[i];
if (entry & PTE_P) {
freepm((pgmap*) p2v(PTE_ADDR(entry)), 2);
}
}
@@ -4,6 +4,8 @@ extern "C" {
#include "kern_c.h"
}
#include "atomic.hh"
#include "hwvm.hh"
#include <stdarg.h>
#define KBASE 0xFFFFFF0000000000ull
@@ -109,13 +111,6 @@ class rcu_freed;
void gc_delayed(rcu_freed *);
#endif
// hwvm.c
void freevm(pml4e_t*);
pml4e_t* setupkvm(void);
int setupkshared(pml4e_t *pml4, char *kshared);
pme_t * walkpgdir(pml4e_t*, const void*, int);
void tlbflush(void);
// hz.c
void microdelay(u64);
u64 nsectime(void);
@@ -251,7 +246,6 @@ void uartintr(void);
void switchuvm(struct proc*);
void switchkvm(void);
int pagefault(struct vmap *, uptr, u32);
void updatepages(pml4e_t*, void*, void*, int);
// wq.c
int wq_trywork(void);
@@ -5,6 +5,7 @@
#include "kalloc.h"
#include "cpu.hh"
#include "amd64.h"
#include "hwvm.hh"
static volatile int bstate;
@@ -57,7 +58,6 @@ bootothers(void)
void
cmain(u64 mbmagic, u64 mbaddr)
{
extern pml4e_t kpml4[];
extern u64 cpuhz;
initpg();
@@ -93,7 +93,7 @@ cmain(u64 mbmagic, u64 mbaddr)
inituser(); // first user process
bootothers(); // start other processors
kpml4[0] = 0; // don't need 1 GB identity mapping anymore
kpml4.e[0] = 0; // don't need 1 GB identity mapping anymore
lcr3(rcr3());
scheduler();
@@ -15,8 +15,11 @@
#define PTE_A 0x020 // Accessed
#define PTE_D 0x040 // Dirty
#define PTE_PS 0x080 // Page Size
#define PTE_G 0x100 // Global
#define PTE_MBZ 0x180 // Bits must be zero
#define PTE_COW 0x800 // copy-on-write
#define PTE_LOCK 0x200 // xv6: lock
#define PTE_UNUSED 0x400 // xv6: unused
#define PTE_COW 0x800 // xv6: copy-on-write
#define PTE_NX 0x8000000000000000ull // No-execute enable
#define PGROUNDUP(sz) (((sz)+PGSIZE-1) & ~(PGSIZE-1))
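For context (a hardware fact, not part of the diff): bits 9-11 of an x86-64 page-table entry are ignored by the MMU and left available to software, which is why the kernel can claim them for its own flags:

// Software-available PTE bits used above:
//   bit 9  (0x200) -> PTE_LOCK    per-PTE lock
//   bit 10 (0x400) -> PTE_UNUSED  currently unassigned
//   bit 11 (0x800) -> PTE_COW     marks a copy-on-write mapping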
@@ -11,7 +11,6 @@ typedef uptr paddr;
// Page Map Entry (refers to any entry in any level)
typedef u64 pme_t;
typedef pme_t pml4e_t;
#ifdef XV6
// POSIX types
@@ -213,7 +213,14 @@ vmap::copy(int share)
scoped_acquire sae(&e->lock);
e->va_type = COW;
updatepages(pml4, (void *) (e->vma_start), (void *) (e->vma_end), PTE_COW);
updatepages(pml4, e->vma_start, e->vma_end, [](atomic<pme_t>* p) {
for (;;) {
pme_t v = p->load();
if (!(v & PTE_P) || !(v & PTE_U) || !(v & PTE_W) ||
cmpxch(p, v, PTE_ADDR(v) | PTE_P | PTE_U | PTE_COW))
break;
}
});
} else {
ne->n = e->n->copy();
ne->va_type = e->va_type;
@@ -292,7 +299,15 @@ vmap::insert(vmnode *n, uptr vma_start)
span.replace(new range(&cr, vma_start, len, e, 0));
}
updatepages(pml4, (void*) e->vma_start, (void*) (e->vma_end-1), 0);
updatepages(pml4, e->vma_start, e->vma_end, [](atomic<pme_t> *p) {
for (;;) {
pme_t v = p->load();
if (v & PTE_LOCK)
continue;
if (cmpxch(p, v, (pme_t) 0))
break;
}
});
tlbflush();
return 0;
}
@@ -317,7 +332,15 @@ vmap::remove(uptr vma_start, uptr len)
span.replace(0);
}
updatepages(pml4, (void*) vma_start, (void*) (vma_start + len - 1), 0);
updatepages(pml4, vma_start, vma_start + len, [](atomic<pme_t> *p) {
for (;;) {
pme_t v = p->load();
if (v & PTE_LOCK)
continue;
if (cmpxch(p, v, (pme_t) 0))
break;
}
});
tlbflush();
return 0;
}
@@ -327,7 +350,7 @@ vmap::remove(uptr vma_start, uptr len)
*/
vma *
vmap::pagefault_ondemand(uptr va, u32 err, vma *m, scoped_acquire *mlock)
vmap::pagefault_ondemand(uptr va, vma *m, scoped_acquire *mlock)
{
if (m->n->allocpg() < 0)
panic("pagefault: couldn't allocate pages");
@@ -342,7 +365,7 @@ vmap::pagefault_ondemand(uptr va, u32 err, vma *m, scoped_acquire *mlock)
}
int
vmap::pagefault_wcow(uptr va, pme_t *pte, vma *m, u64 npg)
vmap::pagefault_wcow(vma *m)
{
// Always make a copy of n, even if this process has the only ref,
// because other processes may change ref count while this process
@@ -356,10 +379,16 @@ vmap::pagefault_wcow(uptr va, pme_t *pte, vma *m, u64 npg)
c->ref = 1;
m->va_type = PRIVATE;
m->n = c;
// Update the hardware page tables to reflect the change to the vma
updatepages(pml4, (void *) m->vma_start, (void *) m->vma_end, 0);
pte = walkpgdir(pml4, (const void *)va, 0);
*pte = v2p(m->n->page[npg]) | PTE_P | PTE_U | PTE_W;
updatepages(pml4, m->vma_start, m->vma_end, [](atomic<pme_t> *p) {
for (;;) {
pme_t v = p->load();
if (!(v & PTE_P) || cmpxch(p, v, v & ~PTE_P))
break;
}
});
// drop my ref to vmnode
n->decref();
return 0;
@@ -368,11 +397,18 @@ vmap::pagefault(uptr va, u32 err)
int
vmap::pagefault(uptr va, u32 err)
{
pme_t *pte = walkpgdir(pml4, (const void *)va, 1);
bool needflush = false;
atomic<pme_t> *pte = walkpgdir(pml4, va, 1);
retry:
(void) 0;
pme_t ptev = pte->load();
// optimize checks of args to syscalls
if((*pte & (PTE_P|PTE_U|PTE_W)) == (PTE_P|PTE_U|PTE_W))
if ((ptev & (PTE_P|PTE_U|PTE_W)) == (PTE_P|PTE_U|PTE_W)) {
// XXX using pagefault() as a security check in syscalls is prone to races
return 0;
}
scoped_gc_epoch gc;
vma *m = lookup(va, 1);
@@ -383,25 +419,33 @@ vmap::pagefault(uptr va, u32 err)
u64 npg = (PGROUNDDOWN(va) - m->vma_start) / PGSIZE;
if (m->n && m->n->type == ONDEMAND && m->n->page[npg] == 0)
m = pagefault_ondemand(va, err, m, &mlock);
m = pagefault_ondemand(va, m, &mlock);
if (vm_debug)
cprintf("pagefault: err 0x%x va 0x%lx type %d ref %lu pid %d\n",
err, va, m->va_type, m->n->ref.load(), myproc()->pid);
if (m->va_type == COW && (err & FEC_WR)) {
if (pagefault_wcow(va, pte, m, npg) < 0)
if (pagefault_wcow(m) < 0)
return -1;
} else if (m->va_type == COW) {
needflush = true;
}
if ((ptev & PTE_LOCK) || !cmpxch(pte, ptev, ptev | PTE_LOCK))
goto retry;
// XXX check if vma has been deleted, and if so, unlock & goto retry
if (m->va_type == COW) {
*pte = v2p(m->n->page[npg]) | PTE_P | PTE_U | PTE_COW;
} else {
if (m->n->ref != 1)
panic("pagefault");
assert(m->n->ref == 1);
*pte = v2p(m->n->page[npg]) | PTE_P | PTE_U | PTE_W;
}
// XXX(sbw) Why reload hardware page tables?
lcr3(v2p(pml4)); // Reload hardware page tables
mlock.release();
if (needflush)
tlbflush();
return 1;
}
@@ -2,6 +2,7 @@
#include "atomic.hh"
#include "crange.hh"
#include "cpputil.hh"
#include "hwvm.hh"
using std::atomic;
@@ -52,7 +53,7 @@ struct vmap {
struct spinlock lock; // serialize map/lookup/unmap
atomic<u64> ref;
u64 alloc;
pml4e_t *pml4; // Page table
pgmap *pml4; // Page table
char *kshared;
char lockname[16];
@@ -69,6 +70,6 @@ struct vmap {
int copyout(uptr va, void *p, u64 len);
private:
vma* pagefault_ondemand(uptr va, u32 err, vma *m, scoped_acquire *mlock);
int pagefault_wcow(uptr va, pme_t *pte, vma *m, u64 npg);
vma* pagefault_ondemand(uptr va, vma *m, scoped_acquire *mlock);
int pagefault_wcow(vma *m);
};