Lab cow: Copy-on-Write Fork
Implement copy-on write(hard)
Your task is to implement copy-on-write fork in the xv6 kernel. You are done if your modified kernel executes both the cowtest and usertests programs successfully.
To help you test your implementation, we've provided an xv6 program called cowtest (source in user/cowtest.c). cowtest runs various tests, but even the first will fail on unmodified xv6. Thus, initially, you will see:
$ cowtest
simple: fork() failed
$
The "simple" test allocates more than half of available physical memory, and then fork()s. The fork fails because there is not enough free physical memory to give the child a complete copy of the parent's memory.
When you are done, your kernel should pass all the tests in both cowtest and usertests. That is:
$ cowtest
simple: ok
simple: ok
three: zombie!
ok
three: zombie!
ok
three: zombie!
ok
file: ok
ALL COW TESTS PASSED
$ usertests
...
ALL TESTS PASSED
$
Here's a reasonable plan of attack.
Modify uvmcopy() to map the parent's physical pages into the child, instead of allocating new pages. Clear
PTE_W
in the PTEs of both child and parent.Modify usertrap() to recognize page faults. When a page-fault occurs on a COW page, allocate a new page with kalloc(), copy the old page to the new page, and install the new page in the PTE with
PTE_W
set.Ensure that each physical page is freed when the last PTE reference to it goes away -- but not before. A good way to do this is to keep, for each physical page, a "reference count" of the number of user page tables that refer to that page. Set a page's reference count to one when
kalloc()
allocates it. Increment a page's reference count when fork causes a child to share the page, and decrement a page's count each time any process drops the page from its page table.kfree()
should only place a page back on the free list if its reference count is zero. It's OK to to keep these counts in a fixed-size array of integers. You'll have to work out a scheme for how to index the array and how to choose its size. For example, you could index the array with the page's physical address divided by 4096, and give the array a number of elements equal to highest physical address of any page placed on the free list bykinit()
in kalloc.c.Modify copyout() to use the same scheme as page faults when it encounters a COW page.
Some hints:
The lazy page allocation lab has likely made you familiar with much of the xv6 kernel code that's relevant for copy-on-write. However, you should not base this lab on your lazy allocation solution; instead, please start with a fresh copy of xv6 as directed above.
It may be useful to have a way to record, for each PTE, whether it is a COW mapping. You can use the RSW (reserved for software) bits in the RISC-V PTE for this.
usertests
explores scenarios thatcowtest
does not test, so don't forget to check that all tests pass for both.Some helpful macros and definitions for page table flags are at the end of
kernel/riscv.h
.If a COW page fault occurs and there's no free memory, the process should be killed.
实现cow(copy on write)写时复制
trap.c
//处理cow页面,0表示不是一个cow页面,1表示cow处理成功,-1表示处理失败
int
copyonwrite(pagetable_t pagetable,uint64 va)
{
uint64 va0 = PGROUNDDOWN(va);
if(va0 >= MAXVA){
return -1;
}
pte_t* pte = walk(pagetable,va0,0);
if(pte == 0){
return -1;
}
// 如果出错页面是一个cow页面
if((*pte & PTE_C) == PTE_C){
uint64 pa = PTE2PA(*pte);
int num = knum(PA2INDEX(PTE2PA(*pte)));
// 开辟并复制页面
if(num >= 1){
char* mem = kalloc();
// 退出进程
if(mem == 0) {
myproc()->killed = 1;
exit(-1);
}
memmove(mem,(char *) pa,PGSIZE);
uint flags = PTE_FLAGS(*pte);
// 新pte设置可写
flags |= PTE_W;
// 新pte取消cow
flags &= ~(PTE_C);
// 虚拟地址映射到新的物理地址
mappages(pagetable, va0, PGSIZE, (uint64) mem, flags);
// 原物理地址计数-1
kfree((void *)pa);
}else{
panic("cow:memory count should > 0");
}
return 1;
}else{
return 0;
}
}
//
// handle an interrupt, exception, or system call from user space.
// called from trampoline.S
//
void
usertrap(void)
{
int which_dev = 0;
if((r_sstatus() & SSTATUS_SPP) != 0)
panic("usertrap: not from user mode");
// send interrupts and exceptions to kerneltrap(),
// since we're now in the kernel.
w_stvec((uint64)kernelvec);
struct proc *p = myproc();
// save user program counter.
p->trapframe->epc = r_sepc();
if(r_scause() == 8){
// system call
if(p->killed)
exit(-1);
// sepc points to the ecall instruction,
// but we want to return to the next instruction.
p->trapframe->epc += 4;
// an interrupt will change sstatus &c registers,
// so don't enable until done with those registers.
intr_on();
syscall();
} else if((which_dev = devintr()) != 0){
// ok
} else if(r_scause() == 13 || r_scause() == 15){
//当且仅当被读取时,调用cow
if(copyonwrite(myproc()->pagetable,r_stval())<=0){
printf("usertrap(): unexpected scause %p pid=%d\n", r_scause(), p->pid);
printf(" sepc=%p stval=%p\n", r_sepc(), r_stval());
p->killed = 1;
}
}else{
printf("usertrap(): unexpected scause %p pid=%d\n", r_scause(), p->pid);
printf(" sepc=%p stval=%p\n", r_sepc(), r_stval());
p->killed = 1;
}
if(p->killed)
exit(-1);
// give up the CPU if this is a timer interrupt.
if(which_dev == 2)
yield();
usertrapret();
}
vm.c
int
mappages(pagetable_t pagetable, uint64 va, uint64 size, uint64 pa, int perm)
{
uint64 a, last;
pte_t *pte;
if(size == 0)
panic("mappages: size");
a = PGROUNDDOWN(va);
last = PGROUNDDOWN(va + size - 1);
for(;;){
if((pte = walk(pagetable, a, 1)) == 0)
return -1;
// 允许pte重新映射
/* if(*pte & PTE_V)
panic("mappages: remap");*/
*pte = PA2PTE(pa) | perm | PTE_V;
if(a == last)
break;
a += PGSIZE;
pa += PGSIZE;
}
return 0;
}
...
int
uvmcopy(pagetable_t old, pagetable_t new, uint64 sz)
{
pte_t *pte;
uint64 pa, i;
uint flags;
/* char *mem;*/
for(i = 0; i < sz; i += PGSIZE){
if((pte = walk(old, i, 0)) == 0)
panic("uvmcopy: pte should exist");
if((*pte & PTE_V) == 0)
panic("uvmcopy: page not present");
pa = PTE2PA(*pte);
// 计数+1
kcount(PA2INDEX(pa),1);
// 标记为cow页面
*pte |= PTE_C;
// 标记为不可写
*pte &= ~(PTE_W);
flags = PTE_FLAGS(*pte);
if(mappages(new, i, PGSIZE, pa, flags) != 0){
/* kfree(mem);*/
goto err;
}
}
return 0;
err:
uvmunmap(new, 0, i / PGSIZE, 1);
return -1;
}
...
int
copyout(pagetable_t pagetable, uint64 dstva, char *src, uint64 len)
{
uint64 n, va0, pa0;
while(len > 0){
va0 = PGROUNDDOWN(dstva);
// 调用cow
if(copyonwrite(pagetable,va0) < 0){
return -1;
}
pa0 = walkaddr(pagetable, va0);
if(pa0 == 0)
return -1;
n = PGSIZE - (dstva - va0);
if(n > len)
n = len;
memmove((void *)(pa0 + (dstva - va0)), src, n);
len -= n;
src += n;
dstva = va0 + PGSIZE;
}
return 0;
}
kalloc.c
// 物理内存计数
int memcount[PHYSTOP/PGSIZE] = {0};
// 操作memcount数组的锁
struct spinlock cnt_lock;
...
void
kfree(void *pa)
{
struct run *r;
if(((uint64)pa % PGSIZE) != 0 || (char*)pa < end || (uint64)pa >= PHYSTOP)
panic("kfree");
acquire(&cnt_lock);
int index = PA2INDEX((uint64)pa);
// 计数-1
if(memcount[index] >= 1){
memcount[index]--;
}
release(&cnt_lock);
// 如果计数仍大于0,直接返回
if(memcount[index] > 0){
return;
}
// Fill with junk to catch dangling refs.
memset(pa, 1, PGSIZE);
r = (struct run*)pa;
acquire(&kmem.lock);
r->next = kmem.freelist;
kmem.freelist = r;
release(&kmem.lock);
}
...
void *
kalloc(void)
{
struct run *r;
acquire(&kmem.lock);
r = kmem.freelist;
if(r){
kmem.freelist = r->next;
acquire(&cnt_lock);
// 如果新开辟的内存引用次数不为0,panic
if(memcount[PA2INDEX((uint64)r)]!=0){
panic("memory count should equal 0");
}
// 计数设置为1
memcount[PA2INDEX((uint64)r)] = 1;
release(&cnt_lock);
}
release(&kmem.lock);
if(r)
memset((char*)r, 5, PGSIZE); // fill with junk
return (void*)r;
}
// 为指定下标的物理内存修改引用次数
void
kcount(int index,int cnt)
{
acquire(&cnt_lock);
memcount[index]+=cnt;
release(&cnt_lock);
}
// 获取指定下标的引用次数
int
knum(int index)
{
acquire(&cnt_lock);
int num = memcount[index];
release(&cnt_lock);
return num;
}
riscv.h
// 将物理地址转换为引数组下标
#define PA2INDEX(pa) ((pa)>>12)
Q.E.D.