/* Copyright (c) 2009 The Regents of the University of California.
* See the COPYRIGHT files at the top of this source tree for full
* license information.
*
* Barret Rhoden <brho@cs.berkeley.edu>
* Kevin Klues <klueska@cs.berkeley.edu> */

#ifdef __SHARC__
#pragma nosharc
#define SINIT(x) x
#endif

#include <sys/queue.h>
#include <page_alloc.h>
#include <pmap.h>
#include <kmalloc.h>
#include <multiboot.h>

spinlock_t colored_page_free_list_lock = SPINLOCK_INITIALIZER_IRQSAVE;
page_list_t LCKD(&colored_page_free_list_lock) * CT(llc_cache->num_colors) RO
colored_page_free_list = NULL;

static void page_alloc_bootstrap()
{
// Allocate space for the array required to manage the free lists
size_t list_size = llc_cache->num_colors*sizeof(page_list_t);
page_list_t LCKD(&colored_page_free_list_lock)*tmp =
(page_list_t*)boot_alloc(list_size,PGSIZE);
colored_page_free_list = SINIT(tmp);
for (int i = 0; i < llc_cache->num_colors; i++)
BSD_LIST_INIT(&colored_page_free_list[i]);
}
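
/* Illustrative sketch, not part of the original allocator: the per-color
 * bucket for a page is found by running its physical page number through the
 * LLC's color function.  This just names the lookup that track_free_page()
 * below performs inline. */
static inline page_list_t *page_to_color_list(struct page *page)
{
	return &colored_page_free_list[get_page_color(page2ppn(page),
	                                              llc_cache)];
}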

/* Can do whatever here. For now, our page allocator just works with colors,
* not NUMA zones or anything. */
static void track_free_page(struct page *page)
{
BSD_LIST_INSERT_HEAD(&colored_page_free_list[get_page_color(page2ppn(page),
llc_cache)],
page, pg_link);
nr_free_pages++;
/* Page was previously marked as busy; set it free explicitly */
page_setref(page, 0);
}
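
/* A minimal sketch of the reverse operation, shown only for illustration: pop
 * a page of a given color off its free list and mark it busy again.  This is
 * not the allocator proper; it assumes BSD_LIST_FIRST/BSD_LIST_REMOVE
 * counterparts to the list macros used above and that the caller holds
 * colored_page_free_list_lock. */
static inline struct page *untrack_free_page(size_t color)
{
	struct page *page = BSD_LIST_FIRST(&colored_page_free_list[color]);

	if (!page)
		return NULL;
	BSD_LIST_REMOVE(page, pg_link);
	nr_free_pages--;
	/* Coming off the free list, the page is busy again (refcnt 1) */
	page_setref(page, 1);
	return page;
}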

static struct page *pa64_to_page(uint64_t paddr)
{
return &pages[paddr >> PGSHIFT];
}

static bool pa64_is_in_kernel(uint64_t paddr)
{
extern char end[];
/* kernel is linked and loaded here (in kernel{32,64}.ld) */
return (EXTPHYSMEM <= paddr) && (paddr < PADDR(end));
}

/* Helper. For every page in the entry, this will determine whether or not the
* page is free, and handle accordingly. All pages are marked as busy by
* default, and we're just determining which of them could be free. */
static void parse_mboot_region(struct multiboot_mmap_entry *entry, void *data)
{
physaddr_t boot_freemem_paddr = (physaddr_t)data;
bool in_bootzone = (entry->addr <= boot_freemem_paddr) &&
(boot_freemem_paddr < entry->addr + entry->len);
if (entry->type != MULTIBOOT_MEMORY_AVAILABLE)
return;
/* TODO: we'll have some issues with jumbo allocation */
/* Most entries are page aligned, though on some machines below EXTPHYSMEM
* we may have some that aren't. If two regions collide on the same page
* (one of them starts unaligned), we need to only handle the page once, and
* err on the side of being busy.
*
* Since these regions happen below EXTPHYSMEM, they are all marked busy (or
* else we'll panic). I'll probably rewrite this for jumbos before I find a
* machine with unaligned mboot entries in higher memory. */
if (PGOFF(entry->addr))
assert(entry->addr < EXTPHYSMEM);
for (uint64_t i = ROUNDDOWN(entry->addr, PGSIZE);
i < entry->addr + entry->len;
i += PGSIZE) {
/* Skip pages we'll never map (above KERNBASE). Once we hit one of
* them, we know the rest are too (for this entry). */
if (i >= max_paddr)
return;
/* Mark low mem as busy (multiboot stuff is there, usually, too). Since
* that memory may be freed later (like the smp_boot page), we'll treat
* it like it is busy/allocated. */
if (i < EXTPHYSMEM)
continue;
/* Mark as busy pages already allocated in boot_alloc() */
if (in_bootzone && (i < boot_freemem_paddr))
continue;
/* Need to double check for the kernel, in case it wasn't in the
* bootzone. If it was in the bootzone, we already skipped it. */
if (pa64_is_in_kernel(i))
continue;
track_free_page(pa64_to_page(i));
}
}
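
/* As a concrete (hypothetical) example: for a qemu-style available entry
 * covering [0x100000, 0x7ffe0000), the loop above skips the pages holding the
 * kernel image and anything below the boot_alloc watermark, then hands every
 * remaining page to track_free_page().  Reserved entries, and anything below
 * EXTPHYSMEM, are left busy. */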

static void check_range(uint64_t start, uint64_t end, int expect)
{
int ref;
if (PGOFF(start))
printk("Warning: check_range given unaligned addr 0x%016llx\n", start);
for (uint64_t i = start; i < end; i += PGSIZE) {
ref = kref_refcnt(&pa64_to_page(i)->pg_kref);
if (ref != expect) {
printk("Error: while checking range [0x%016llx, 0x%016llx), "
"physaddr 0x%016llx refcnt was %d, expected %d\n", start,
end, i, ref, expect);
panic("");
}
}
}
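
/* For example, check_range(EXTPHYSMEM, PADDR(end), 1) verifies that every
 * page of the kernel image is still marked busy, while an expect of 0 checks
 * that a region made it onto the free lists; check_mboot_region() below uses
 * it both ways. */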

/* Note this doesn't check all of memory. There are some chunks of 'memory'
* that aren't reported by MB at all, like the VRAM sections at 0xa0000. */
static void check_mboot_region(struct multiboot_mmap_entry *entry, void *data)
{
extern char end[];
physaddr_t boot_freemem_paddr = (physaddr_t)data;
bool in_bootzone = (entry->addr <= boot_freemem_paddr) &&
(boot_freemem_paddr < entry->addr + entry->len);
/* Need to deal with 32b wrap-around */
uint64_t zone_end = MIN(entry->addr + entry->len, (uint64_t)max_paddr);
if (entry->type != MULTIBOOT_MEMORY_AVAILABLE) {
check_range(entry->addr, zone_end, 1);
return;
}
if (zone_end <= EXTPHYSMEM) {
check_range(entry->addr, zone_end, 1);
return;
}
/* this may include the kernel */
if (in_bootzone) {
/* boot_freemem might not be page aligned. If it's part-way through a
* page, that page should be busy */
check_range(entry->addr, ROUNDUP(PADDR(boot_freemem), PGSIZE), 1);
check_range(ROUNDUP(PADDR(boot_freemem), PGSIZE), zone_end, 0);
assert(zone_end == PADDR(boot_freelimit));
return;
}
/* kernel's range (hardcoded in the linker script). If we're checking now,
* it means the kernel is not in the same entry as the bootzone. */
if (entry->addr == EXTPHYSMEM) {
check_range(EXTPHYSMEM, PADDR(end), 1);
check_range(ROUNDUP(PADDR(end), PGSIZE), zone_end, 0);
return;
}
}

/* Since we can't parse multiboot mmap entries, we just have to guess which
 * pages are free and which ones aren't.
*
* Despite the lack of info from mbi, I know there is a magic hole in physical
* memory that we can't use, from the IOAPIC_PBASE on up [0xfec00000,
* 0xffffffff] (I'm being pessimistic). But, that's not pessimistic enough!
* Qemu still doesn't like that; using 0xe0000000 instead works for my machine.
* According to http://wiki.osdev.org/Memory_Map_(x86), some systems could
* reserve from [0xc0000000, 0xffffffff]. Anyway, in lieu of real memory
* detection, I'm just skipping that entire region.
*
* We may or may not have more free memory above this magic hole, depending on
* both the amount of RAM we have as well as 32 vs 64 bit.
*
* So we'll go with two free memory regions:
*
* [ 0, ROUNDUP(boot_freemem_paddr, PGSIZE) ) = busy
* [ ROUNDUP(boot_freemem_paddr, PGSIZE), TOP_OF_1 ) = free
* [ MAGIC_HOLE, 0x0000000100000000 ) = busy
* (and maybe this:)
* [ 0x0000000100000000, max_paddr ) = free
*
* where TOP_OF_1 is the min of IOAPIC_PBASE and max_paddr.
*
* For the busy regions, I don't actually need to mark the pages as busy. They
* were marked busy when the pages array was created (same as when we parse
* multiboot info). I'll just assert that they are properly marked as busy.
*
* As with parsing mbi regions, this will ignore the hairy areas below
* EXTPHYSMEM, and mark the entire kernel and anything we've boot alloc'd as
* busy. */
static void account_for_pages(physaddr_t boot_freemem_paddr)
{
physaddr_t top_of_busy = ROUNDUP(boot_freemem_paddr, PGSIZE);
physaddr_t top_of_free_1 = MIN(0xc0000000, max_paddr);
physaddr_t start_of_free_2;
printk("Warning: poor memory detection (qemu?). May lose 1GB of RAM\n");
for (physaddr_t i = 0; i < top_of_busy; i += PGSIZE)
assert(kref_refcnt(&pa64_to_page(i)->pg_kref) == 1);
for (physaddr_t i = top_of_busy; i < top_of_free_1; i += PGSIZE)
track_free_page(pa64_to_page(i));
/* If max_paddr is less than the start of our potential second free mem
* region, we can just leave. We also don't want to poke around the pages
* array either (and accidentally run off the end of the array).
*
* Additionally, 32 bit doesn't acknowledge pmem above the 4GB mark. */
start_of_free_2 = 0x0000000100000000;
if (max_paddr < start_of_free_2)
return;
for (physaddr_t i = top_of_free_1; i < start_of_free_2; i += PGSIZE)
assert(kref_refcnt(&pa64_to_page(i)->pg_kref) == 1);
for (physaddr_t i = start_of_free_2; i < max_paddr; i += PGSIZE)
track_free_page(pa64_to_page(i));
}
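
/* Worked example with hypothetical sizes: with 2GB of RAM, max_paddr is
 * 0x80000000, so only the first free region exists:
 * [ROUNDUP(boot_freemem_paddr, PGSIZE), 0x80000000).  If RAM is instead
 * remapped above 4GB so that max_paddr is, say, 0x140000000, we also free
 * [0x100000000, 0x140000000), while [0xc0000000, 0x100000000) stays busy as
 * part of the magic hole. */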

/* Initialize the memory free lists. After this, do not use boot_alloc. */
void page_alloc_init(struct multiboot_info *mbi)
{
page_alloc_bootstrap();
/* First, we need to initialize the pages array such that all memory is busy
* by default.
*
* To init the free list(s), each page that is already allocated/busy will
* remain increfed. All other pages that were reported as 'free' will be
* added to a free list. Their refcnts are set to 0.
*
* To avoid a variety of headaches, any memory below 1MB is considered busy.
* Likewise, everything in the kernel up to _end is busy, as is everything
* we've already boot_alloc'd, even though multiboot reports those chunks
* as 'free'.
*
* We'll also abort the mapping for any addresses over max_paddr, since
* we'll never use them. 'pages' does not track them either.
*
* One special note: we actually use the memory at 0x1000 for smp_boot.
* It'll get set to 'used' like the others; just FYI.
*
* Finally, if we want to use actual jumbo page allocation (not just
* mapping), we need to round up _end, and make sure all of multiboot's
* sections are jumbo-aligned. */
physaddr_t boot_freemem_paddr = PADDR(ROUNDUP(boot_freemem, PGSIZE));
for (long i = 0; i < max_nr_pages; i++)
page_setref(&pages[i], 1);
if (mboot_has_mmaps(mbi)) {
mboot_foreach_mmap(mbi, parse_mboot_region, (void*)boot_freemem_paddr);
/* Test the page alloc - if this gets slow, we can CONFIG it */
mboot_foreach_mmap(mbi, check_mboot_region, (void*)boot_freemem_paddr);
} else {
/* No multiboot mmap regions (probably run from qemu with -kernel) */
account_for_pages(boot_freemem_paddr);
}
printk("Number of free pages: %lu\n", nr_free_pages);
printk("Page alloc init successful\n");
}
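
/* After page_alloc_init() returns, a minimal consumer of these lists
 * (sketched only for illustration; the real allocation paths live elsewhere)
 * would take the IRQ-saving lock, pop a page, and release the lock, e.g. via
 * the untrack_free_page() sketch above (assuming the usual
 * spin_lock_irqsave()/spin_unlock_irqsave() helpers):
 *
 *	spin_lock_irqsave(&colored_page_free_list_lock);
 *	struct page *pg = untrack_free_page(0);
 *	spin_unlock_irqsave(&colored_page_free_list_lock);
 *
 * This just shows how the lock and the per-color lists set up here are meant
 * to be used together. */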