/* Copyright (c) 2010 The Regents of the University of California
* Barret Rhoden <brho@cs.berkeley.edu>
* See LICENSE for details.
*
* Block devices and generic blockdev infrastructure */
#include <devfs.h>
#include <blockdev.h>
#include <kmalloc.h>
#include <slab.h>
#include <page_alloc.h>
#include <pmap.h>
/* These two are needed for the fake interrupt */
#include <alarm.h>
#include <smp.h>

struct file_operations block_f_op;
struct page_map_operations block_pm_op;
struct kmem_cache *breq_kcache;

void block_init(void)
{
        breq_kcache = kmem_cache_create("block_reqs",
                                        sizeof(struct block_request),
                                        __alignof__(struct block_request), 0,
                                        NULL, 0, 0, NULL);
        bh_kcache = kmem_cache_create("buffer_heads",
                                      sizeof(struct buffer_head),
                                      __alignof__(struct buffer_head), 0,
                                      NULL, 0, 0, NULL);

#ifdef CONFIG_EXT2FS
        /* Now probe for and init the block device for the ext2 ram disk */
        extern uint8_t _binary_mnt_ext2fs_img_size[];
        extern uint8_t _binary_mnt_ext2fs_img_start[];

        /* Build and init the block device */
        struct block_device *ram_bd = kmalloc(sizeof(struct block_device), 0);

        memset(ram_bd, 0, sizeof(struct block_device));
        ram_bd->b_id = 31337;
        ram_bd->b_sector_sz = 512;
        ram_bd->b_nr_sector = (unsigned long)_binary_mnt_ext2fs_img_size / 512;
        kref_init(&ram_bd->b_kref, fake_release, 1);
        pm_init(&ram_bd->b_pm, &block_pm_op, ram_bd);
        ram_bd->b_data = _binary_mnt_ext2fs_img_start;
        strlcpy(ram_bd->b_name, "RAMDISK", BDEV_INLINE_NAME);

        /* Connect it to the file system */
        struct file *ram_bf = make_device("/dev_vfs/ramdisk",
                                          S_IRUSR | S_IWUSR, __S_IFBLK,
                                          &block_f_op);

        /* Make sure the inode tracks the right pm (not its internal one) */
        ram_bf->f_dentry->d_inode->i_mapping = &ram_bd->b_pm;
        ram_bf->f_dentry->d_inode->i_bdev = ram_bd; /* holds the bd kref */
        kref_put(&ram_bf->f_kref);
#endif /* CONFIG_EXT2FS */
}

/* Generic helper, returns a kref'd reference out of principle. */
struct block_device *get_bdev(char *path)
{
        struct block_device *bdev;
        struct file *block_f;

        block_f = do_file_open(path, O_RDWR, 0);
        assert(block_f);
        bdev = block_f->f_dentry->d_inode->i_bdev;
        kref_get(&bdev->b_kref, 1);
        kref_put(&block_f->f_kref);
        return bdev;
}
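
/* Example (a hypothetical caller): open the ramdisk that block_init() set up,
 * then drop the reference get_bdev() returned when finished with it:
 *
 *      struct block_device *bdev = get_bdev("/dev_vfs/ramdisk");
 *
 *      ... use bdev ...
 *      kref_put(&bdev->b_kref);
 */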

/* Frees all the BHs associated with page.  There could be 0, to deal with one
 * that wasn't UPTODATE.  Don't call this on a page that isn't a PG_BUFFER.
 * Note, these are not a circular LL (for now). */
void free_bhs(struct page *page)
{
        struct buffer_head *bh, *next;

        assert(atomic_read(&page->pg_flags) & PG_BUFFER);
        bh = (struct buffer_head*)page->pg_private;
        while (bh) {
                next = bh->bh_next;
                bh->bh_next = 0;
                kmem_cache_free(bh_kcache, bh);
                bh = next;
        }
        page->pg_private = 0;           /* catch bugs */
}
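
/* For reference: a PG_BUFFER page's BHs hang off pg_private in a singly
 * linked list, sorted by buffer address within the page, with one BH per
 * *requested* block (see bdev_get_buffer() below):
 *
 *      page->pg_private -> bh -> bh -> ... -> 0
 *
 * Each bh_buffer points at page2kva(page) plus that block's offset within the
 * page. */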

/* This ultimately will handle the actual request processing, all the way down
 * to the driver, and will deal with blocking.  For now, we just fulfill the
 * request right away (RAM based block devs). */
int bdev_submit_request(struct block_device *bdev, struct block_request *breq)
{
        void *src, *dst;
        unsigned long first_sector;
        unsigned int nr_sector;

        for (int i = 0; i < breq->nr_bhs; i++) {
                first_sector = breq->bhs[i]->bh_sector;
                nr_sector = breq->bhs[i]->bh_nr_sector;
                /* Sectors are indexed starting with 0, for now. */
                if (first_sector + nr_sector > bdev->b_nr_sector) {
                        warn("Exceeding the num sectors!");
                        return -1;
                }
                if (breq->flags & BREQ_READ) {
                        dst = breq->bhs[i]->bh_buffer;
                        src = bdev->b_data + (first_sector << SECTOR_SZ_LOG);
                } else if (breq->flags & BREQ_WRITE) {
                        dst = bdev->b_data + (first_sector << SECTOR_SZ_LOG);
                        src = breq->bhs[i]->bh_buffer;
                } else {
                        panic("Need a request type!\n");
                }
                memcpy(dst, src, nr_sector << SECTOR_SZ_LOG);
        }
        /* Faking the device interrupt with an alarm.  Note breq_handler() is
         * a GCC nested function; the breq is passed via waiter->data, since
         * the handler runs from the alarm after this frame is gone. */
        void breq_handler(struct alarm_waiter *waiter)
        {
                /* In the future, we'll need to figure out which breq this was
                 * in response to */
                struct block_request *breq =
                        (struct block_request*)waiter->data;

                if (breq->callback)
                        breq->callback(breq);
                kfree(waiter);
        }
        struct timer_chain *tchain = &per_cpu_info[core_id()].tchain;
        struct alarm_waiter *waiter = kmalloc(sizeof(struct alarm_waiter), 0);

        init_awaiter(waiter, breq_handler);
        /* Stitch things up, so we know how to find things later */
        waiter->data = breq;
        /* Set for 5ms. */
        set_awaiter_rel(waiter, 5000);
        set_alarm(tchain, waiter);
        return 0;
}
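
/* Design note: the 5ms alarm above stands in for a device interrupt.  When it
 * fires, breq_handler() calls breq->callback - typically generic_breq_done()
 * below, which ups the semaphore that sleep_on_breq() is blocked on. */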

/* Helper, unblocks someone blocked on sleep_on_breq(). */
void generic_breq_done(struct block_request *breq)
{
        int8_t irq_state = 0;

        if (!sem_up_irqsave(&breq->sem, &irq_state)) {
                /* This shouldn't happen anymore.  Let brho know if it does. */
                warn("[kernel] no one waiting on breq %p", breq);
        }
}

/* Helper, pairs with generic_breq_done().  Note we sleep here on a semaphore
 * instead of faking it with an alarm.  Ideally, this code will be the same
 * even for real block devices (that don't fake things with timer
 * interrupts). */
void sleep_on_breq(struct block_request *breq)
{
        int8_t irq_state = 0;

        /* Since printk takes a while, this may make you lose the race */
        printd("Sleeping on breq %p\n", breq);
        assert(irq_is_enabled());
        sem_down_irqsave(&breq->sem, &irq_state);
}
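
/* Example: a synchronous, single-BH read using the helpers above.  This is a
 * sketch of what bdev_get_buffer() does below; bdev and bh are assumed to be
 * set up already by the caller:
 *
 *      struct block_request *breq = kmem_cache_alloc(breq_kcache, 0);
 *      int error;
 *
 *      breq->flags = BREQ_READ;
 *      breq->callback = generic_breq_done;
 *      breq->data = 0;
 *      sem_init_irqsave(&breq->sem, 0);
 *      breq->bhs = breq->local_bhs;
 *      breq->bhs[0] = bh;
 *      breq->nr_bhs = 1;
 *      error = bdev_submit_request(bdev, breq);
 *      assert(!error);
 *      sleep_on_breq(breq);
 *      kmem_cache_free(breq_kcache, breq);
 */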

/* This just tells the page cache that it is 'up to date'.  Due to the nature
 * of the blocks in the page cache, we don't actually read the items in on
 * readpage; we read them in when a specific block is requested (see
 * bdev_get_buffer() below). */
int block_readpage(struct page_map *pm, struct page *page)
{
        atomic_or(&page->pg_flags, PG_UPTODATE);
        return 0;
}

/* Returns a BH pointing to the buffer where blk_num from bdev is located
 * (given blocks of size blk_sz).  This uses the page cache for the page
 * allocations and evictions, but only caches blocks that are requested.
 * Check the docs for more info.  The BH isn't refcounted, but a page refcnt
 * is returned.  Call bdev_put_buffer() when you're done (and/or
 * bdev_dirty_buffer() if you wrote to it).
 *
 * Note we're using lock_page() to sync, which is what we do with the page
 * cache too.  It's not ideal, but it keeps things simpler for now.
 *
 * Also note we're a little inconsistent with the use of sector sizes in
 * certain files.  We'll sort it out eventually. */
struct buffer_head *bdev_get_buffer(struct block_device *bdev,
                                    unsigned long blk_num,
                                    unsigned int blk_sz)
{
        struct page *page;
        struct page_map *pm = &bdev->b_pm;
        struct buffer_head *bh, *new, *prev, **next_loc;
        struct block_request *breq;
        int error;
        unsigned int blk_per_pg = PGSIZE / blk_sz;
        unsigned int sct_per_blk = blk_sz / bdev->b_sector_sz;
        unsigned int blk_offset = (blk_num % blk_per_pg) * blk_sz;
        void *my_buf;

        assert(blk_offset < PGSIZE);
        if (!blk_num)
                warn("Asking for the 0th block of a bdev...");
        /* Make sure there's a page in the page cache.  Should always be one. */
        error = pm_load_page(pm, blk_num / blk_per_pg, &page);
        if (error)
                panic("Failed to load page! (%d)", error);
        my_buf = page2kva(page) + blk_offset;
        atomic_or(&page->pg_flags, PG_BUFFER);
retry:
        bh = (struct buffer_head*)page->pg_private;
        prev = 0;
        /* Look through all the BHs for ours, stopping if we go too far. */
        while (bh) {
                if (bh->bh_buffer == my_buf) {
                        goto found;
                } else if (bh->bh_buffer > my_buf) {
                        break;
                }
                prev = bh;
                bh = bh->bh_next;
        }
        /* At this point, bh points to the one beyond our space (or 0), and
         * prev is either the one before us or 0.  We make a BH, and try to
         * insert it */
        new = kmem_cache_alloc(bh_kcache, 0);
        assert(new);
        new->bh_page = page;                    /* weak ref */
        new->bh_buffer = my_buf;
        new->bh_flags = 0;
        new->bh_next = bh;
        new->bh_bdev = bdev;                    /* uncounted ref */
        new->bh_sector = blk_num * sct_per_blk;
        new->bh_nr_sector = sct_per_blk;
        /* Try to insert the new one in place.  If it fails, retry the whole
         * "find the bh" process.  This should be rare, so no sense optimizing
         * it. */
        next_loc = prev ? &prev->bh_next
                        : (struct buffer_head**)&page->pg_private;
        /* Normally, there'd be an ABA problem here, but we never actually
         * remove bhs from the chain until the whole page gets cleaned up,
         * which can't happen while we hold a reference to the page. */
        if (!atomic_cas_ptr((void**)next_loc, bh, new)) {
                kmem_cache_free(bh_kcache, new);
                goto retry;
        }
        bh = new;
found:
        /* At this point, we have the BH for our buf, but it might not be up
         * to date, and there might be someone else trying to update it. */
        /* Is it already here and up to date?  If so, we're done */
        if (bh->bh_flags & BH_UPTODATE)
                return bh;
        /* If not, try to lock the page (could BLOCK).  Using this for
         * syncing. */
        lock_page(page);
        /* Double check: are we up to date?  If so, we're done */
        if (bh->bh_flags & BH_UPTODATE) {
                unlock_page(page);
                return bh;
        }
        /* If we're here, the page is locked by us, and we need to read the
         * block in */
        breq = kmem_cache_alloc(breq_kcache, 0);
        assert(breq);
        breq->flags = BREQ_READ;
        breq->callback = generic_breq_done;
        breq->data = 0;
        sem_init_irqsave(&breq->sem, 0);
        breq->bhs = breq->local_bhs;
        breq->bhs[0] = bh;
        breq->nr_bhs = 1;
        error = bdev_submit_request(bdev, breq);
        assert(!error);
        sleep_on_breq(breq);
        kmem_cache_free(breq_kcache, breq);
        /* After the data is read, we mark it up to date and unlock the page */
        bh->bh_flags |= BH_UPTODATE;
        unlock_page(page);
        return bh;
}

/* Will dirty the block/BH/page for the given block/buffer.  Will have to be
 * careful with the page reclaimer - if someone holds a reference, they can
 * still dirty it. */
void bdev_dirty_buffer(struct buffer_head *bh)
{
        struct page *page = bh->bh_page;

        /* TODO: race on flag modification */
        bh->bh_flags |= BH_DIRTY;
        atomic_or(&page->pg_flags, PG_DIRTY);
}

/* Decrefs the buffer from bdev_get_buffer().  Call this when you no longer
 * reference your block/buffer.  For now, we do refcnting on the page, since
 * the reclaiming will be in page sized chunks from the page cache. */
void bdev_put_buffer(struct buffer_head *bh)
{
        pm_put_page(bh->bh_page);
}
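
/* Example: read-modify-write of one block (a hypothetical caller; bdev comes
 * from get_bdev(), blk_num/off/src/amt are the caller's, and the blk_sz of
 * 1024 is just an example):
 *
 *      struct buffer_head *bh = bdev_get_buffer(bdev, blk_num, 1024);
 *
 *      memcpy(bh->bh_buffer + off, src, amt);
 *      bdev_dirty_buffer(bh);
 *      bdev_put_buffer(bh);
 */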

/* Block device page map ops: */
struct page_map_operations block_pm_op = {
        block_readpage,
};

/* Block device file ops: for now, we don't let you do much of anything */
struct file_operations block_f_op = {
        dev_c_llseek,
        0,
        0,
        kfs_readdir,            /* this will fail gracefully */
        dev_mmap,
        kfs_open,
        kfs_flush,
        kfs_release,
        0,                      /* fsync - makes no sense */
        kfs_poll,
        0,                      /* readv */
        0,                      /* writev */
        kfs_sendpage,
        kfs_check_flags,
};