kern/src/ns/qio.c - upstream - Git at Google

 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
  * Portions Copyright © 1997-1999 Vita Nuova Limited
  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
  *                                (www.vitanuova.com)
  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
  *
  * Modified for the Akaros operating system:
  * Copyright (c) 2013-2014 The Regents of the University of California
  * Copyright (c) 2013-2015 Google Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
  * in the Software without restriction, including without limitation the rights
  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  * copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice shall be included in
  * all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE. */

 #include <slab.h>
 #include <kmalloc.h>
 #include <kref.h>
 #include <string.h>
 #include <stdio.h>
 #include <assert.h>
 #include <error.h>
 #include <cpio.h>
 #include <pmap.h>
 #include <smp.h>
 #include <net/ip.h>

 static uint32_t padblockcnt;
 static uint32_t concatblockcnt;
 static uint32_t pullupblockcnt;
 static uint32_t copyblockcnt;
 static uint32_t consumecnt;
 static uint32_t producecnt;
 static uint32_t qcopycnt;

 static int debugging;

 #define QDEBUG	if(0)

 /*
  *  IO queues
  */

 struct queue {
 	spinlock_t lock;;

 	struct block *bfirst;		/* buffer */
 	struct block *blast;

 	int dlen;			/* data bytes in queue */
 	int limit;			/* max bytes in queue */
 	int inilim;			/* initial limit */
 	int state;
 	int eof;			/* number of eofs read by user */
 	size_t bytes_read;

 	void (*kick) (void *);		/* restart output */
 	void (*bypass) (void *, struct block *); /* bypass queue altogether */
 	void *arg;			/* argument to kick */

 	struct rendez rr;		/* process waiting to read */
 	struct rendez wr;		/* process waiting to write */
 	qio_wake_cb_t wake_cb;		/* callbacks for qio wakeups */
 	void *wake_data;

 	char err[ERRMAX];
 };

 enum {
 	Maxatomic = 64 * 1024,
 	QIO_CAN_ERR_SLEEP = (1 << 0),	/* can throw errors or block/sleep */
 	QIO_LIMIT = (1 << 1),		/* respect q->limit */
 	QIO_DROP_OVERFLOW = (1 << 2),	/* alternative to qdropoverflow */
 	QIO_JUST_ONE_BLOCK = (1 << 3),	/* when qbreading, just get one block */
 	QIO_NON_BLOCK = (1 << 4),	/* throw EAGAIN instead of blocking */
 	QIO_DONT_KICK = (1 << 5),	/* don't kick when waking */
 };

 unsigned int qiomaxatomic = Maxatomic;

 static ssize_t __qbwrite(struct queue *q, struct block *b, int flags);
 static struct block *__qbread(struct queue *q, size_t len, int qio_flags,
                               int mem_flags);
 static bool qwait_and_ilock(struct queue *q, int qio_flags);

 /* Helper: fires a wake callback, sending 'filter' */
 static void qwake_cb(struct queue *q, int filter)
 {
 	if (q->wake_cb)
 		q->wake_cb(q, q->wake_data, filter);
 }

 void ixsummary(void)
 {
 	debugging ^= 1;
 	printd("pad %lu, concat %lu, pullup %lu, copy %lu\n",
 		   padblockcnt, concatblockcnt, pullupblockcnt, copyblockcnt);
 	printd("consume %lu, produce %lu, qcopy %lu\n",
 		   consumecnt, producecnt, qcopycnt);
 }

 /* Pad a block to the front (or the back if size is negative).  Returns the
  * block pointer that you must use instead of bp.  i.e. bp = padblock(bp, x);
  *
  * This space is in the main body / header space, not the extra data.  In
  * essence, the block has size bytes of uninitialized data placed in the front
  * (left) of the old header data.  The caller needs to fill in that data,
  * presumably with packet headers.  Any block[offset] in the old block is now at
  * block[offset + size].
  *
  * Negative padding applies at the end of the block.  This means that there is
  * space in block header for size bytes (in between wp and lim).  Given that all
  * of the block 'padding' needs to be in the main header, that means we need to
  * linearize the entire block, such that the end padding is in the main
  * body/header space.
  */
 struct block *padblock(struct block *bp, int size)
 {
 	int n;
 	struct block *nbp;

 	QDEBUG checkb(bp, "padblock 1");
 	if (size >= 0) {
 		if (bp->rp - bp->base >= size) {
 			block_add_to_offsets(bp, size);
 			bp->rp -= size;
 			return bp;
 		}
 		if (bp->next)
 			panic("%s %p had a next", __func__, bp);
 		n = BHLEN(bp);
 		padblockcnt++;
 		nbp = block_alloc(size + n, MEM_WAIT);
 		block_copy_metadata(nbp, bp);
 		block_replace_extras(nbp, bp);
 		/* This effectively copies the old block main body such that we
 		 * know we have size bytes to the left of rp.  All of the
 		 * metadata offsets (e.g. tx_csum) are relative from this blob
 		 * of data: i.e. nbp->rp + size. */
 		nbp->rp += size;
 		nbp->wp = nbp->rp;
 		memmove(nbp->wp, bp->rp, n);
 		nbp->wp += n;
 		freeb(bp);
 		block_add_to_offsets(nbp, size);
 		nbp->rp -= size;
 	} else {
 		/* No one I know of calls this yet, so this is untested.  Maybe
 		 * we can remove it. */
 		warn_once("pad attempt with negative size %d", size);
 		size = -size;
 		if (bp->next)
 			panic("%s %p had a next", __func__, bp);
 		if (bp->lim - bp->wp >= size)
 			return bp;
 		/* Negative padding goes after all data.  In essence, we'll need
 		 * to pull all block extra data up into the headers.  The
 		 * easiest thing is to linearize, then do the old algorithm.  We
 		 * may do extra allocations - if we ever use this, we can fix it
 		 * up.  Maybe make linearizeblock/copyblock take neg-padding. */
 		if (bp->extra_len) {
 			bp = linearizeblock(bp);
 			if (bp->lim - bp->wp >= size)
 				return bp;
 		}
 		n = BLEN(bp);
 		padblockcnt++;
 		nbp = block_alloc(size + n, MEM_WAIT);
 		block_copy_metadata(nbp, bp);
 		memmove(nbp->wp, bp->rp, n);
 		nbp->wp += n;
 		freeb(bp);
 	}
 	QDEBUG checkb(nbp, "padblock 1");
 	return nbp;
 }

 /*
  *  return count of bytes in a string of blocks
  */
 int blocklen(struct block *bp)
 {
 	int len;

 	len = 0;
 	while (bp) {
 		len += BLEN(bp);
 		bp = bp->next;
 	}
 	return len;
 }

 /*
  * return count of space in blocks
  */
 int blockalloclen(struct block *bp)
 {
 	int len;

 	len = 0;
 	while (bp) {
 		len += BALLOC(bp);
 		bp = bp->next;
 	}
 	return len;
 }

 /*
  *  copy the  string of blocks into
  *  a single block and free the string
  */
 struct block *concatblock(struct block *bp)
 {
 	if (bp->next == 0)
 		return bp;
 	/* If this is slow, we can get fancy and append a bunch of ebds to bp
 	 * for each subsequent block in the list. */
 	return pullupblock(bp, blocklen(bp));
 }

 /* Makes an identical copy of the block, collapsing all the data into the block
  * body.  It does not point to the contents of the original, it is a copy
  * (unlike qclone).  Since we're copying, we might as well put the memory into
  * one contiguous chunk. */
 struct block *copyblock(struct block *bp, int mem_flags)
 {
 	struct block *newb;
 	struct extra_bdata *ebd;
 	size_t amt;

 	QDEBUG checkb(bp, "copyblock 0");
 	newb = block_alloc(BLEN(bp), mem_flags);
 	if (!newb)
 		return 0;
 	amt = block_copy_to_body(newb, bp->rp, BHLEN(bp));
 	assert(amt == BHLEN(bp));
 	for (int i = 0; i < bp->nr_extra_bufs; i++) {
 		ebd = &bp->extra_data[i];
 		if (!ebd->base || !ebd->len)
 			continue;
 		amt = block_copy_to_body(newb, (void*)ebd->base + ebd->off,
 					 ebd->len);
 		assert(amt == ebd->len);
 	}
 	block_copy_metadata(newb, bp);
 	copyblockcnt++;
 	QDEBUG checkb(newb, "copyblock 1");
 	return newb;
 }

 /* Returns a block with the remaining contents of b all in the main body of the
  * returned block.  Replace old references to b with the returned value (which
  * may still be 'b', if no change was needed. */
 struct block *linearizeblock(struct block *b)
 {
 	struct block *newb;

 	if (!b->extra_len)
 		return b;
 	newb = copyblock(b, MEM_WAIT);
 	freeb(b);
 	return newb;
 }

 /* Helper for bookkeeping: we removed amt bytes from block b's ebd, which may
  * have emptied it. */
 static void block_consume_ebd_bytes(struct block *b, struct extra_bdata *ebd,
 				    size_t amt)
 {
 	ebd->len -= amt;
 	ebd->off += amt;
 	b->extra_len -= amt;
 	if (!ebd->len) {
 		/* we don't actually have to decref here.  it's also
 		 * done in freeb().  this is the earliest we can free. */
 		kfree((void*)ebd->base);
 		ebd->base = ebd->off = 0;
 	}
 }

 /* Helper: Yanks up to size bytes worth of extra data blocks from 'from', puts
  * them in the main body of 'to'.  Returns the amount yanked.  Will panic if
  * there is not enough room in 'to'. */
 static size_t block_yank_ebd_bytes(struct block *to, struct block *from,
 				   size_t amt)
 {
 	size_t copy_amt, total = 0;
 	struct extra_bdata *ebd;

 	for (int i = 0; amt && i < from->nr_extra_bufs; i++) {
 		ebd = &from->extra_data[i];
 		if (!ebd->base || !ebd->len)
 			continue;
 		copy_amt = MIN(amt, ebd->len);
 		copy_amt = block_copy_to_body(to, (void*)ebd->base + ebd->off,
 					      copy_amt);
 		if (copy_amt != MIN(amt, ebd->len))
 			panic("'to' block didn't have enough space! %d %d %d",
 			      amt, ebd->len, copy_amt);
 		block_consume_ebd_bytes(from, ebd, copy_amt);
 		total += copy_amt;
 		amt -= copy_amt;
 	}
 	return total;
 }

 /* Make sure the first block has at least n bytes in its headers/main body.
  * Pulls up data from the *list* of blocks.  Returns 0 if there is not enough
  * data in the block list.
  *
  * Any data to the *left* of rp, such as old network headers that were popped
  * off when processing an inbound packet, may be discarded. */
 struct block *pullupblock(struct block *bp, size_t n)
 {
 	struct block *i;
 	struct extra_bdata *ebd;
 	size_t copy_amt;

 	/*
 	 *  this should almost always be true, it's
 	 *  just to avoid every caller checking.
 	 */
 	if (BHLEN(bp) >= n)
 		return bp;

 	/* If there's no chance, just bail out now.  This might be slightly
 	 * wasteful if there's a long blist that does have enough data. */
 	if (n > blocklen(bp))
 		return 0;

 	/* Replace bp with a new block with enough size, and yank up enough of
 	 * its ebds. */
 	bp = block_realloc(bp, n);
 	pullupblockcnt++;
 	n -= BHLEN(bp);
 	n -= block_yank_ebd_bytes(bp, bp, n);
 	/* Need to pull from the remainder of the list, both the main bodies and
 	 * the ebds. */
 	while (n) {
 		i = bp->next;
 		if (!i)
 			panic("Not enough b's in the list; we checked the len");
 		copy_amt = MIN(BHLEN(i), n);
 		block_copy_to_body(bp, i->rp, copy_amt);
 		/* That may or may not have consumed all of the main body */
 		i->rp += copy_amt;
 		/* Subsequent blocks with offsets shouldn't be subject to
 		 * pullup. */
 		warn_on(i->tx_csum_offset || i->network_offset ||
 			i->transport_offset);
 		n -= copy_amt;
 		n -= block_yank_ebd_bytes(bp, i, n);
 		if (!BLEN(i)) {
 			bp->next = i->next;
 			i->next = NULL;
 			freeb(i);
 		} else {
 			assert(!n);
 		}
 	}
 	return bp;
 }

 /*
  *  make sure the first block has at least n bytes in its main body
  */
 struct block *pullupqueue(struct queue *q, size_t n)
 {
 	struct block *b;

 	/* TODO: lock to protect the queue links? */
 	if ((BHLEN(q->bfirst) >= n))
 		return q->bfirst;
 	/* This is restoring qio metadata.  If pullupblock did any work, it
 	 * changed the list - at least the first block and possibly the last in
 	 * the blist. */
 	q->bfirst = pullupblock(q->bfirst, n);
 	for (b = q->bfirst; b != NULL && b->next != NULL; b = b->next) ;
 	q->blast = b;
 	return q->bfirst;
 }

 /* throw away count bytes from the front of
  * block's extradata.  Returns count of bytes
  * thrown away
  */

 static int pullext(struct block *bp, int count)
 {
 	struct extra_bdata *ed;
 	int i, rem, bytes = 0;

 	for (i = 0; bp->extra_len && count && i < bp->nr_extra_bufs; i++) {
 		ed = &bp->extra_data[i];
 		rem = MIN(count, ed->len);
 		bp->extra_len -= rem;
 		count -= rem;
 		bytes += rem;
 		ed->off += rem;
 		ed->len -= rem;
 		if (ed->len == 0) {
 			kfree((void *)ed->base);
 			ed->base = 0;
 			ed->off = 0;
 		}
 	}
 	return bytes;
 }

 /* throw away count bytes from the end of a
  * block's extradata.  Returns count of bytes
  * thrown away
  */

 static int dropext(struct block *bp, int count)
 {
 	struct extra_bdata *ed;
 	int i, rem, bytes = 0;

 	for (i = bp->nr_extra_bufs - 1; bp->extra_len && count && i >= 0; i--) {
 		ed = &bp->extra_data[i];
 		rem = MIN(count, ed->len);
 		bp->extra_len -= rem;
 		count -= rem;
 		bytes += rem;
 		ed->len -= rem;
 		if (ed->len == 0) {
 			kfree((void *)ed->base);
 			ed->base = 0;
 			ed->off = 0;
 		}
 	}
 	return bytes;
 }

 /*
  *  throw away up to count bytes from a
  *  list of blocks.  Return count of bytes
  *  thrown away.
  */
 static int _pullblock(struct block **bph, int count, int free)
 {
 	struct block *bp;
 	int n, bytes;

 	bytes = 0;
 	if (bph == NULL)
 		return 0;

 	while (*bph != NULL && count != 0) {
 		bp = *bph;

 		n = MIN(BHLEN(bp), count);
 		bytes += n;
 		count -= n;
 		bp->rp += n;
 		n = pullext(bp, count);
 		bytes += n;
 		count -= n;
 		QDEBUG checkb(bp, "pullblock ");
 		if (BLEN(bp) == 0 && (free || count)) {
 			*bph = bp->next;
 			bp->next = NULL;
 			freeb(bp);
 		}
 	}
 	return bytes;
 }

 int pullblock(struct block **bph, int count)
 {
 	return _pullblock(bph, count, 1);
 }

 /*
  *  trim to len bytes starting at offset
  */
 struct block *trimblock(struct block *bp, int offset, int len)
 {
 	uint32_t l, trim;
 	int olen = len;

 	QDEBUG checkb(bp, "trimblock 1");
 	if (blocklen(bp) < offset + len) {
 		freeblist(bp);
 		return NULL;
 	}

 	l =_pullblock(&bp, offset, 0);
 	if (bp == NULL)
 		return NULL;
 	if (l != offset) {
 		freeblist(bp);
 		return NULL;
 	}

 	while ((l = BLEN(bp)) < len) {
 		len -= l;
 		bp = bp->next;
 	}

 	trim = BLEN(bp) - len;
 	trim -= dropext(bp, trim);
 	bp->wp -= trim;

 	if (bp->next) {
 		freeblist(bp->next);
 		bp->next = NULL;
 	}
 	return bp;
 }

 /* Adjust block @bp so that its size is exactly @len.
  * If the size is increased, fill in the new contents with zeros.
  * If the size is decreased, discard some of the old contents at the tail. */
 struct block *adjustblock(struct block *bp, int len)
 {
 	struct extra_bdata *ebd;
 	void *buf;
 	int i;

 	if (len < 0) {
 		freeb(bp);
 		return NULL;
 	}

 	if (len == BLEN(bp))
 		return bp;

 	/* Shrink within block main body. */
 	if (len <= BHLEN(bp)) {
 		free_block_extra(bp);
 		bp->wp = bp->rp + len;
 		QDEBUG checkb(bp, "adjustblock 1");
 		return bp;
 	}

 	/* Need to grow. */
 	if (len > BLEN(bp)) {
 		/* Grow within block main body. */
 		if (bp->extra_len == 0 && bp->rp + len <= bp->lim) {
 			memset(bp->wp, 0, len - BLEN(bp));
 			bp->wp = bp->rp + len;
 			QDEBUG checkb(bp, "adjustblock 2");
 			return bp;
 		}
 		/* Grow with extra data buffers. */
 		buf = kzmalloc(len - BLEN(bp), MEM_WAIT);
 		block_append_extra(bp, (uintptr_t)buf, 0, len - BLEN(bp),
 				   MEM_WAIT);
 		QDEBUG checkb(bp, "adjustblock 3");
 		return bp;
 	}

 	/* Shrink extra data buffers.
 	 * len is how much of ebd we need to keep.
 	 * extra_len is re-accumulated. */
 	assert(bp->extra_len > 0);
 	len -= BHLEN(bp);
 	bp->extra_len = 0;
 	for (i = 0; i < bp->nr_extra_bufs; i++) {
 		ebd = &bp->extra_data[i];
 		if (len <= ebd->len)
 			break;
 		len -= ebd->len;
 		bp->extra_len += ebd->len;
 	}
 	/* If len becomes zero, extra_data[i] should be freed. */
 	if (len > 0) {
 		ebd = &bp->extra_data[i];
 		ebd->len = len;
 		bp->extra_len += ebd->len;
 		i++;
 	}
 	for (; i < bp->nr_extra_bufs; i++) {
 		ebd = &bp->extra_data[i];
 		if (ebd->base)
 			kfree((void*)ebd->base);
 		ebd->base = ebd->off = ebd->len = 0;
 	}
 	QDEBUG checkb(bp, "adjustblock 4");
 	return bp;
 }

 /* Helper: removes and returns the first block from q */
 static struct block *pop_first_block(struct queue *q)
 {
 	struct block *b = q->bfirst;

 	q->dlen -= BLEN(b);
 	q->bytes_read += BLEN(b);
 	q->bfirst = b->next;
 	b->next = 0;
 	return b;
 }

 /* Accounting helper.  Block b in q lost amt extra_data */
 static void block_and_q_lost_extra(struct block *b, struct queue *q, size_t amt)
 {
 	b->extra_len -= amt;
 	q->dlen -= amt;
 	q->bytes_read += amt;
 }

 /* Helper: moves ebd from a block (in from_q) to another block.  The *ebd is
  * fixed in 'from', so we move its contents and zero it out in 'from'.
  *
  * Returns the length moved (0 on failure). */
 static size_t move_ebd(struct extra_bdata *ebd, struct block *to,
                        struct block *from, struct queue *from_q)
 {
 	size_t ret = ebd->len;

 	if (block_append_extra(to, ebd->base, ebd->off, ebd->len, MEM_ATOMIC))
 		return 0;
 	block_and_q_lost_extra(from, from_q, ebd->len);
 	ebd->base = ebd->len = ebd->off = 0;
 	return ret;
 }

 /* Copy up to len bytes from q->bfirst to @to, leaving the block in place.  May
  * return with less than len, but greater than 0, even if there is more
  * available in q.
  *
  * At any moment that we have copied anything and things are tricky, we can just
  * return.  The trickiness comes from a bunch of variables: is the main body
  * empty?  How do we split the ebd?  If our alloc fails, then we can fall back
  * to @to's main body, but only if we haven't used it yet. */
 static size_t copy_from_first_block(struct queue *q, struct block *to,
                                     size_t len)
 {
 	struct block *from = q->bfirst;
 	size_t copy_amt, amt;
 	struct extra_bdata *ebd;

 	assert(len < BLEN(from));	/* sanity */
 	/* Try to extract from the main body */
 	copy_amt = MIN(BHLEN(from), len);
 	if (copy_amt) {
 		copy_amt = block_copy_to_body(to, from->rp, copy_amt);
 		from->rp += copy_amt;
 		/* We only change dlen, (data len), not q->len, since the q
 		 * still has the same block memory allocation (no kfrees
 		 * happened) */
 		q->dlen -= copy_amt;
 		q->bytes_read += copy_amt;
 	}
 	/* Try to extract the remainder from the extra data */
 	len -= copy_amt;
 	for (int i = 0; (i < from->nr_extra_bufs) && len; i++) {
 		ebd = &from->extra_data[i];
 		if (!ebd->base || !ebd->len)
 			continue;
 		if (len >= ebd->len) {
 			amt = move_ebd(ebd, to, from, q);
 			if (!amt) {
 				/* our internal alloc could have failed.   this
 				 * ebd is now the last one we'll consider.
 				 * let's handle it separately and put it in the
 				 * main body. */
 				if (copy_amt)
 					return copy_amt;
 				copy_amt = block_copy_to_body(to,
 							      (void*)ebd->base +
 							      ebd->off,
 				                              ebd->len);
 				block_and_q_lost_extra(from, q, copy_amt);
 				break;
 			}
 			len -= amt;
 			copy_amt += amt;
 			continue;
 		} else {
 			/* If we're here, we reached our final ebd, which we'll
 			 * need to split to get anything from it. */
 			if (copy_amt)
 				return copy_amt;
 			copy_amt = block_copy_to_body(to, (void*)ebd->base +
 						      ebd->off, len);
 			ebd->off += copy_amt;
 			ebd->len -= copy_amt;
 			block_and_q_lost_extra(from, q, copy_amt);
 			break;
 		}
 	}
 	if (len)
 		assert(copy_amt);	/* sanity */
 	return copy_amt;
 }

 /* Return codes for __qbread and __try_qbread. */
 enum {
 	QBR_OK,
 	QBR_FAIL,
 	QBR_SPARE,	/* we need a spare block */
 	QBR_AGAIN,	/* do it again, we are coalescing blocks */
 };

 /* Helper and back-end for __qbread: extracts and returns a list of blocks
  * containing up to len bytes.  It may contain less than len even if q has more
  * data.
  *
  * Returns a code interpreted by __qbread, and the returned blist in ret. */
 static int __try_qbread(struct queue *q, size_t len, int qio_flags,
                         struct block **real_ret, struct block *spare)
 {
 	struct block *ret, *ret_last, *first;
 	size_t blen;
 	bool was_unwritable = FALSE;

 	if (qio_flags & QIO_CAN_ERR_SLEEP) {
 		if (!qwait_and_ilock(q, qio_flags)) {
 			spin_unlock_irqsave(&q->lock);
 			return QBR_FAIL;
 		}
 		/* we qwaited and still hold the lock, so the q is not empty */
 		first = q->bfirst;
 	} else {
 		spin_lock_irqsave(&q->lock);
 		first = q->bfirst;
 		if (!first) {
 			spin_unlock_irqsave(&q->lock);
 			return QBR_FAIL;
 		}
 	}
 	/* We need to check before adjusting q->len.  We're checking the
 	 * writer's sleep condition / tap condition.  When set, we *might* be
 	 * making an edge transition (from unwritable to writable), which needs
 	 * to wake and fire taps.  But, our read might not drain the queue below
 	 * q->lim.  We'll check again later to see if we should really wake
 	 * them.  */
 	was_unwritable = !qwritable(q);
 	blen = BLEN(first);
 	if ((q->state & Qcoalesce) && (blen == 0)) {
 		freeb(pop_first_block(q));
 		spin_unlock_irqsave(&q->lock);
 		/* Need to retry to make sure we have a first block */
 		return QBR_AGAIN;
 	}
 	/* Qmsg: just return the first block.  Be careful, since our caller
 	 * might not read all of the block and thus drop bytes.  Similar to
 	 * SOCK_DGRAM. */
 	if (q->state & Qmsg) {
 		ret = pop_first_block(q);
 		goto out_ok;
 	}
 	/* Let's get at least something first - makes the code easier.  This
 	 * way, we'll only ever split the block once. */
 	if (blen <= len) {
 		ret = pop_first_block(q);
 		len -= blen;
 	} else {
 		/* need to split the block.  we won't actually take the first
 		 * block out of the queue - we're just extracting a little bit.
 		 */
 		if (!spare) {
 			/* We have nothing and need a spare block.  Retry! */
 			spin_unlock_irqsave(&q->lock);
 			return QBR_SPARE;
 		}
 		copy_from_first_block(q, spare, len);
 		ret = spare;
 		goto out_ok;
 	}
 	/* At this point, we just grabbed the first block.  We can try to grab
 	 * some more, up to len (if they want). */
 	if (qio_flags & QIO_JUST_ONE_BLOCK)
 		goto out_ok;
 	ret_last = ret;
 	while (q->bfirst && (len > 0)) {
 		blen = BLEN(q->bfirst);
 		if ((q->state & Qcoalesce) && (blen == 0)) {
 			/* remove the intermediate 0 blocks */
 			freeb(pop_first_block(q));
 			continue;
 		}
 		if (blen > len) {
 			/* We could try to split the block, but that's a huge
 			 * pain.  For instance, we might need to move the main
 			 * body of b into an extra_data of ret_last.  lots of
 			 * ways for that to fail, and lots of cases to consider.
 			 * Easier to just bail out.  This is why I did the first
 			 * block above: we don't need to worry about this. */
 			 break;
 		}
 		ret_last->next = pop_first_block(q);
 		ret_last = ret_last->next;
 		len -= blen;
 	}
 out_ok:
 	/* Don't wake them up or fire tap if we didn't drain enough. */
 	if (!qwritable(q))
 		was_unwritable = FALSE;
 	spin_unlock_irqsave(&q->lock);
 	if (was_unwritable) {
 		if (q->kick && !(qio_flags & QIO_DONT_KICK))
 			q->kick(q->arg);
 		rendez_wakeup(&q->wr);
 		qwake_cb(q, FDTAP_FILT_WRITABLE);
 	}
 	*real_ret = ret;
 	return QBR_OK;
 }

 /* Helper and front-end for __try_qbread: extracts and returns a list of blocks
  * containing up to len bytes.  It may contain less than len even if q has more
  * data.
  *
  * Returns 0 if the q is closed, if it would require blocking and !CAN_BLOCK, or
  * if it required a spare and the memory allocation failed.
  *
  * Technically, there's a weird corner case with !Qcoalesce and Qmsg where you
  * could get a zero length block back. */
 static struct block *__qbread(struct queue *q, size_t len, int qio_flags,
                               int mem_flags)
 {
 	ERRSTACK(1);
 	struct block *ret = 0;
 	struct block *volatile spare = 0;	/* volatile for the waserror */

 	/* __try_qbread can throw, based on qio flags. */
 	if ((qio_flags & QIO_CAN_ERR_SLEEP) && waserror()) {
 		if (spare)
 			freeb(spare);
 		nexterror();
 	}
 	while (1) {
 		switch (__try_qbread(q, len, qio_flags, &ret, spare)) {
 		case QBR_OK:
 		case QBR_FAIL:
 			if (spare && (ret != spare))
 				freeb(spare);
 			goto out_ret;
 		case QBR_SPARE:
 			assert(!spare);
 			/* Due to some nastiness, we need a fresh block so we
 			 * can read out anything from the queue.  'len' seems
 			 * like a reasonable amount.  Maybe we can get away with
 			 * less. */
 			spare = block_alloc(len, mem_flags);
 			if (!spare) {
 				/* Careful here: a memory failure (possible with
 				 * MEM_ATOMIC) could look like 'no data in the
 				 * queue' (QBR_FAIL).  The only one who does is
 				 * this qget(), who happens to know that we
 				 * won't need a spare, due to the len argument.
 				 * Spares are only needed when we need to split
 				 * a block. */
 				ret = 0;
 				goto out_ret;
 			}
 			break;
 		case QBR_AGAIN:
 			/* if the first block is 0 and we are Qcoalesce, then
 			 * we'll need to try again.  We bounce out of __try so
 			 * we can perform the "is there a block" logic again
 			 * from the top. */
 			break;
 		}
 	}
 	assert(0);
 out_ret:
 	if (qio_flags & QIO_CAN_ERR_SLEEP)
 		poperror();
 	return ret;
 }

 /*
  *  get next block from a queue, return null if nothing there
  */
 struct block *qget(struct queue *q)
 {
 	/* since len == SIZE_MAX, we should never need to do a mem alloc */
 	return __qbread(q, SIZE_MAX, QIO_JUST_ONE_BLOCK, MEM_ATOMIC);
 }

 /* Throw away the next 'len' bytes in the queue returning the number actually
  * discarded.
  *
  * If the bytes are in the queue, then they must be discarded.  The only time to
  * return less than len is if the q itself has less than len bytes.
  *
  * This won't trigger a kick when waking up any sleepers.  This seems to be Plan
  * 9's intent, since the TCP stack will deadlock if qdiscard kicks. */
 size_t qdiscard(struct queue *q, size_t len)
 {
 	struct block *blist;
 	size_t removed_amt;
 	size_t sofar = 0;

 	/* This is racy.  There could be multiple qdiscarders or other
 	 * consumers, where the consumption could be interleaved. */
 	while (qlen(q) && len) {
 		blist = __qbread(q, len, QIO_DONT_KICK, MEM_WAIT);
 		removed_amt = freeblist(blist);
 		sofar += removed_amt;
 		len -= removed_amt;
 	}
 	return sofar;
 }

 ssize_t qpass(struct queue *q, struct block *b)
 {
 	return __qbwrite(q, b, QIO_LIMIT | QIO_DROP_OVERFLOW);
 }

 ssize_t qpassnolim(struct queue *q, struct block *b)
 {
 	return __qbwrite(q, b, 0);
 }

 /*
  *  if the allocated space is way out of line with the used
  *  space, reallocate to a smaller block
  */
 struct block *packblock(struct block *bp)
 {
 	struct block **l, *nbp;
 	int n;

 	if (bp->extra_len)
 		return bp;
 	for (l = &bp; *l; l = &(*l)->next) {
 		nbp = *l;
 		n = BLEN(nbp);
 		if ((n << 2) < BALLOC(nbp)) {
 			*l = block_alloc(n, MEM_WAIT);
 			memmove((*l)->wp, nbp->rp, n);
 			(*l)->wp += n;
 			(*l)->next = nbp->next;
 			nbp->next = NULL;
 			freeb(nbp);
 		}
 	}

 	return bp;
 }

 /* Add an extra_data entry to newb at newb_idx pointing to b's body, starting at
  * body_rp, for up to len.  Returns the len consumed.
  *
  * The base is 'b', so that we can kfree it later.  This currently ties us to
  * using kfree for the release method for all extra_data.
  *
  * It is possible to have a body size that is 0, if there is no offset, and
  * b->wp == b->rp.  This will have an extra data entry of 0 length. */
 static size_t point_to_body(struct block *b, uint8_t *body_rp,
                             struct block *newb, unsigned int newb_idx,
                             size_t len)
 {
 	struct extra_bdata *ebd = &newb->extra_data[newb_idx];

 	assert(newb_idx < newb->nr_extra_bufs);

 	kmalloc_incref(b);
 	ebd->base = (uintptr_t)b;
 	ebd->off = (uint32_t)(body_rp - (uint8_t*)b);
 	ebd->len = MIN(b->wp - body_rp, len);	/* think of body_rp as b->rp */
 	assert((int)ebd->len >= 0);
 	newb->extra_len += ebd->len;
 	return ebd->len;
 }

 /* Add an extra_data entry to newb at newb_idx pointing to b's b_idx'th
  * extra_data buf, at b_off within that buffer, for up to len.  Returns the len
  * consumed.
  *
  * We can have blocks with 0 length, but they are still refcnt'd.  See above. */
 static size_t point_to_buf(struct block *b, unsigned int b_idx, uint32_t b_off,
                            struct block *newb, unsigned int newb_idx,
                            size_t len)
 {
 	struct extra_bdata *n_ebd = &newb->extra_data[newb_idx];
 	struct extra_bdata *b_ebd = &b->extra_data[b_idx];

 	assert(b_idx < b->nr_extra_bufs);
 	assert(newb_idx < newb->nr_extra_bufs);

 	kmalloc_incref((void*)b_ebd->base);
 	n_ebd->base = b_ebd->base;
 	n_ebd->off = b_ebd->off + b_off;
 	n_ebd->len = MIN(b_ebd->len - b_off, len);
 	newb->extra_len += n_ebd->len;
 	return n_ebd->len;
 }

 /* given a string of blocks, sets up the new block's extra_data such that it
  * *points* to the contents of the blist [offset, len + offset).  This does not
  * make a separate copy of the contents of the blist.
  *
  * returns 0 on success.  the only failure is if the extra_data array was too
  * small, so this returns a positive integer saying how big the extra_data needs
  * to be.
  *
  * callers are responsible for protecting the list structure. */
 static int __blist_clone_to(struct block *blist, struct block *newb, int len,
                             uint32_t offset)
 {
 	struct block *b, *first;
 	unsigned int nr_bufs = 0;
 	unsigned int b_idx, newb_idx = 0;
 	uint8_t *first_main_body = 0;
 	ssize_t sofar = 0;

 	/* find the first block; keep offset relative to the latest b in the
 	 * list */
 	for (b = blist; b; b = b->next) {
 		if (BLEN(b) > offset)
 			break;
 		offset -= BLEN(b);
 	}
 	/* qcopy semantics: if you asked for an offset outside the block list,
 	 * you get an empty block back */
 	if (!b)
 		return 0;
 	first = b;
 	sofar -= offset; /* don't count the remaining offset in the first b */
 	/* upper bound for how many buffers we'll need in newb */
 	for (/* b is set*/; b; b = b->next) {
 		nr_bufs += BHLEN(b) ? 1 : 0;
 		/* still assuming nr_extra == nr_valid */
 		nr_bufs += b->nr_extra_bufs;
 		sofar += BLEN(b);
 		if (sofar > len)
 			break;
 	}
 	/* we might be holding a spinlock here, so we won't wait for kmalloc */
 	if (block_add_extd(newb, nr_bufs, 0) != 0) {
 		/* caller will need to alloc these, then re-call us */
 		return nr_bufs;
 	}
 	for (b = first; b && len; b = b->next) {
 		b_idx = 0;
 		if (offset) {
 			if (offset < BHLEN(b)) {
 				/* off is in the main body */
 				len -= point_to_body(b, b->rp + offset, newb,
 						     newb_idx, len);
 				newb_idx++;
 			} else {
 				/* off is in one of the buffers (or just past
 				 * the last one).  we're not going to point to
 				 * b's main body at all. */
 				offset -= BHLEN(b);
 				assert(b->extra_data);
 				/* assuming these extrabufs are packed, or at
 				 * least that len isn't gibberish */
 				while (b->extra_data[b_idx].len <= offset) {
 					offset -= b->extra_data[b_idx].len;
 					b_idx++;
 				}
 				/* now offset is set to our offset in the
 				 * b_idx'th buf */
 				len -= point_to_buf(b, b_idx, offset, newb,
 						    newb_idx, len);
 				newb_idx++;
 				b_idx++;
 			}
 			offset = 0;
 		} else {
 			if (BHLEN(b)) {
 				len -= point_to_body(b, b->rp, newb, newb_idx,
 						     len);
 				newb_idx++;
 			}
 		}
 		/* knock out all remaining bufs.  we only did one point_to_ op
 		 * by now, and any point_to_ could be our last if it consumed
 		 * all of len. */
 		for (int i = b_idx; (i < b->nr_extra_bufs) && len; i++) {
 			len -= point_to_buf(b, i, 0, newb, newb_idx, len);
 			newb_idx++;
 		}
 	}
 	return 0;
 }

 struct block *blist_clone(struct block *blist, int header_len, int len,
                           uint32_t offset)
 {
 	int ret;
 	struct block *newb = block_alloc(header_len, MEM_WAIT);

 	do {
 		ret = __blist_clone_to(blist, newb, len, offset);
 		if (ret)
 			block_add_extd(newb, ret, MEM_WAIT);
 	} while (ret);
 	return newb;
 }

 /* given a queue, makes a single block with header_len reserved space in the
  * block main body, and the contents of [offset, len + offset) pointed to in the
  * new blocks ext_data.  This does not make a copy of the q's contents, though
  * you do have a ref count on the memory. */
 struct block *qclone(struct queue *q, int header_len, int len, uint32_t offset)
 {
 	int ret;
 	struct block *newb = block_alloc(header_len, MEM_WAIT);
 	/* the while loop should rarely be used: it would require someone
 	 * concurrently adding to the queue. */
 	do {
 		/* TODO: RCU protect the q list (b->next) (need read lock) */
 		spin_lock_irqsave(&q->lock);
 		ret = __blist_clone_to(q->bfirst, newb, len, offset);
 		spin_unlock_irqsave(&q->lock);
 		if (ret)
 			block_add_extd(newb, ret, MEM_WAIT);
 	} while (ret);
 	return newb;
 }

 struct block *qcopy(struct queue *q, int len, uint32_t offset)
 {
 	return qclone(q, 0, len, offset);
 }

 static void qinit_common(struct queue *q)
 {
 	spinlock_init_irqsave(&q->lock);
 	rendez_init(&q->rr);
 	rendez_init(&q->wr);
 }

 /*
  *  called by non-interrupt code
  */
 struct queue *qopen(int limit, int msg, void (*kick) (void *), void *arg)
 {
 	struct queue *q;

 	q = kzmalloc(sizeof(struct queue), 0);
 	if (q == 0)
 		return 0;
 	qinit_common(q);

 	q->limit = q->inilim = limit;
 	q->kick = kick;
 	q->arg = arg;
 	q->state = msg;
 	q->eof = 0;

 	return q;
 }

 /* open a queue to be bypassed */
 struct queue *qbypass(void (*bypass) (void *, struct block *), void *arg)
 {
 	struct queue *q;

 	q = kzmalloc(sizeof(struct queue), 0);
 	if (q == 0)
 		return 0;
 	qinit_common(q);

 	q->limit = 0;
 	q->arg = arg;
 	q->bypass = bypass;
 	q->state = 0;

 	return q;
 }

 static int notempty(void *a)
 {
 	struct queue *q = a;

 	return (q->state & Qclosed) || q->bfirst != 0;
 }

 /* Block, waiting for the queue to be non-empty or closed.  Returns with
  * the spinlock held.  Returns TRUE when there queue is not empty, FALSE if it
  * was naturally closed.  Throws an error o/w. */
 static bool qwait_and_ilock(struct queue *q, int qio_flags)
 {
 	while (1) {
 		spin_lock_irqsave(&q->lock);
 		if (q->bfirst != NULL)
 			return TRUE;
 		if (q->state & Qclosed) {
 			if (++q->eof > 3) {
 				spin_unlock_irqsave(&q->lock);
 				error(EPIPE,
 				      "multiple reads on a closed queue");
 			}
 			if (q->err[0]) {
 				spin_unlock_irqsave(&q->lock);
 				error(EPIPE, q->err);
 			}
 			return FALSE;
 		}
 		if (qio_flags & QIO_NON_BLOCK) {
 			spin_unlock_irqsave(&q->lock);
 			error(EAGAIN, "queue empty");
 		}
 		spin_unlock_irqsave(&q->lock);
 		/* As with the producer side, we check for a condition while
 		 * holding the q->lock, decide to sleep, then unlock.  It's like
 		 * the "check, signal, check again" pattern, but we do it
 		 * conditionally.  Both sides agree synchronously to do it, and
 		 * those decisions are made while holding q->lock.  I think this
 		 * is OK.
 		 *
 		 * The invariant is that no reader sleeps when the queue has
 		 * data.  While holding the rendez lock, if we see there's no
 		 * data, we'll sleep.  Since we saw there was no data, the next
 		 * writer will see (or already saw) no data, and then the writer
 		 * decides to rendez_wake, which will grab the rendez lock.  If
 		 * the writer already did that, then we'll see notempty when we
 		 * do our check-again. */
 		rendez_sleep(&q->rr, notempty, q);
 	}
 }

 /*
  * add a block list to a queue
  * XXX basically the same as enqueue blist, and has no locking!
  */
 void qaddlist(struct queue *q, struct block *b)
 {
 	/* TODO: q lock? */
 	/* queue the block */
 	if (q->bfirst)
 		q->blast->next = b;
 	else
 		q->bfirst = b;
 	q->dlen += blocklen(b);
 	while (b->next)
 		b = b->next;
 	q->blast = b;
 }

 static size_t read_from_block(struct block *b, uint8_t *to, size_t amt)
 {
 	size_t copy_amt, retval = 0;
 	struct extra_bdata *ebd;

 	copy_amt = MIN(BHLEN(b), amt);
 	memcpy(to, b->rp, copy_amt);
 	/* advance the rp, since this block might not be completely consumed and
 	 * future reads need to know where to pick up from */
 	b->rp += copy_amt;
 	to += copy_amt;
 	amt -= copy_amt;
 	retval += copy_amt;
 	for (int i = 0; (i < b->nr_extra_bufs) && amt; i++) {
 		ebd = &b->extra_data[i];
 		/* skip empty entires.  if we track this in the struct block, we
 		 * can just start the for loop early */
 		if (!ebd->base || !ebd->len)
 			continue;
 		copy_amt = MIN(ebd->len, amt);
 		memcpy(to, (void*)(ebd->base + ebd->off), copy_amt);
 		/* we're actually consuming the entries, just like how we
 		 * advance rp up above, and might only consume part of one. */
 		block_consume_ebd_bytes(b, ebd, copy_amt);
 		to += copy_amt;
 		amt -= copy_amt;
 		retval += copy_amt;
 	}
 	return retval;
 }

 /*
  *  copy the contents of a string of blocks into
  *  memory.  emptied blocks are freed.  return
  *  pointer to first unconsumed block.
  */
 struct block *bl2mem(uint8_t * p, struct block *b, int n)
 {
 	int i;
 	struct block *next;

 	/* could be slicker here, since read_from_block is smart */
 	for (; b != NULL; b = next) {
 		i = BLEN(b);
 		if (i > n) {
 			/* partial block, consume some */
 			read_from_block(b, p, n);
 			return b;
 		}
 		/* full block, consume all and move on */
 		i = read_from_block(b, p, i);
 		n -= i;
 		p += i;
 		next = b->next;
 		b->next = NULL;
 		freeb(b);
 	}
 	return NULL;
 }

 /* Extract the contents of all blocks and copy to va, up to len.  Returns the
  * actual amount copied. */
 static size_t read_all_blocks(struct block *b, void *va, size_t len)
 {
 	size_t sofar = 0;
 	struct block *next;

 	do {
 		assert(va);
 		assert(b->rp);
 		sofar += read_from_block(b, va + sofar, len - sofar);
 		if (BLEN(b) && b->next)
 			panic("Failed to drain entire block (Qmsg?) but had a next!");
 		next = b->next;
 		b->next = NULL;
 		freeb(b);
 		b = next;
 	} while (b);
 	return sofar;
 }

 /*
  *  copy the contents of memory into a string of blocks.
  *  return NULL on error.
  */
 struct block *mem2bl(uint8_t * p, int len)
 {
 	ERRSTACK(1);
 	int n;
 	struct block *b, *first, **l;

 	first = NULL;
 	l = &first;
 	if (waserror()) {
 		freeblist(first);
 		nexterror();
 	}
 	do {
 		n = len;
 		if (n > Maxatomic)
 			n = Maxatomic;

 		*l = b = block_alloc(n, MEM_WAIT);
 		/* TODO consider extra_data */
 		memmove(b->wp, p, n);
 		b->wp += n;
 		p += n;
 		len -= n;
 		l = &b->next;
 	} while (len > 0);
 	poperror();

 	return first;
 }

 /*
  *  put a block back to the front of the queue
  *  called with q ilocked
  */
 void qputback(struct queue *q, struct block *b)
 {
 	b->next = q->bfirst;
 	if (q->bfirst == NULL)
 		q->blast = b;
 	q->bfirst = b;
 	q->dlen += BLEN(b);
 	/* qputback seems to undo a read, so we can undo the accounting too. */
 	q->bytes_read -= BLEN(b);
 }

 /*
  *  get next block from a queue (up to a limit)
  *
  */
 struct block *qbread(struct queue *q, size_t len)
 {
 	return __qbread(q, len, QIO_JUST_ONE_BLOCK | QIO_CAN_ERR_SLEEP,
 			MEM_WAIT);
 }

 struct block *qbread_nonblock(struct queue *q, size_t len)
 {
 	return __qbread(q, len, QIO_JUST_ONE_BLOCK | QIO_CAN_ERR_SLEEP |
 	                QIO_NON_BLOCK, MEM_WAIT);
 }

 /* read up to len from a queue into vp. */
 size_t qread(struct queue *q, void *va, size_t len)
 {
 	struct block *blist = __qbread(q, len, QIO_CAN_ERR_SLEEP, MEM_WAIT);

 	if (!blist)
 		return 0;
 	return read_all_blocks(blist, va, len);
 }

 size_t qread_nonblock(struct queue *q, void *va, size_t len)
 {
 	struct block *blist = __qbread(q, len, QIO_CAN_ERR_SLEEP |
 				       QIO_NON_BLOCK, MEM_WAIT);

 	if (!blist)
 		return 0;
 	return read_all_blocks(blist, va, len);
 }

 /* This is the rendez wake condition for writers. */
 static int qwriter_should_wake(void *a)
 {
 	struct queue *q = a;

 	return qwritable(q) || (q->state & Qclosed);
 }

 /* Helper: enqueues a list of blocks to a queue.  Returns the total length. */
 static size_t enqueue_blist(struct queue *q, struct block *b)
 {
 	size_t dlen;

 	if (q->bfirst)
 		q->blast->next = b;
 	else
 		q->bfirst = b;
 	dlen = BLEN(b);
 	while (b->next) {
 		b = b->next;
 		dlen += BLEN(b);
 	}
 	q->blast = b;
 	q->dlen += dlen;
 	return dlen;
 }

 /* Adds block (which can be a list of blocks) to the queue, subject to
  * qio_flags.  Returns the length written on success or -1 on non-throwable
  * error.  Adjust qio_flags to control the value-added features!. */
 static ssize_t __qbwrite(struct queue *q, struct block *b, int qio_flags)
 {
 	ssize_t ret;
 	bool was_unreadable;

 	if (q->bypass) {
 		ret = blocklen(b);
 		(*q->bypass) (q->arg, b);
 		return ret;
 	}
 	spin_lock_irqsave(&q->lock);
 	was_unreadable = q->dlen == 0;
 	if (q->state & Qclosed) {
 		spin_unlock_irqsave(&q->lock);
 		freeblist(b);
 		if (!(qio_flags & QIO_CAN_ERR_SLEEP))
 			return -1;
 		if (q->err[0])
 			error(EPIPE, q->err);
 		else
 			error(EPIPE, "connection closed");
 	}
 	if ((qio_flags & QIO_LIMIT) && (q->dlen >= q->limit)) {
 		/* drop overflow takes priority over regular non-blocking */
 		if ((qio_flags & QIO_DROP_OVERFLOW)
 		    || (q->state & Qdropoverflow)) {
 			spin_unlock_irqsave(&q->lock);
 			freeb(b);
 			return -1;
 		}
 		/* People shouldn't set NON_BLOCK without CAN_ERR, but we can be
 		 * nice and catch it. */
 		if ((qio_flags & QIO_CAN_ERR_SLEEP)
 		    && (qio_flags & QIO_NON_BLOCK)) {
 			spin_unlock_irqsave(&q->lock);
 			freeb(b);
 			error(EAGAIN, "queue full");
 		}
 	}
 	ret = enqueue_blist(q, b);
 	QDEBUG checkb(b, "__qbwrite");
 	spin_unlock_irqsave(&q->lock);
 	/* TODO: not sure if the usage of a kick is mutually exclusive with a
 	 * wakeup, meaning that actual users either want a kick or have
 	 * qreaders. */
 	if (q->kick && (was_unreadable || (q->state & Qkick)))
 		q->kick(q->arg);
 	if (was_unreadable) {
 		/* Unlike the read side, there's no double-check to make sure
 		 * the queue transitioned across an edge.  We know we added
 		 * something, so that's enough.  We wake if the queue was empty.
 		 * Both sides are the same, in that the condition for which we
 		 * do the rendez_wakeup() is the same as the condition done for
 		 * the rendez_sleep(). */
 		rendez_wakeup(&q->rr);
 		qwake_cb(q, FDTAP_FILT_READABLE);
 	}
 	/*
 	 *  flow control, wait for queue to get below the limit
 	 *  before allowing the process to continue and queue
 	 *  more.  We do this here so that postnote can only
 	 *  interrupt us after the data has been queued.  This
 	 *  means that things like 9p flushes and ssl messages
 	 *  will not be disrupted by software interrupts.
 	 *
 	 *  Note - this is moderately dangerous since a process
 	 *  that keeps getting interrupted and rewriting will
 	 *  queue infinite crud.
 	 */
 	if ((qio_flags & QIO_CAN_ERR_SLEEP) &&
 	    !(q->state & Qdropoverflow) && !(qio_flags & QIO_NON_BLOCK)) {
 		/* This is a racy peek at the q status.  If we accidentally
 		 * block, our rendez will return.  The rendez's peak
 		 * (qwriter_should_wake) is also racy w.r.t.  the q's spinlock
 		 * (that lock protects writes, but not reads).
 		 *
 		 * Here's the deal: when holding the rendez lock, if we see the
 		 * sleep condition, the consumer will wake us.  The condition
 		 * will only ever be changed by the next qbread() (consumer,
 		 * changes q->dlen).  That code will do a rendez wake, which
 		 * will spin on the rendez lock, meaning it won't procede until
 		 * we either see the new state (and return) or put ourselves on
 		 * the rendez, and wake up.
 		 *
 		 * The pattern is one side writes mem, then signals.  Our side
 		 * checks the signal, then reads the mem.  The goal is to not
 		 * miss seeing the signal AND missing the memory write.  In this
 		 * specific case, the signal is actually synchronous (the rendez
 		 * lock) and not basic shared memory.
 		 *
 		 * Oh, and we spin in case we woke early and someone else filled
 		 * the queue, mesa-style. */
 		while (!qwriter_should_wake(q))
 			rendez_sleep(&q->wr, qwriter_should_wake, q);
 	}
 	return ret;
 }

 /*
  *  add a block to a queue obeying flow control
  */
 ssize_t qbwrite(struct queue *q, struct block *b)
 {
 	return __qbwrite(q, b, QIO_CAN_ERR_SLEEP | QIO_LIMIT);
 }

 ssize_t qbwrite_nonblock(struct queue *q, struct block *b)
 {
 	return __qbwrite(q, b, QIO_CAN_ERR_SLEEP | QIO_LIMIT | QIO_NON_BLOCK);
 }

 ssize_t qibwrite(struct queue *q, struct block *b)
 {
 	return __qbwrite(q, b, 0);
 }

 /* Helper, allocs a block and copies [from, from + len) into it.  Returns the
  * block on success, 0 on failure. */
 static struct block *build_block(void *from, size_t len, int mem_flags)
 {
 	struct block *b;
 	void *ext_buf;

 	/* If len is small, we don't need to bother with the extra_data.  But
 	 * until the whole stack can handle extd blocks, we'll use them
 	 * unconditionally.  */

 	/* allocb builds in 128 bytes of header space to all blocks, but this is
 	 * only available via padblock (to the left).  we also need some space
 	 * for pullupblock for some basic headers (like icmp) that get written
 	 * in directly */
 	b = block_alloc(64, mem_flags);
 	if (!b)
 		return 0;
 	ext_buf = kmalloc(len, mem_flags);
 	if (!ext_buf) {
 		kfree(b);
 		return 0;
 	}
 	memcpy(ext_buf, from, len);
 	if (block_add_extd(b, 1, mem_flags)) {
 		kfree(ext_buf);
 		kfree(b);
 		return 0;
 	}
 	b->extra_data[0].base = (uintptr_t)ext_buf;
 	b->extra_data[0].off = 0;
 	b->extra_data[0].len = len;
 	b->extra_len += len;
 	return b;
 }

 static ssize_t __qwrite(struct queue *q, void *vp, size_t len, int mem_flags,
                         int qio_flags)
 {
 	ERRSTACK(1);
 	size_t n;
 	volatile size_t sofar = 0;	/* volatile for the waserror */
 	struct block *b;
 	uint8_t *p = vp;
 	void *ext_buf;

 	/* Only some callers can throw.  Others might be in a context where
 	 * waserror isn't safe. */
 	if ((qio_flags & QIO_CAN_ERR_SLEEP) && waserror()) {
 		/* Any error (EAGAIN for nonblock, syscall aborted, even EPIPE)
 		 * after some data has been sent should be treated as a partial
 		 * write. */
 		if (sofar)
 			goto out_ok;
 		nexterror();
 	}
 	do {
 		n = len - sofar;
 		/* This is 64K, the max amount per single block.  Still a good
 		 * value? */
 		if (n > Maxatomic)
 			n = Maxatomic;
 		b = build_block(p + sofar, n, mem_flags);
 		if (!b)
 			break;
 		if (__qbwrite(q, b, qio_flags) < 0)
 			break;
 		sofar += n;
 	} while ((sofar < len) && (q->state & Qmsg) == 0);
 out_ok:
 	if (qio_flags & QIO_CAN_ERR_SLEEP)
 		poperror();
 	return sofar;
 }

 ssize_t qwrite(struct queue *q, void *vp, int len)
 {
 	return __qwrite(q, vp, len, MEM_WAIT, QIO_CAN_ERR_SLEEP | QIO_LIMIT);
 }

 ssize_t qwrite_nonblock(struct queue *q, void *vp, int len)
 {
 	return __qwrite(q, vp, len, MEM_WAIT, QIO_CAN_ERR_SLEEP | QIO_LIMIT |
 	                                      QIO_NON_BLOCK);
 }

 ssize_t qiwrite(struct queue *q, void *vp, int len)
 {
 	return __qwrite(q, vp, len, MEM_ATOMIC, 0);
 }

 /*
  *  be extremely careful when calling this,
  *  as there is no reference accounting
  */
 void qfree(struct queue *q)
 {
 	qclose(q);
 	kfree(q);
 }

 /*
  *  Mark a queue as closed.  No further IO is permitted.
  *  All blocks are released.
  */
 void qclose(struct queue *q)
 {
 	struct block *bfirst;

 	if (q == NULL)
 		return;

 	/* mark it */
 	spin_lock_irqsave(&q->lock);
 	q->state |= Qclosed;
 	q->state &= ~Qdropoverflow;
 	q->err[0] = 0;
 	bfirst = q->bfirst;
 	q->bfirst = 0;
 	q->dlen = 0;
 	spin_unlock_irqsave(&q->lock);

 	/* free queued blocks */
 	freeblist(bfirst);

 	/* wake up readers/writers */
 	rendez_wakeup(&q->rr);
 	rendez_wakeup(&q->wr);
 	qwake_cb(q, FDTAP_FILT_HANGUP);
 }

 /* Mark a queue as closed.  Wakeup any readers.  Don't remove queued blocks.
  *
  * msg will be the errstr received by any waiters (qread, qbread, etc).  If
  * there is no message, which is what also happens during a natural qclose(),
  * those waiters will simply return 0.  qwriters will always error() on a
  * closed/hungup queue. */
 void qhangup(struct queue *q, char *msg)
 {
 	/* mark it */
 	spin_lock_irqsave(&q->lock);
 	q->state |= Qclosed;
 	if (msg == 0 || *msg == 0)
 		q->err[0] = 0;
 	else
 		strlcpy(q->err, msg, ERRMAX);
 	spin_unlock_irqsave(&q->lock);

 	/* wake up readers/writers */
 	rendez_wakeup(&q->rr);
 	rendez_wakeup(&q->wr);
 	qwake_cb(q, FDTAP_FILT_HANGUP);
 }

 /*
  *  return non-zero if the q is hungup
  */
 int qisclosed(struct queue *q)
 {
 	return q->state & Qclosed;
 }

 /*
  *  mark a queue as no longer hung up.  resets the wake_cb.
  */
 void qreopen(struct queue *q)
 {
 	spin_lock_irqsave(&q->lock);
 	q->state &= ~Qclosed;
 	q->eof = 0;
 	q->limit = q->inilim;
 	q->wake_cb = 0;
 	q->wake_data = 0;
 	spin_unlock_irqsave(&q->lock);
 }

 /*
  *  return bytes queued
  */
 int qlen(struct queue *q)
 {
 	return q->dlen;
 }

 size_t q_bytes_read(struct queue *q)
 {
 	return q->bytes_read;
 }

 /*
  * return space remaining before flow control
  *
  *  This used to be
  *  q->len < q->limit/2
  *  but it slows down tcp too much for certain write sizes.
  *  I really don't understand it completely.  It may be
  *  due to the queue draining so fast that the transmission
  *  stalls waiting for the app to produce more data.  - presotto
  *
  *  q->len was the amount of bytes, which is no longer used.  we now use
  *  q->dlen, the amount of usable data.  a.k.a. qlen()...  - brho
  */
 int qwindow(struct queue *q)
 {
 	int l;

 	l = q->limit - q->dlen;
 	if (l < 0)
 		l = 0;
 	return l;
 }

 /*
  *  return true if we can read without blocking
  */
 int qcanread(struct queue *q)
 {
 	return q->bfirst != 0;
 }

 /*
  *  change queue limit
  */
 void qsetlimit(struct queue *q, size_t limit)
 {
 	bool was_writable = qwritable(q);

 	q->limit = limit;
 	if (!was_writable && qwritable(q)) {
 		rendez_wakeup(&q->wr);
 		qwake_cb(q, FDTAP_FILT_WRITABLE);
 	}
 }

 size_t qgetlimit(struct queue *q)
 {
 	return q->limit;
 }

 /*
  *  set whether writes drop overflowing blocks, or if we sleep
  */
 void qdropoverflow(struct queue *q, bool onoff)
 {
 	spin_lock_irqsave(&q->lock);
 	if (onoff)
 		q->state |= Qdropoverflow;
 	else
 		q->state &= ~Qdropoverflow;
 	spin_unlock_irqsave(&q->lock);
 }

 /* Be careful: this can affect concurrent reads/writes and code that might have
  * built-in expectations of the q's type. */
 void q_toggle_qmsg(struct queue *q, bool onoff)
 {
 	spin_lock_irqsave(&q->lock);
 	if (onoff)
 		q->state |= Qmsg;
 	else
 		q->state &= ~Qmsg;
 	spin_unlock_irqsave(&q->lock);
 }

 /* Be careful: this can affect concurrent reads/writes and code that might have
  * built-in expectations of the q's type. */
 void q_toggle_qcoalesce(struct queue *q, bool onoff)
 {
 	spin_lock_irqsave(&q->lock);
 	if (onoff)
 		q->state |= Qcoalesce;
 	else
 		q->state &= ~Qcoalesce;
 	spin_unlock_irqsave(&q->lock);
 }

 /*
  *  flush the output queue
  */
 void qflush(struct queue *q)
 {
 	struct block *bfirst;

 	/* mark it */
 	spin_lock_irqsave(&q->lock);
 	bfirst = q->bfirst;
 	q->bfirst = 0;
 	q->dlen = 0;
 	spin_unlock_irqsave(&q->lock);

 	/* free queued blocks */
 	freeblist(bfirst);

 	/* wake up writers */
 	rendez_wakeup(&q->wr);
 	qwake_cb(q, FDTAP_FILT_WRITABLE);
 }

 int qfull(struct queue *q)
 {
 	return !qwritable(q);
 }

 int qstate(struct queue *q)
 {
 	return q->state;
 }

 void qdump(struct queue *q)
 {
 	if (q)
 		printk("q=%p bfirst=%p blast=%p dlen=%d limit=%d state=#%x\n",
 			   q, q->bfirst, q->blast, q->dlen, q->limit, q->state);
 }

 /* On certain wakeup events, qio will call func(q, data, filter), where filter
  * marks the type of wakeup event (flags from FDTAP).
  *
  * There's no sync protection.  If you change the CB while the qio is running,
  * you might get a CB with the data or func from a previous set_wake_cb.  You
  * should set this once per queue and forget it.
  *
  * You can remove the CB by passing in 0 for the func.  Alternatively, you can
  * just make sure that the func(data) pair are valid until the queue is freed or
  * reopened. */
 void qio_set_wake_cb(struct queue *q, qio_wake_cb_t func, void *data)
 {
 	q->wake_data = data;
 	wmb();	/* if we see func, we'll also see the data for it */
 	q->wake_cb = func;
 }

 /* Helper for detecting whether we'll block on a read at this instant. */
 bool qreadable(struct queue *q)
 {
 	return qlen(q) > 0;
 }

 /* Helper for detecting whether we'll block on a write at this instant. */
 bool qwritable(struct queue *q)
 {
 	return !q->limit || qwindow(q) > 0;
 }