| /* Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved. |
| * Portions Copyright © 1997-1999 Vita Nuova Limited |
| * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited |
| * (www.vitanuova.com) |
| * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others |
| * |
| * Modified for the Akaros operating system: |
| * Copyright (c) 2013-2014 The Regents of the University of California |
| * Copyright (c) 2013-2015 Google Inc. |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy |
| * of this software and associated documentation files (the "Software"), to deal |
| * in the Software without restriction, including without limitation the rights |
| * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| * copies of the Software, and to permit persons to whom the Software is |
| * furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be included in |
| * all copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. */ |
| |
| #include <slab.h> |
| #include <kmalloc.h> |
| #include <kref.h> |
| #include <string.h> |
| #include <stdio.h> |
| #include <assert.h> |
| #include <error.h> |
| #include <cpio.h> |
| #include <pmap.h> |
| #include <smp.h> |
| #include <net/ip.h> |
| |
| #define PANIC_EXTRA(b) \ |
| { \ |
| if ((b)->extra_len) { \ |
| printblock(b); \ |
| backtrace(); \ |
| panic("%s doesn't handle extra_data", __FUNCTION__); \ |
| } \ |
| } |
| |
| static uint32_t padblockcnt; |
| static uint32_t concatblockcnt; |
| static uint32_t pullupblockcnt; |
| static uint32_t copyblockcnt; |
| static uint32_t consumecnt; |
| static uint32_t producecnt; |
| static uint32_t qcopycnt; |
| |
| static int debugging; |
| |
| #define QDEBUG if(0) |
| |
| /* |
| * IO queues |
| */ |
| |
| struct queue { |
| spinlock_t lock; |
| |
| struct block *bfirst; /* buffer */ |
| struct block *blast; |
| |
| int dlen; /* data bytes in queue */ |
| int limit; /* max bytes in queue */ |
| int inilim; /* initial limit */ |
| int state; |
| int eof; /* number of eofs read by user */ |
| size_t bytes_read; |
| |
| void (*kick) (void *); /* restart output */ |
| void (*bypass) (void *, struct block *); /* bypass queue altogether */ |
| void *arg; /* argument to kick */ |
| |
| struct rendez rr; /* process waiting to read */ |
| struct rendez wr; /* process waiting to write */ |
| qio_wake_cb_t wake_cb; /* callbacks for qio wakeups */ |
| void *wake_data; |
| |
| char err[ERRMAX]; |
| }; |
| |
| enum { |
| Maxatomic = 64 * 1024, |
| QIO_CAN_ERR_SLEEP = (1 << 0), /* can throw errors or block/sleep */ |
| QIO_LIMIT = (1 << 1), /* respect q->limit */ |
| QIO_DROP_OVERFLOW = (1 << 2), /* alternative to qdropoverflow */ |
| QIO_JUST_ONE_BLOCK = (1 << 3), /* when qbreading, just get one block */ |
| QIO_NON_BLOCK = (1 << 4), /* throw EAGAIN instead of blocking */ |
| QIO_DONT_KICK = (1 << 5), /* don't kick when waking */ |
| }; |
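| |
| /* For reference, the public entry points later in this file are thin wrappers |
| * that pick a flag set for __qbwrite()/__qbread(). A few of the pairings, as |
| * they appear below: |
| * |
| * qbwrite(q, b) -> __qbwrite(q, b, QIO_CAN_ERR_SLEEP | QIO_LIMIT) |
| * qpass(q, b) -> __qbwrite(q, b, QIO_LIMIT | QIO_DROP_OVERFLOW) |
| * qibwrite(q, b) -> __qbwrite(q, b, 0) |
| * qbread(q, len) -> __qbread(q, len, QIO_JUST_ONE_BLOCK | QIO_CAN_ERR_SLEEP, MEM_WAIT) |
| */ |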
| |
| unsigned int qiomaxatomic = Maxatomic; |
| |
| static size_t copy_to_block_body(struct block *to, void *from, size_t copy_amt); |
| static ssize_t __qbwrite(struct queue *q, struct block *b, int flags); |
| static struct block *__qbread(struct queue *q, size_t len, int qio_flags, |
| int mem_flags); |
| static bool qwait_and_ilock(struct queue *q, int qio_flags); |
| |
| /* Helper: fires a wake callback, sending 'filter' */ |
| static void qwake_cb(struct queue *q, int filter) |
| { |
| if (q->wake_cb) |
| q->wake_cb(q, q->wake_data, filter); |
| } |
| |
| void ixsummary(void) |
| { |
| debugging ^= 1; |
| printd("pad %u, concat %u, pullup %u, copy %u\n", |
| padblockcnt, concatblockcnt, pullupblockcnt, copyblockcnt); |
| printd("consume %u, produce %u, qcopy %u\n", |
| consumecnt, producecnt, qcopycnt); |
| } |
| |
| /* |
| * pad a block to the front (or the back if size is negative) |
| */ |
| struct block *padblock(struct block *bp, int size) |
| { |
| int n; |
| struct block *nbp; |
| |
| QDEBUG checkb(bp, "padblock 1"); |
| if (size >= 0) { |
| if (bp->rp - bp->base >= size) { |
| bp->network_offset += size; |
| bp->transport_offset += size; |
| bp->rp -= size; |
| return bp; |
| } |
| |
| PANIC_EXTRA(bp); |
| if (bp->next) |
| panic("padblock %p", getcallerpc(&bp)); |
| n = BLEN(bp); |
| padblockcnt++; |
| nbp = block_alloc(size + n, MEM_WAIT); |
| block_copy_metadata(nbp, bp); |
| nbp->rp += size; |
| nbp->wp = nbp->rp; |
| memmove(nbp->wp, bp->rp, n); |
| nbp->wp += n; |
| freeb(bp); |
| nbp->rp -= size; |
| } else { |
| size = -size; |
| |
| PANIC_EXTRA(bp); |
| |
| if (bp->next) |
| panic("padblock %p", getcallerpc(&bp)); |
| |
| if (bp->lim - bp->wp >= size) |
| return bp; |
| |
| n = BLEN(bp); |
| padblockcnt++; |
| nbp = block_alloc(size + n, MEM_WAIT); |
| block_copy_metadata(nbp, bp); |
| memmove(nbp->wp, bp->rp, n); |
| nbp->wp += n; |
| freeb(bp); |
| } |
| QDEBUG checkb(nbp, "padblock 1"); |
| return nbp; |
| } |
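| |
| /* Usage sketch (illustrative, not from the original code): a protocol layer |
| * that prepends a fixed-size header typically grows the block toward the |
| * front and then writes the header at the new rp. 'HDR_LEN' and 'hdr' are |
| * hypothetical: |
| * |
| * bp = padblock(bp, HDR_LEN); // rp moves back by HDR_LEN |
| * memmove(bp->rp, hdr, HDR_LEN); // header now precedes the payload |
| */ |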
| |
| /* |
| * return count of bytes in a string of blocks |
| */ |
| int blocklen(struct block *bp) |
| { |
| int len; |
| |
| len = 0; |
| while (bp) { |
| len += BLEN(bp); |
| bp = bp->next; |
| } |
| return len; |
| } |
| |
| /* |
| * return count of space in blocks |
| */ |
| int blockalloclen(struct block *bp) |
| { |
| int len; |
| |
| len = 0; |
| while (bp) { |
| len += BALLOC(bp); |
| bp = bp->next; |
| } |
| return len; |
| } |
| |
| /* |
| * copy the string of blocks into |
| * a single block and free the string |
| */ |
| struct block *concatblock(struct block *bp) |
| { |
| int len; |
| struct block *nb, *f; |
| |
| if (bp->next == 0) |
| return bp; |
| |
| /* probably use parts of qclone */ |
| PANIC_EXTRA(bp); |
| nb = block_alloc(blocklen(bp), MEM_WAIT); |
| for (f = bp; f; f = f->next) { |
| len = BLEN(f); |
| memmove(nb->wp, f->rp, len); |
| nb->wp += len; |
| } |
| concatblockcnt += BLEN(nb); |
| freeblist(bp); |
| QDEBUG checkb(nb, "concatblock 1"); |
| return nb; |
| } |
| |
| /* Makes an identical copy of the block, collapsing all the data into the block |
| * body. It does not point to the contents of the original, it is a copy |
| * (unlike qclone). Since we're copying, we might as well put the memory into |
| * one contiguous chunk. */ |
| struct block *copyblock(struct block *bp, int mem_flags) |
| { |
| struct block *newb; |
| struct extra_bdata *ebd; |
| size_t amt; |
| |
| QDEBUG checkb(bp, "copyblock 0"); |
| newb = block_alloc(BLEN(bp), mem_flags); |
| if (!newb) |
| return 0; |
| amt = copy_to_block_body(newb, bp->rp, BHLEN(bp)); |
| assert(amt == BHLEN(bp)); |
| for (int i = 0; i < bp->nr_extra_bufs; i++) { |
| ebd = &bp->extra_data[i]; |
| if (!ebd->base || !ebd->len) |
| continue; |
| amt = copy_to_block_body(newb, (void*)ebd->base + ebd->off, |
| ebd->len); |
| assert(amt == ebd->len); |
| } |
| block_copy_metadata(newb, bp); |
| copyblockcnt++; |
| QDEBUG checkb(newb, "copyblock 1"); |
| return newb; |
| } |
| |
| /* Returns a block with the remaining contents of b all in the main body of the |
| * returned block. Replace old references to b with the returned value (which |
| * may still be 'b', if no change was needed). */ |
| struct block *linearizeblock(struct block *b) |
| { |
| struct block *newb; |
| |
| if (!b->extra_len) |
| return b; |
| newb = copyblock(b, MEM_WAIT); |
| freeb(b); |
| return newb; |
| } |
| |
| /* Make sure the first block has at least n bytes in its main body. Pulls up |
| * data from the *list* of blocks. Returns 0 if there is not enough data in the |
| * block list. */ |
| struct block *pullupblock(struct block *bp, int n) |
| { |
| int i, len, seglen; |
| struct block *nbp; |
| struct extra_bdata *ebd; |
| |
| /* |
| * this should almost always be true, it's |
| * just to avoid every caller checking. |
| */ |
| if (BHLEN(bp) >= n) |
| return bp; |
| |
| /* If there's no chance, just bail out now. This might be slightly |
| * wasteful if there's a long blist that does have enough data. */ |
| if (n > blocklen(bp)) |
| return 0; |
| /* a start at explicit main-body / header management */ |
| if (bp->extra_len) { |
| if (n > bp->lim - bp->rp) { |
| /* would need to realloc a new block and copy everything |
| * over. */ |
| panic("can't pullup %d bytes, no place to put it: bp->lim %p, bp->rp %p, bp->lim-bp->rp %d\n", |
| n, bp->lim, bp->rp, bp->lim-bp->rp); |
| } |
| len = n - BHLEN(bp); |
| /* Would need to recursively call this, or otherwise pull from |
| * later blocks and put chunks of their data into the block |
| * we're building. */ |
| if (len > bp->extra_len) |
| panic("pullup more than extra (%d, %d, %d)\n", |
| n, BHLEN(bp), bp->extra_len); |
| QDEBUG checkb(bp, "before pullup"); |
| for (int i = 0; (i < bp->nr_extra_bufs) && len; i++) { |
| ebd = &bp->extra_data[i]; |
| if (!ebd->base || !ebd->len) |
| continue; |
| seglen = MIN(ebd->len, len); |
| memcpy(bp->wp, (void*)(ebd->base + ebd->off), seglen); |
| bp->wp += seglen; |
| len -= seglen; |
| ebd->len -= seglen; |
| ebd->off += seglen; |
| bp->extra_len -= seglen; |
| if (ebd->len == 0) { |
| kfree((void *)ebd->base); |
| ebd->off = 0; |
| ebd->base = 0; |
| } |
| } |
| /* maybe just call pullupblock recursively here */ |
| if (len) |
| panic("pullup %d bytes overdrawn\n", len); |
| QDEBUG checkb(bp, "after pullup"); |
| return bp; |
| } |
| |
| /* |
| * if not enough room in the first block, |
| * add another to the front of the list. |
| */ |
| if (bp->lim - bp->rp < n) { |
| nbp = block_alloc(n, MEM_WAIT); |
| nbp->next = bp; |
| bp = nbp; |
| } |
| |
| /* |
| * copy bytes from the trailing blocks into the first |
| */ |
| n -= BLEN(bp); |
| while ((nbp = bp->next)) { |
| i = BLEN(nbp); |
| if (i > n) { |
| memmove(bp->wp, nbp->rp, n); |
| pullupblockcnt++; |
| bp->wp += n; |
| nbp->rp += n; |
| QDEBUG checkb(bp, "pullupblock 1"); |
| return bp; |
| } else { |
| memmove(bp->wp, nbp->rp, i); |
| pullupblockcnt++; |
| bp->wp += i; |
| bp->next = nbp->next; |
| nbp->next = 0; |
| freeb(nbp); |
| n -= i; |
| if (n == 0) { |
| QDEBUG checkb(bp, "pullupblock 2"); |
| return bp; |
| } |
| } |
| } |
| freeb(bp); |
| return 0; |
| } |
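| |
| /* Usage sketch (illustrative, not from the original code): protocol input |
| * paths call pullupblock() so a fixed-size header can be read through a plain |
| * pointer. 'HDR_LEN' and 'struct my_hdr' are hypothetical: |
| * |
| * bp = pullupblock(bp, HDR_LEN); |
| * if (bp == NULL) |
| * return; // not enough data in the block list |
| * h = (struct my_hdr *)bp->rp; // HDR_LEN contiguous bytes at rp |
| */ |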
| |
| /* |
| * make sure the first block has at least n bytes in its main body |
| */ |
| struct block *pullupqueue(struct queue *q, int n) |
| { |
| struct block *b; |
| |
| /* TODO: lock to protect the queue links? */ |
| if ((BHLEN(q->bfirst) >= n)) |
| return q->bfirst; |
| q->bfirst = pullupblock(q->bfirst, n); |
| for (b = q->bfirst; b != NULL && b->next != NULL; b = b->next) ; |
| q->blast = b; |
| return q->bfirst; |
| } |
| |
| /* Throw away count bytes from the front of a block's extra_data. Returns the |
| * count of bytes thrown away. */ |
| static int pullext(struct block *bp, int count) |
| { |
| struct extra_bdata *ed; |
| int i, rem, bytes = 0; |
| |
| for (i = 0; bp->extra_len && count && i < bp->nr_extra_bufs; i++) { |
| ed = &bp->extra_data[i]; |
| rem = MIN(count, ed->len); |
| bp->extra_len -= rem; |
| count -= rem; |
| bytes += rem; |
| ed->off += rem; |
| ed->len -= rem; |
| if (ed->len == 0) { |
| kfree((void *)ed->base); |
| ed->base = 0; |
| ed->off = 0; |
| } |
| } |
| return bytes; |
| } |
| |
| /* Throw away count bytes from the end of a block's extra_data. Returns the |
| * count of bytes thrown away. */ |
| static int dropext(struct block *bp, int count) |
| { |
| struct extra_bdata *ed; |
| int i, rem, bytes = 0; |
| |
| for (i = bp->nr_extra_bufs - 1; bp->extra_len && count && i >= 0; i--) { |
| ed = &bp->extra_data[i]; |
| rem = MIN(count, ed->len); |
| bp->extra_len -= rem; |
| count -= rem; |
| bytes += rem; |
| ed->len -= rem; |
| if (ed->len == 0) { |
| kfree((void *)ed->base); |
| ed->base = 0; |
| ed->off = 0; |
| } |
| } |
| return bytes; |
| } |
| |
| /* |
| * throw away up to count bytes from a |
| * list of blocks. Return count of bytes |
| * thrown away. |
| */ |
| static int _pullblock(struct block **bph, int count, int free) |
| { |
| struct block *bp; |
| int n, bytes; |
| |
| bytes = 0; |
| if (bph == NULL) |
| return 0; |
| |
| while (*bph != NULL && count != 0) { |
| bp = *bph; |
| |
| n = MIN(BHLEN(bp), count); |
| bytes += n; |
| count -= n; |
| bp->rp += n; |
| n = pullext(bp, count); |
| bytes += n; |
| count -= n; |
| QDEBUG checkb(bp, "pullblock "); |
| if (BLEN(bp) == 0 && (free || count)) { |
| *bph = bp->next; |
| bp->next = NULL; |
| freeb(bp); |
| } |
| } |
| return bytes; |
| } |
| |
| int pullblock(struct block **bph, int count) |
| { |
| return _pullblock(bph, count, 1); |
| } |
| |
| /* |
| * trim to len bytes starting at offset |
| */ |
| struct block *trimblock(struct block *bp, int offset, int len) |
| { |
| uint32_t l, trim; |
| int olen = len; |
| |
| QDEBUG checkb(bp, "trimblock 1"); |
| if (blocklen(bp) < offset + len) { |
| freeblist(bp); |
| return NULL; |
| } |
| |
| l = _pullblock(&bp, offset, 0); |
| if (bp == NULL) |
| return NULL; |
| if (l != offset) { |
| freeblist(bp); |
| return NULL; |
| } |
| |
| while ((l = BLEN(bp)) < len) { |
| len -= l; |
| bp = bp->next; |
| } |
| |
| trim = BLEN(bp) - len; |
| trim -= dropext(bp, trim); |
| bp->wp -= trim; |
| |
| if (bp->next) { |
| freeblist(bp->next); |
| bp->next = NULL; |
| } |
| return bp; |
| } |
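| |
| /* Usage sketch (illustrative, not from the original code): extracting a |
| * payload that starts 'hdrlen' bytes into a packet and runs for 'paylen' |
| * bytes; both names are hypothetical. If the list is too short, trimblock() |
| * frees it and returns NULL: |
| * |
| * bp = trimblock(bp, hdrlen, paylen); |
| * if (bp == NULL) |
| * return; // packet shorter than hdrlen + paylen |
| */ |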
| |
| /* Adjust block @bp so that its size is exactly @len. |
| * If the size is increased, fill in the new contents with zeros. |
| * If the size is decreased, discard some of the old contents at the tail. */ |
| struct block *adjustblock(struct block *bp, int len) |
| { |
| struct extra_bdata *ebd; |
| void *buf; |
| int i; |
| |
| if (len < 0) { |
| freeb(bp); |
| return NULL; |
| } |
| |
| if (len == BLEN(bp)) |
| return bp; |
| |
| /* Shrink within block main body. */ |
| if (len <= BHLEN(bp)) { |
| free_block_extra(bp); |
| bp->wp = bp->rp + len; |
| QDEBUG checkb(bp, "adjustblock 1"); |
| return bp; |
| } |
| |
| /* Need to grow. */ |
| if (len > BLEN(bp)) { |
| /* Grow within block main body. */ |
| if (bp->extra_len == 0 && bp->rp + len <= bp->lim) { |
| memset(bp->wp, 0, len - BLEN(bp)); |
| bp->wp = bp->rp + len; |
| QDEBUG checkb(bp, "adjustblock 2"); |
| return bp; |
| } |
| /* Grow with extra data buffers. */ |
| buf = kzmalloc(len - BLEN(bp), MEM_WAIT); |
| block_append_extra(bp, (uintptr_t)buf, 0, len - BLEN(bp), |
| MEM_WAIT); |
| QDEBUG checkb(bp, "adjustblock 3"); |
| return bp; |
| } |
| |
| /* Shrink extra data buffers. |
| * len is how much of ebd we need to keep. |
| * extra_len is re-accumulated. */ |
| assert(bp->extra_len > 0); |
| len -= BHLEN(bp); |
| bp->extra_len = 0; |
| for (i = 0; i < bp->nr_extra_bufs; i++) { |
| ebd = &bp->extra_data[i]; |
| if (len <= ebd->len) |
| break; |
| len -= ebd->len; |
| bp->extra_len += ebd->len; |
| } |
| /* If len becomes zero, extra_data[i] should be freed. */ |
| if (len > 0) { |
| ebd = &bp->extra_data[i]; |
| ebd->len = len; |
| bp->extra_len += ebd->len; |
| i++; |
| } |
| for (; i < bp->nr_extra_bufs; i++) { |
| ebd = &bp->extra_data[i]; |
| if (ebd->base) |
| kfree((void*)ebd->base); |
| ebd->base = ebd->off = ebd->len = 0; |
| } |
| QDEBUG checkb(bp, "adjustblock 4"); |
| return bp; |
| } |
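| |
| /* Usage sketch (illustrative, not from the original code): zero-padding a |
| * short frame up to a minimum length before handing it to hardware. |
| * 'MIN_FRAME' is hypothetical; the bytes added by adjustblock() are zeroed: |
| * |
| * if (BLEN(bp) < MIN_FRAME) |
| * bp = adjustblock(bp, MIN_FRAME); |
| */ |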
| |
| /* Helper: removes and returns the first block from q */ |
| static struct block *pop_first_block(struct queue *q) |
| { |
| struct block *b = q->bfirst; |
| |
| q->dlen -= BLEN(b); |
| q->bytes_read += BLEN(b); |
| q->bfirst = b->next; |
| b->next = 0; |
| return b; |
| } |
| |
| /* Helper: copies up to copy_amt from a buf to a block's main body (b->wp) */ |
| static size_t copy_to_block_body(struct block *to, void *from, size_t copy_amt) |
| { |
| copy_amt = MIN(to->lim - to->wp, copy_amt); |
| memcpy(to->wp, from, copy_amt); |
| to->wp += copy_amt; |
| return copy_amt; |
| } |
| |
| /* Accounting helper. Block b in q lost amt extra_data */ |
| static void block_and_q_lost_extra(struct block *b, struct queue *q, size_t amt) |
| { |
| b->extra_len -= amt; |
| q->dlen -= amt; |
| q->bytes_read += amt; |
| } |
| |
| /* Helper: moves ebd from a block (in from_q) to another block. The *ebd is |
| * fixed in 'from', so we move its contents and zero it out in 'from'. |
| * |
| * Returns the length moved (0 on failure). */ |
| static size_t move_ebd(struct extra_bdata *ebd, struct block *to, |
| struct block *from, struct queue *from_q) |
| { |
| size_t ret = ebd->len; |
| |
| if (block_append_extra(to, ebd->base, ebd->off, ebd->len, MEM_ATOMIC)) |
| return 0; |
| block_and_q_lost_extra(from, from_q, ebd->len); |
| ebd->base = ebd->len = ebd->off = 0; |
| return ret; |
| } |
| |
| /* Copy up to len bytes from q->bfirst to @to, leaving the block in place. May |
| * return with less than len, but greater than 0, even if there is more |
| * available in q. |
| * |
| * Whenever we have copied something and things get tricky, we can just |
| * return. The trickiness comes from a bunch of variables: is the main body |
| * empty? How do we split the ebd? If our alloc fails, then we can fall back |
| * to @to's main body, but only if we haven't used it yet. */ |
| static size_t copy_from_first_block(struct queue *q, struct block *to, |
| size_t len) |
| { |
| struct block *from = q->bfirst; |
| size_t copy_amt, amt; |
| struct extra_bdata *ebd; |
| |
| assert(len < BLEN(from)); /* sanity */ |
| /* Try to extract from the main body */ |
| copy_amt = MIN(BHLEN(from), len); |
| if (copy_amt) { |
| copy_amt = copy_to_block_body(to, from->rp, copy_amt); |
| from->rp += copy_amt; |
| /* We only change dlen, (data len), not q->len, since the q |
| * still has the same block memory allocation (no kfrees |
| * happened) */ |
| q->dlen -= copy_amt; |
| q->bytes_read += copy_amt; |
| } |
| /* Try to extract the remainder from the extra data */ |
| len -= copy_amt; |
| for (int i = 0; (i < from->nr_extra_bufs) && len; i++) { |
| ebd = &from->extra_data[i]; |
| if (!ebd->base || !ebd->len) |
| continue; |
| if (len >= ebd->len) { |
| amt = move_ebd(ebd, to, from, q); |
| if (!amt) { |
| /* our internal alloc could have failed. this |
| * ebd is now the last one we'll consider. |
| * let's handle it separately and put it in the |
| * main body. */ |
| if (copy_amt) |
| return copy_amt; |
| copy_amt = copy_to_block_body(to, |
| (void*)ebd->base + |
| ebd->off, |
| ebd->len); |
| block_and_q_lost_extra(from, q, copy_amt); |
| break; |
| } |
| len -= amt; |
| copy_amt += amt; |
| continue; |
| } else { |
| /* If we're here, we reached our final ebd, which we'll |
| * need to split to get anything from it. */ |
| if (copy_amt) |
| return copy_amt; |
| copy_amt = copy_to_block_body(to, (void*)ebd->base + |
| ebd->off, len); |
| ebd->off += copy_amt; |
| ebd->len -= copy_amt; |
| block_and_q_lost_extra(from, q, copy_amt); |
| break; |
| } |
| } |
| if (len) |
| assert(copy_amt); /* sanity */ |
| return copy_amt; |
| } |
| |
| /* Return codes for __qbread and __try_qbread. */ |
| enum { |
| QBR_OK, |
| QBR_FAIL, |
| QBR_SPARE, /* we need a spare block */ |
| QBR_AGAIN, /* do it again, we are coalescing blocks */ |
| }; |
| |
| /* Helper and back-end for __qbread: extracts and returns a list of blocks |
| * containing up to len bytes. It may contain less than len even if q has more |
| * data. |
| * |
| * Returns a code interpreted by __qbread, and the returned blist in ret. */ |
| static int __try_qbread(struct queue *q, size_t len, int qio_flags, |
| struct block **real_ret, struct block *spare) |
| { |
| struct block *ret, *ret_last, *first; |
| size_t blen; |
| bool was_unwritable = FALSE; |
| |
| if (qio_flags & QIO_CAN_ERR_SLEEP) { |
| if (!qwait_and_ilock(q, qio_flags)) { |
| spin_unlock_irqsave(&q->lock); |
| return QBR_FAIL; |
| } |
| /* we qwaited and still hold the lock, so the q is not empty */ |
| first = q->bfirst; |
| } else { |
| spin_lock_irqsave(&q->lock); |
| first = q->bfirst; |
| if (!first) { |
| spin_unlock_irqsave(&q->lock); |
| return QBR_FAIL; |
| } |
| } |
| /* We need to check before adjusting q->len. We're checking the |
| * writer's sleep condition / tap condition. When set, we *might* be |
| * making an edge transition (from unwritable to writable), which needs |
| * to wake and fire taps. But, our read might not drain the queue below |
| * q->lim. We'll check again later to see if we should really wake |
| * them. */ |
| was_unwritable = !qwritable(q); |
| blen = BLEN(first); |
| if ((q->state & Qcoalesce) && (blen == 0)) { |
| freeb(pop_first_block(q)); |
| spin_unlock_irqsave(&q->lock); |
| /* Need to retry to make sure we have a first block */ |
| return QBR_AGAIN; |
| } |
| /* Qmsg: just return the first block. Be careful, since our caller |
| * might not read all of the block and thus drop bytes. Similar to |
| * SOCK_DGRAM. */ |
| if (q->state & Qmsg) { |
| ret = pop_first_block(q); |
| goto out_ok; |
| } |
| /* Let's get at least something first - makes the code easier. This |
| * way, we'll only ever split the block once. */ |
| if (blen <= len) { |
| ret = pop_first_block(q); |
| len -= blen; |
| } else { |
| /* need to split the block. we won't actually take the first |
| * block out of the queue - we're just extracting a little bit. |
| */ |
| if (!spare) { |
| /* We have nothing and need a spare block. Retry! */ |
| spin_unlock_irqsave(&q->lock); |
| return QBR_SPARE; |
| } |
| copy_from_first_block(q, spare, len); |
| ret = spare; |
| goto out_ok; |
| } |
| /* At this point, we just grabbed the first block. We can try to grab |
| * some more, up to len (if they want). */ |
| if (qio_flags & QIO_JUST_ONE_BLOCK) |
| goto out_ok; |
| ret_last = ret; |
| while (q->bfirst && (len > 0)) { |
| blen = BLEN(q->bfirst); |
| if ((q->state & Qcoalesce) && (blen == 0)) { |
| /* remove the intermediate 0 blocks */ |
| freeb(pop_first_block(q)); |
| continue; |
| } |
| if (blen > len) { |
| /* We could try to split the block, but that's a huge |
| * pain. For instance, we might need to move the main |
| * body of b into an extra_data of ret_last. lots of |
| * ways for that to fail, and lots of cases to consider. |
| * Easier to just bail out. This is why I did the first |
| * block above: we don't need to worry about this. */ |
| break; |
| } |
| ret_last->next = pop_first_block(q); |
| ret_last = ret_last->next; |
| len -= blen; |
| } |
| out_ok: |
| /* Don't wake them up or fire tap if we didn't drain enough. */ |
| if (!qwritable(q)) |
| was_unwritable = FALSE; |
| spin_unlock_irqsave(&q->lock); |
| if (was_unwritable) { |
| if (q->kick && !(qio_flags & QIO_DONT_KICK)) |
| q->kick(q->arg); |
| rendez_wakeup(&q->wr); |
| qwake_cb(q, FDTAP_FILT_WRITABLE); |
| } |
| *real_ret = ret; |
| return QBR_OK; |
| } |
| |
| /* Helper and front-end for __try_qbread: extracts and returns a list of blocks |
| * containing up to len bytes. It may contain less than len even if q has more |
| * data. |
| * |
| * Returns 0 if the q is closed, if it would require blocking and !CAN_BLOCK, or |
| * if it required a spare and the memory allocation failed. |
| * |
| * Technically, there's a weird corner case with !Qcoalesce and Qmsg where you |
| * could get a zero length block back. */ |
| static struct block *__qbread(struct queue *q, size_t len, int qio_flags, |
| int mem_flags) |
| { |
| ERRSTACK(1); |
| struct block *ret = 0; |
| struct block *volatile spare = 0; /* volatile for the waserror */ |
| |
| /* __try_qbread can throw, based on qio flags. */ |
| if ((qio_flags & QIO_CAN_ERR_SLEEP) && waserror()) { |
| if (spare) |
| freeb(spare); |
| nexterror(); |
| } |
| while (1) { |
| switch (__try_qbread(q, len, qio_flags, &ret, spare)) { |
| case QBR_OK: |
| case QBR_FAIL: |
| if (spare && (ret != spare)) |
| freeb(spare); |
| goto out_ret; |
| case QBR_SPARE: |
| assert(!spare); |
| /* Due to some nastiness, we need a fresh block so we |
| * can read out anything from the queue. 'len' seems |
| * like a reasonable amount. Maybe we can get away with |
| * less. */ |
| spare = block_alloc(len, mem_flags); |
| if (!spare) { |
| /* Careful here: a memory failure (possible with |
| * MEM_ATOMIC) could look like 'no data in the |
| * queue' (QBR_FAIL). The only caller that passes |
| * MEM_ATOMIC is qget(), which knows it won't |
| * need a spare, due to its len argument |
| * (SIZE_MAX). Spares are only needed when we |
| * need to split a block. */ |
| ret = 0; |
| goto out_ret; |
| } |
| break; |
| case QBR_AGAIN: |
| /* if the first block is 0 and we are Qcoalesce, then |
| * we'll need to try again. We bounce out of __try so |
| * we can perform the "is there a block" logic again |
| * from the top. */ |
| break; |
| } |
| } |
| assert(0); |
| out_ret: |
| if (qio_flags & QIO_CAN_ERR_SLEEP) |
| poperror(); |
| return ret; |
| } |
| |
| /* |
| * get next block from a queue, return null if nothing there |
| */ |
| struct block *qget(struct queue *q) |
| { |
| /* since len == SIZE_MAX, we should never need to do a mem alloc */ |
| return __qbread(q, SIZE_MAX, QIO_JUST_ONE_BLOCK, MEM_ATOMIC); |
| } |
| |
| /* Throw away the next 'len' bytes in the queue returning the number actually |
| * discarded. |
| * |
| * If the bytes are in the queue, then they must be discarded. The only time to |
| * return less than len is if the q itself has less than len bytes. |
| * |
| * This won't trigger a kick when waking up any sleepers. This seems to be Plan |
| * 9's intent, since the TCP stack will deadlock if qdiscard kicks. */ |
| size_t qdiscard(struct queue *q, size_t len) |
| { |
| struct block *blist; |
| size_t removed_amt; |
| size_t sofar = 0; |
| |
| /* This is racy. There could be multiple qdiscarders or other |
| * consumers, where the consumption could be interleaved. */ |
| while (qlen(q) && len) { |
| blist = __qbread(q, len, QIO_DONT_KICK, MEM_WAIT); |
| removed_amt = freeblist(blist); |
| sofar += removed_amt; |
| len -= removed_amt; |
| } |
| return sofar; |
| } |
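| |
| /* Usage sketch (illustrative, not from the original code): a transport that |
| * keeps unacked data in a queue can drop the acked prefix without copying it |
| * out. 'acked' is hypothetical: |
| * |
| * dropped = qdiscard(q, acked); |
| * // dropped < acked only if the queue itself held less than acked |
| */ |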
| |
| ssize_t qpass(struct queue *q, struct block *b) |
| { |
| return __qbwrite(q, b, QIO_LIMIT | QIO_DROP_OVERFLOW); |
| } |
| |
| ssize_t qpassnolim(struct queue *q, struct block *b) |
| { |
| return __qbwrite(q, b, 0); |
| } |
| |
| /* |
| * if the allocated space is way out of line with the used |
| * space, reallocate to a smaller block |
| */ |
| struct block *packblock(struct block *bp) |
| { |
| struct block **l, *nbp; |
| int n; |
| |
| if (bp->extra_len) |
| return bp; |
| for (l = &bp; *l; l = &(*l)->next) { |
| nbp = *l; |
| n = BLEN(nbp); |
| if ((n << 2) < BALLOC(nbp)) { |
| *l = block_alloc(n, MEM_WAIT); |
| memmove((*l)->wp, nbp->rp, n); |
| (*l)->wp += n; |
| (*l)->next = nbp->next; |
| freeb(nbp); |
| } |
| } |
| |
| return bp; |
| } |
| |
| /* Add an extra_data entry to newb at newb_idx pointing to b's body, starting at |
| * body_rp, for up to len. Returns the len consumed. |
| * |
| * The base is 'b', so that we can kfree it later. This currently ties us to |
| * using kfree for the release method for all extra_data. |
| * |
| * It is possible to have a body size that is 0, if there is no offset, and |
| * b->wp == b->rp. This will have an extra data entry of 0 length. */ |
| static size_t point_to_body(struct block *b, uint8_t *body_rp, |
| struct block *newb, unsigned int newb_idx, |
| size_t len) |
| { |
| struct extra_bdata *ebd = &newb->extra_data[newb_idx]; |
| |
| assert(newb_idx < newb->nr_extra_bufs); |
| |
| kmalloc_incref(b); |
| ebd->base = (uintptr_t)b; |
| ebd->off = (uint32_t)(body_rp - (uint8_t*)b); |
| ebd->len = MIN(b->wp - body_rp, len); /* think of body_rp as b->rp */ |
| assert((int)ebd->len >= 0); |
| newb->extra_len += ebd->len; |
| return ebd->len; |
| } |
| |
| /* Add an extra_data entry to newb at newb_idx pointing to b's b_idx'th |
| * extra_data buf, at b_off within that buffer, for up to len. Returns the len |
| * consumed. |
| * |
| * We can have blocks with 0 length, but they are still refcnt'd. See above. */ |
| static size_t point_to_buf(struct block *b, unsigned int b_idx, uint32_t b_off, |
| struct block *newb, unsigned int newb_idx, |
| size_t len) |
| { |
| struct extra_bdata *n_ebd = &newb->extra_data[newb_idx]; |
| struct extra_bdata *b_ebd = &b->extra_data[b_idx]; |
| |
| assert(b_idx < b->nr_extra_bufs); |
| assert(newb_idx < newb->nr_extra_bufs); |
| |
| kmalloc_incref((void*)b_ebd->base); |
| n_ebd->base = b_ebd->base; |
| n_ebd->off = b_ebd->off + b_off; |
| n_ebd->len = MIN(b_ebd->len - b_off, len); |
| newb->extra_len += n_ebd->len; |
| return n_ebd->len; |
| } |
| |
| /* given a string of blocks, sets up the new block's extra_data such that it |
| * *points* to the contents of the blist [offset, len + offset). This does not |
| * make a separate copy of the contents of the blist. |
| * |
| * returns 0 on success. the only failure is if the extra_data array was too |
| * small, so this returns a positive integer saying how big the extra_data needs |
| * to be. |
| * |
| * callers are responsible for protecting the list structure. */ |
| static int __blist_clone_to(struct block *blist, struct block *newb, int len, |
| uint32_t offset) |
| { |
| struct block *b, *first; |
| unsigned int nr_bufs = 0; |
| unsigned int b_idx, newb_idx = 0; |
| uint8_t *first_main_body = 0; |
| ssize_t sofar = 0; |
| |
| /* find the first block; keep offset relative to the latest b in the |
| * list */ |
| for (b = blist; b; b = b->next) { |
| if (BLEN(b) > offset) |
| break; |
| offset -= BLEN(b); |
| } |
| /* qcopy semantics: if you asked for an offset outside the block list, |
| * you get an empty block back */ |
| if (!b) |
| return 0; |
| first = b; |
| sofar -= offset; /* don't count the remaining offset in the first b */ |
| /* upper bound for how many buffers we'll need in newb */ |
| for (/* b is set */; b; b = b->next) { |
| nr_bufs += BHLEN(b) ? 1 : 0; |
| /* still assuming nr_extra == nr_valid */ |
| nr_bufs += b->nr_extra_bufs; |
| sofar += BLEN(b); |
| if (sofar > len) |
| break; |
| } |
| /* we might be holding a spinlock here, so we won't wait for kmalloc */ |
| if (block_add_extd(newb, nr_bufs, 0) != 0) { |
| /* caller will need to alloc these, then re-call us */ |
| return nr_bufs; |
| } |
| for (b = first; b && len; b = b->next) { |
| b_idx = 0; |
| if (offset) { |
| if (offset < BHLEN(b)) { |
| /* off is in the main body */ |
| len -= point_to_body(b, b->rp + offset, newb, |
| newb_idx, len); |
| newb_idx++; |
| } else { |
| /* off is in one of the buffers (or just past |
| * the last one). we're not going to point to |
| * b's main body at all. */ |
| offset -= BHLEN(b); |
| assert(b->extra_data); |
| /* assuming these extrabufs are packed, or at |
| * least that len isn't gibberish */ |
| while (b->extra_data[b_idx].len <= offset) { |
| offset -= b->extra_data[b_idx].len; |
| b_idx++; |
| } |
| /* now offset is set to our offset in the |
| * b_idx'th buf */ |
| len -= point_to_buf(b, b_idx, offset, newb, |
| newb_idx, len); |
| newb_idx++; |
| b_idx++; |
| } |
| offset = 0; |
| } else { |
| if (BHLEN(b)) { |
| len -= point_to_body(b, b->rp, newb, newb_idx, |
| len); |
| newb_idx++; |
| } |
| } |
| /* knock out all remaining bufs. we only did one point_to_ op |
| * by now, and any point_to_ could be our last if it consumed |
| * all of len. */ |
| for (int i = b_idx; (i < b->nr_extra_bufs) && len; i++) { |
| len -= point_to_buf(b, i, 0, newb, newb_idx, len); |
| newb_idx++; |
| } |
| } |
| return 0; |
| } |
| |
| struct block *blist_clone(struct block *blist, int header_len, int len, |
| uint32_t offset) |
| { |
| int ret; |
| struct block *newb = block_alloc(header_len, MEM_WAIT); |
| |
| do { |
| ret = __blist_clone_to(blist, newb, len, offset); |
| if (ret) |
| block_add_extd(newb, ret, MEM_WAIT); |
| } while (ret); |
| return newb; |
| } |
| |
| /* given a queue, makes a single block with header_len reserved space in the |
| * block main body, and the contents of [offset, len + offset) pointed to in the |
| * new block's extra_data. This does not make a copy of the q's contents, though |
| * you do have a ref count on the memory. */ |
| struct block *qclone(struct queue *q, int header_len, int len, uint32_t offset) |
| { |
| int ret; |
| struct block *newb = block_alloc(header_len, MEM_WAIT); |
| /* the while loop should rarely be used: it would require someone |
| * concurrently adding to the queue. */ |
| do { |
| /* TODO: RCU protect the q list (b->next) (need read lock) */ |
| spin_lock_irqsave(&q->lock); |
| ret = __blist_clone_to(q->bfirst, newb, len, offset); |
| spin_unlock_irqsave(&q->lock); |
| if (ret) |
| block_add_extd(newb, ret, MEM_WAIT); |
| } while (ret); |
| return newb; |
| } |
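| |
| /* Usage sketch (illustrative, not from the original code): building a |
| * transmit block that references the queue's payload instead of copying it. |
| * 'hdrlen', 'seglen', and 'off' are hypothetical; the caller frees 'b' as |
| * usual, and the refcounts keep the underlying payload memory alive: |
| * |
| * b = qclone(q, hdrlen, seglen, off); |
| * // fill in up to hdrlen bytes of header in b's main body, then send b |
| */ |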
| |
| /* |
| * copy from offset in the queue |
| */ |
| struct block *qcopy_old(struct queue *q, int len, uint32_t offset) |
| { |
| int sofar; |
| int n; |
| struct block *b, *nb; |
| uint8_t *p; |
| |
| nb = block_alloc(len, MEM_WAIT); |
| |
| spin_lock_irqsave(&q->lock); |
| |
| /* go to offset */ |
| b = q->bfirst; |
| for (sofar = 0;; sofar += n) { |
| if (b == NULL) { |
| spin_unlock_irqsave(&q->lock); |
| return nb; |
| } |
| n = BLEN(b); |
| if (sofar + n > offset) { |
| p = b->rp + offset - sofar; |
| n -= offset - sofar; |
| break; |
| } |
| QDEBUG checkb(b, "qcopy"); |
| b = b->next; |
| } |
| |
| /* copy bytes from there */ |
| for (sofar = 0; sofar < len;) { |
| if (n > len - sofar) |
| n = len - sofar; |
| PANIC_EXTRA(b); |
| memmove(nb->wp, p, n); |
| qcopycnt += n; |
| sofar += n; |
| nb->wp += n; |
| b = b->next; |
| if (b == NULL) |
| break; |
| n = BLEN(b); |
| p = b->rp; |
| } |
| spin_unlock_irqsave(&q->lock); |
| |
| return nb; |
| } |
| |
| struct block *qcopy(struct queue *q, int len, uint32_t offset) |
| { |
| #ifdef CONFIG_BLOCK_EXTRAS |
| return qclone(q, 0, len, offset); |
| #else |
| return qcopy_old(q, len, offset); |
| #endif |
| } |
| |
| static void qinit_common(struct queue *q) |
| { |
| spinlock_init_irqsave(&q->lock); |
| rendez_init(&q->rr); |
| rendez_init(&q->wr); |
| } |
| |
| /* |
| * called by non-interrupt code |
| */ |
| struct queue *qopen(int limit, int msg, void (*kick) (void *), void *arg) |
| { |
| struct queue *q; |
| |
| q = kzmalloc(sizeof(struct queue), 0); |
| if (q == 0) |
| return 0; |
| qinit_common(q); |
| |
| q->limit = q->inilim = limit; |
| q->kick = kick; |
| q->arg = arg; |
| q->state = msg; |
| q->eof = 0; |
| |
| return q; |
| } |
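| |
| /* Usage sketch (illustrative, not from the original code): a typical device |
| * output queue, where the kick routine starts the hardware whenever data is |
| * queued. 'mydev_kick' and 'dev' are hypothetical; passing Qmsg instead of 0 |
| * gives datagram-style (one block per read) semantics: |
| * |
| * q = qopen(64 * 1024, 0, mydev_kick, dev); |
| * if (q == NULL) |
| * ... handle allocation failure ... |
| */ |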
| |
| /* open a queue to be bypassed */ |
| struct queue *qbypass(void (*bypass) (void *, struct block *), void *arg) |
| { |
| struct queue *q; |
| |
| q = kzmalloc(sizeof(struct queue), 0); |
| if (q == 0) |
| return 0; |
| qinit_common(q); |
| |
| q->limit = 0; |
| q->arg = arg; |
| q->bypass = bypass; |
| q->state = 0; |
| |
| return q; |
| } |
| |
| static int notempty(void *a) |
| { |
| struct queue *q = a; |
| |
| return (q->state & Qclosed) || q->bfirst != 0; |
| } |
| |
| /* Block, waiting for the queue to be non-empty or closed. Returns with |
| * the spinlock held. Returns TRUE when the queue is not empty, FALSE if it |
| * was naturally closed. Throws an error o/w. */ |
| static bool qwait_and_ilock(struct queue *q, int qio_flags) |
| { |
| while (1) { |
| spin_lock_irqsave(&q->lock); |
| if (q->bfirst != NULL) |
| return TRUE; |
| if (q->state & Qclosed) { |
| if (++q->eof > 3) { |
| spin_unlock_irqsave(&q->lock); |
| error(EPIPE, |
| "multiple reads on a closed queue"); |
| } |
| if (q->err[0]) { |
| spin_unlock_irqsave(&q->lock); |
| error(EPIPE, q->err); |
| } |
| return FALSE; |
| } |
| if (qio_flags & QIO_NON_BLOCK) { |
| spin_unlock_irqsave(&q->lock); |
| error(EAGAIN, "queue empty"); |
| } |
| spin_unlock_irqsave(&q->lock); |
| /* As with the producer side, we check for a condition while |
| * holding the q->lock, decide to sleep, then unlock. It's like |
| * the "check, signal, check again" pattern, but we do it |
| * conditionally. Both sides agree synchronously to do it, and |
| * those decisions are made while holding q->lock. I think this |
| * is OK. |
| * |
| * The invariant is that no reader sleeps when the queue has |
| * data. While holding the rendez lock, if we see there's no |
| * data, we'll sleep. Since we saw there was no data, the next |
| * writer will see (or already saw) no data, and then the writer |
| * decides to rendez_wake, which will grab the rendez lock. If |
| * the writer already did that, then we'll see notempty when we |
| * do our check-again. */ |
| rendez_sleep(&q->rr, notempty, q); |
| } |
| } |
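| |
| /* The sleep above pairs with the producer side in __qbwrite(): the reader |
| * sleeps on q->rr with notempty() as the condition, and the writer calls |
| * rendez_wakeup(&q->rr) after enqueueing into a previously empty queue. |
| * Schematically: |
| * |
| * reader: rendez_sleep(&q->rr, notempty, q); |
| * writer: enqueue_blist(q, b); ... rendez_wakeup(&q->rr); |
| */ |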
| |
| /* |
| * add a block list to a queue |
| * XXX basically the same as enqueue_blist, and has no locking! |
| */ |
| void qaddlist(struct queue *q, struct block *b) |
| { |
| /* TODO: q lock? */ |
| /* queue the block */ |
| if (q->bfirst) |
| q->blast->next = b; |
| else |
| q->bfirst = b; |
| q->dlen += blocklen(b); |
| while (b->next) |
| b = b->next; |
| q->blast = b; |
| } |
| |
| static size_t read_from_block(struct block *b, uint8_t *to, size_t amt) |
| { |
| size_t copy_amt, retval = 0; |
| struct extra_bdata *ebd; |
| |
| copy_amt = MIN(BHLEN(b), amt); |
| memcpy(to, b->rp, copy_amt); |
| /* advance the rp, since this block might not be completely consumed and |
| * future reads need to know where to pick up from */ |
| b->rp += copy_amt; |
| to += copy_amt; |
| amt -= copy_amt; |
| retval += copy_amt; |
| for (int i = 0; (i < b->nr_extra_bufs) && amt; i++) { |
| ebd = &b->extra_data[i]; |
| /* skip empty entries. if we track this in the struct block, we |
| * can just start the for loop early */ |
| if (!ebd->base || !ebd->len) |
| continue; |
| copy_amt = MIN(ebd->len, amt); |
| memcpy(to, (void*)(ebd->base + ebd->off), copy_amt); |
| /* we're actually consuming the entries, just like how we |
| * advance rp up above, and might only consume part of one. */ |
| ebd->len -= copy_amt; |
| ebd->off += copy_amt; |
| b->extra_len -= copy_amt; |
| if (!ebd->len) { |
| /* we don't actually have to decref here. it's also |
| * done in freeb(). this is the earliest we can free. |
| */ |
| kfree((void*)ebd->base); |
| ebd->base = ebd->off = 0; |
| } |
| to += copy_amt; |
| amt -= copy_amt; |
| retval += copy_amt; |
| } |
| return retval; |
| } |
| |
| /* |
| * copy the contents of a string of blocks into |
| * memory. emptied blocks are freed. return |
| * pointer to first unconsumed block. |
| */ |
| struct block *bl2mem(uint8_t *p, struct block *b, int n) |
| { |
| int i; |
| struct block *next; |
| |
| /* could be slicker here, since read_from_block is smart */ |
| for (; b != NULL; b = next) { |
| i = BLEN(b); |
| if (i > n) { |
| /* partial block, consume some */ |
| read_from_block(b, p, n); |
| return b; |
| } |
| /* full block, consume all and move on */ |
| i = read_from_block(b, p, i); |
| n -= i; |
| p += i; |
| next = b->next; |
| freeb(b); |
| } |
| return NULL; |
| } |
| |
| /* Extract the contents of all blocks and copy to va, up to len. Returns the |
| * actual amount copied. */ |
| static size_t read_all_blocks(struct block *b, void *va, size_t len) |
| { |
| size_t sofar = 0; |
| struct block *next; |
| |
| do { |
| assert(va); |
| assert(b->rp); |
| sofar += read_from_block(b, va + sofar, len - sofar); |
| if (BLEN(b) && b->next) |
| panic("Failed to drain entire block (Qmsg?) but had a next!"); |
| next = b->next; |
| freeb(b); |
| b = next; |
| } while (b); |
| return sofar; |
| } |
| |
| /* |
| * copy the contents of memory into a string of blocks. |
| * return NULL on error. |
| */ |
| struct block *mem2bl(uint8_t *p, int len) |
| { |
| ERRSTACK(1); |
| int n; |
| struct block *b, *first, **l; |
| |
| first = NULL; |
| l = &first; |
| if (waserror()) { |
| freeblist(first); |
| nexterror(); |
| } |
| do { |
| n = len; |
| if (n > Maxatomic) |
| n = Maxatomic; |
| |
| *l = b = block_alloc(n, MEM_WAIT); |
| /* TODO consider extra_data */ |
| memmove(b->wp, p, n); |
| b->wp += n; |
| p += n; |
| len -= n; |
| l = &b->next; |
| } while (len > 0); |
| poperror(); |
| |
| return first; |
| } |
| |
| /* |
| * put a block back to the front of the queue |
| * called with q ilocked |
| */ |
| void qputback(struct queue *q, struct block *b) |
| { |
| b->next = q->bfirst; |
| if (q->bfirst == NULL) |
| q->blast = b; |
| q->bfirst = b; |
| q->dlen += BLEN(b); |
| /* qputback seems to undo a read, so we can undo the accounting too. */ |
| q->bytes_read -= BLEN(b); |
| } |
| |
| /* |
| * get next block from a queue (up to a limit) |
| * |
| */ |
| struct block *qbread(struct queue *q, size_t len) |
| { |
| return __qbread(q, len, QIO_JUST_ONE_BLOCK | QIO_CAN_ERR_SLEEP, |
| MEM_WAIT); |
| } |
| |
| struct block *qbread_nonblock(struct queue *q, size_t len) |
| { |
| return __qbread(q, len, QIO_JUST_ONE_BLOCK | QIO_CAN_ERR_SLEEP | |
| QIO_NON_BLOCK, MEM_WAIT); |
| } |
| |
| /* read up to len from a queue into vp. */ |
| size_t qread(struct queue *q, void *va, size_t len) |
| { |
| struct block *blist = __qbread(q, len, QIO_CAN_ERR_SLEEP, MEM_WAIT); |
| |
| if (!blist) |
| return 0; |
| return read_all_blocks(blist, va, len); |
| } |
| |
| size_t qread_nonblock(struct queue *q, void *va, size_t len) |
| { |
| struct block *blist = __qbread(q, len, QIO_CAN_ERR_SLEEP | |
| QIO_NON_BLOCK, MEM_WAIT); |
| |
| if (!blist) |
| return 0; |
| return read_all_blocks(blist, va, len); |
| } |
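| |
| /* Usage sketch (illustrative, not from the original code): a read path |
| * copying queue data into a caller's buffer. 'buf' and 'count' are |
| * hypothetical: |
| * |
| * n = qread(q, buf, count); // may block; may return less than count |
| * if (n == 0) |
| * ... queue was closed with no pending data ... |
| */ |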
| |
| /* This is the rendez wake condition for writers. */ |
| static int qwriter_should_wake(void *a) |
| { |
| struct queue *q = a; |
| |
| return qwritable(q) || (q->state & Qclosed); |
| } |
| |
| /* Helper: enqueues a list of blocks to a queue. Returns the total length. */ |
| static size_t enqueue_blist(struct queue *q, struct block *b) |
| { |
| size_t dlen; |
| |
| if (q->bfirst) |
| q->blast->next = b; |
| else |
| q->bfirst = b; |
| dlen = BLEN(b); |
| while (b->next) { |
| b = b->next; |
| dlen += BLEN(b); |
| } |
| q->blast = b; |
| q->dlen += dlen; |
| return dlen; |
| } |
| |
| /* Adds block (which can be a list of blocks) to the queue, subject to |
| * qio_flags. Returns the length written on success or -1 on non-throwable |
| * error. Adjust qio_flags to control the value-added features! */ |
| static ssize_t __qbwrite(struct queue *q, struct block *b, int qio_flags) |
| { |
| ssize_t ret; |
| bool was_unreadable; |
| |
| if (q->bypass) { |
| ret = blocklen(b); |
| (*q->bypass) (q->arg, b); |
| return ret; |
| } |
| spin_lock_irqsave(&q->lock); |
| was_unreadable = q->dlen == 0; |
| if (q->state & Qclosed) { |
| spin_unlock_irqsave(&q->lock); |
| freeblist(b); |
| if (!(qio_flags & QIO_CAN_ERR_SLEEP)) |
| return -1; |
| if (q->err[0]) |
| error(EPIPE, q->err); |
| else |
| error(EPIPE, "connection closed"); |
| } |
| if ((qio_flags & QIO_LIMIT) && (q->dlen >= q->limit)) { |
| /* drop overflow takes priority over regular non-blocking */ |
| if ((qio_flags & QIO_DROP_OVERFLOW) |
| || (q->state & Qdropoverflow)) { |
| spin_unlock_irqsave(&q->lock); |
| freeb(b); |
| return -1; |
| } |
| /* People shouldn't set NON_BLOCK without CAN_ERR, but we can be |
| * nice and catch it. */ |
| if ((qio_flags & QIO_CAN_ERR_SLEEP) |
| && (qio_flags & QIO_NON_BLOCK)) { |
| spin_unlock_irqsave(&q->lock); |
| freeb(b); |
| error(EAGAIN, "queue full"); |
| } |
| } |
| ret = enqueue_blist(q, b); |
| QDEBUG checkb(b, "__qbwrite"); |
| spin_unlock_irqsave(&q->lock); |
| /* TODO: not sure if the usage of a kick is mutually exclusive with a |
| * wakeup, meaning that actual users either want a kick or have |
| * qreaders. */ |
| if (q->kick && (was_unreadable || (q->state & Qkick))) |
| q->kick(q->arg); |
| if (was_unreadable) { |
| /* Unlike the read side, there's no double-check to make sure |
| * the queue transitioned across an edge. We know we added |
| * something, so that's enough. We wake if the queue was empty. |
| * Both sides are the same, in that the condition for which we |
| * do the rendez_wakeup() is the same as the condition done for |
| * the rendez_sleep(). */ |
| rendez_wakeup(&q->rr); |
| qwake_cb(q, FDTAP_FILT_READABLE); |
| } |
| /* |
| * flow control, wait for queue to get below the limit |
| * before allowing the process to continue and queue |
| * more. We do this here so that postnote can only |
| * interrupt us after the data has been queued. This |
| * means that things like 9p flushes and ssl messages |
| * will not be disrupted by software interrupts. |
| * |
| * Note - this is moderately dangerous since a process |
| * that keeps getting interrupted and rewriting will |
| * queue infinite crud. |
| */ |
| if ((qio_flags & QIO_CAN_ERR_SLEEP) && |
| !(q->state & Qdropoverflow) && !(qio_flags & QIO_NON_BLOCK)) { |
| /* This is a racy peek at the q status. If we accidentally |
| * block, our rendez will return. The rendez's peek |
| * (qwriter_should_wake) is also racy w.r.t. the q's spinlock |
| * (that lock protects writes, but not reads). |
| * |
| * Here's the deal: when holding the rendez lock, if we see the |
| * sleep condition, the consumer will wake us. The condition |
| * will only ever be changed by the next qbread() (consumer, |
| * changes q->dlen). That code will do a rendez wake, which |
| * will spin on the rendez lock, meaning it won't proceed until |
| * we either see the new state (and return) or put ourselves on |
| * the rendez, and wake up. |
| * |
| * The pattern is one side writes mem, then signals. Our side |
| * checks the signal, then reads the mem. The goal is to not miss |
| * the signal AND also miss the memory write. In this |
| * specific case, the signal is actually synchronous (the rendez |
| * lock) and not basic shared memory. |
| * |
| * Oh, and we spin in case we woke early and someone else filled |
| * the queue, mesa-style. */ |
| while (!qwriter_should_wake(q)) |
| rendez_sleep(&q->wr, qwriter_should_wake, q); |
| } |
| return ret; |
| } |
| |
| /* |
| * add a block to a queue obeying flow control |
| */ |
| ssize_t qbwrite(struct queue *q, struct block *b) |
| { |
| return __qbwrite(q, b, QIO_CAN_ERR_SLEEP | QIO_LIMIT); |
| } |
| |
| ssize_t qbwrite_nonblock(struct queue *q, struct block *b) |
| { |
| return __qbwrite(q, b, QIO_CAN_ERR_SLEEP | QIO_LIMIT | QIO_NON_BLOCK); |
| } |
| |
| ssize_t qibwrite(struct queue *q, struct block *b) |
| { |
| return __qbwrite(q, b, 0); |
| } |
| |
| /* Helper, allocs a block and copies [from, from + len) into it. Returns the |
| * block on success, 0 on failure. */ |
| static struct block *build_block(void *from, size_t len, int mem_flags) |
| { |
| struct block *b; |
| void *ext_buf; |
| |
| /* If len is small, we don't need to bother with the extra_data. But |
| * until the whole stack can handle extd blocks, we'll use them |
| * unconditionally. */ |
| #ifdef CONFIG_BLOCK_EXTRAS |
| /* allocb builds in 128 bytes of header space to all blocks, but this is |
| * only available via padblock (to the left). we also need some space |
| * for pullupblock for some basic headers (like icmp) that get written |
| * in directly */ |
| b = block_alloc(64, mem_flags); |
| if (!b) |
| return 0; |
| ext_buf = kmalloc(len, mem_flags); |
| if (!ext_buf) { |
| kfree(b); |
| return 0; |
| } |
| memcpy(ext_buf, from, len); |
| if (block_add_extd(b, 1, mem_flags)) { |
| kfree(ext_buf); |
| kfree(b); |
| return 0; |
| } |
| b->extra_data[0].base = (uintptr_t)ext_buf; |
| b->extra_data[0].off = 0; |
| b->extra_data[0].len = len; |
| b->extra_len += len; |
| #else |
| b = block_alloc(len, mem_flags); |
| if (!b) |
| return 0; |
| memmove(b->wp, from, len); |
| b->wp += len; |
| #endif |
| return b; |
| } |
| |
| static ssize_t __qwrite(struct queue *q, void *vp, size_t len, int mem_flags, |
| int qio_flags) |
| { |
| ERRSTACK(1); |
| size_t n; |
| volatile size_t sofar = 0; /* volatile for the waserror */ |
| struct block *b; |
| uint8_t *p = vp; |
| |
| /* Only some callers can throw. Others might be in a context where |
| * waserror isn't safe. */ |
| if ((qio_flags & QIO_CAN_ERR_SLEEP) && waserror()) { |
| /* Any error (EAGAIN for nonblock, syscall aborted, even EPIPE) |
| * after some data has been sent should be treated as a partial |
| * write. */ |
| if (sofar) |
| goto out_ok; |
| nexterror(); |
| } |
| do { |
| n = len - sofar; |
| /* This is 64K, the max amount per single block. Still a good |
| * value? */ |
| if (n > Maxatomic) |
| n = Maxatomic; |
| b = build_block(p + sofar, n, mem_flags); |
| if (!b) |
| break; |
| if (__qbwrite(q, b, qio_flags) < 0) |
| break; |
| sofar += n; |
| } while ((sofar < len) && (q->state & Qmsg) == 0); |
| out_ok: |
| if (qio_flags & QIO_CAN_ERR_SLEEP) |
| poperror(); |
| return sofar; |
| } |
| |
| ssize_t qwrite(struct queue *q, void *vp, int len) |
| { |
| return __qwrite(q, vp, len, MEM_WAIT, QIO_CAN_ERR_SLEEP | QIO_LIMIT); |
| } |
| |
| ssize_t qwrite_nonblock(struct queue *q, void *vp, int len) |
| { |
| return __qwrite(q, vp, len, MEM_WAIT, QIO_CAN_ERR_SLEEP | QIO_LIMIT | |
| QIO_NON_BLOCK); |
| } |
| |
| ssize_t qiwrite(struct queue *q, void *vp, int len) |
| { |
| return __qwrite(q, vp, len, MEM_ATOMIC, 0); |
| } |
| |
| /* |
| * be extremely careful when calling this, |
| * as there is no reference accounting |
| */ |
| void qfree(struct queue *q) |
| { |
| qclose(q); |
| kfree(q); |
| } |
| |
| /* |
| * Mark a queue as closed. No further IO is permitted. |
| * All blocks are released. |
| */ |
| void qclose(struct queue *q) |
| { |
| struct block *bfirst; |
| |
| if (q == NULL) |
| return; |
| |
| /* mark it */ |
| spin_lock_irqsave(&q->lock); |
| q->state |= Qclosed; |
| q->state &= ~Qdropoverflow; |
| q->err[0] = 0; |
| bfirst = q->bfirst; |
| q->bfirst = 0; |
| q->dlen = 0; |
| spin_unlock_irqsave(&q->lock); |
| |
| /* free queued blocks */ |
| freeblist(bfirst); |
| |
| /* wake up readers/writers */ |
| rendez_wakeup(&q->rr); |
| rendez_wakeup(&q->wr); |
| qwake_cb(q, FDTAP_FILT_HANGUP); |
| } |
| |
| /* Mark a queue as closed. Wakeup any readers. Don't remove queued blocks. |
| * |
| * msg will be the errstr received by any waiters (qread, qbread, etc). If |
| * there is no message, which is what also happens during a natural qclose(), |
| * those waiters will simply return 0. qwriters will always error() on a |
| * closed/hungup queue. */ |
| void qhangup(struct queue *q, char *msg) |
| { |
| /* mark it */ |
| spin_lock_irqsave(&q->lock); |
| q->state |= Qclosed; |
| if (msg == 0 || *msg == 0) |
| q->err[0] = 0; |
| else |
| strlcpy(q->err, msg, ERRMAX); |
| spin_unlock_irqsave(&q->lock); |
| |
| /* wake up readers/writers */ |
| rendez_wakeup(&q->rr); |
| rendez_wakeup(&q->wr); |
| qwake_cb(q, FDTAP_FILT_HANGUP); |
| } |
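| |
| /* Usage sketch (illustrative, not from the original code): an error path that |
| * hangs up a conversation's read queue so that blocked readers and writers see |
| * the error. 'conv' is hypothetical: |
| * |
| * qhangup(conv->rq, "connection reset by peer"); |
| * // later qbwrite()s throw EPIPE with that errstr; readers see it too |
| */ |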
| |
| /* |
| * return non-zero if the q is hungup |
| */ |
| int qisclosed(struct queue *q) |
| { |
| return q->state & Qclosed; |
| } |
| |
| /* |
| * mark a queue as no longer hung up. resets the wake_cb. |
| */ |
| void qreopen(struct queue *q) |
| { |
| spin_lock_irqsave(&q->lock); |
| q->state &= ~Qclosed; |
| q->eof = 0; |
| q->limit = q->inilim; |
| q->wake_cb = 0; |
| q->wake_data = 0; |
| spin_unlock_irqsave(&q->lock); |
| } |
| |
| /* |
| * return bytes queued |
| */ |
| int qlen(struct queue *q) |
| { |
| return q->dlen; |
| } |
| |
| size_t q_bytes_read(struct queue *q) |
| { |
| return q->bytes_read; |
| } |
| |
| /* |
| * return space remaining before flow control |
| * |
| * This used to be |
| * q->len < q->limit/2 |
| * but it slows down tcp too much for certain write sizes. |
| * I really don't understand it completely. It may be |
| * due to the queue draining so fast that the transmission |
| * stalls waiting for the app to produce more data. - presotto |
| * |
| * q->len was the amount of bytes, which is no longer used. we now use |
| * q->dlen, the amount of usable data. a.k.a. qlen()... - brho |
| */ |
| int qwindow(struct queue *q) |
| { |
| int l; |
| |
| l = q->limit - q->dlen; |
| if (l < 0) |
| l = 0; |
| return l; |
| } |
| |
| /* |
| * return true if we can read without blocking |
| */ |
| int qcanread(struct queue *q) |
| { |
| return q->bfirst != 0; |
| } |
| |
| /* |
| * change queue limit |
| */ |
| void qsetlimit(struct queue *q, size_t limit) |
| { |
| bool was_writable = qwritable(q); |
| |
| q->limit = limit; |
| if (!was_writable && qwritable(q)) { |
| rendez_wakeup(&q->wr); |
| qwake_cb(q, FDTAP_FILT_WRITABLE); |
| } |
| } |
| |
| size_t qgetlimit(struct queue *q) |
| { |
| return q->limit; |
| } |
| |
| /* |
| * set whether writes drop overflowing blocks, or if we sleep |
| */ |
| void qdropoverflow(struct queue *q, bool onoff) |
| { |
| spin_lock_irqsave(&q->lock); |
| if (onoff) |
| q->state |= Qdropoverflow; |
| else |
| q->state &= ~Qdropoverflow; |
| spin_unlock_irqsave(&q->lock); |
| } |
| |
| /* Be careful: this can affect concurrent reads/writes and code that might have |
| * built-in expectations of the q's type. */ |
| void q_toggle_qmsg(struct queue *q, bool onoff) |
| { |
| spin_lock_irqsave(&q->lock); |
| if (onoff) |
| q->state |= Qmsg; |
| else |
| q->state &= ~Qmsg; |
| spin_unlock_irqsave(&q->lock); |
| } |
| |
| /* Be careful: this can affect concurrent reads/writes and code that might have |
| * built-in expectations of the q's type. */ |
| void q_toggle_qcoalesce(struct queue *q, bool onoff) |
| { |
| spin_lock_irqsave(&q->lock); |
| if (onoff) |
| q->state |= Qcoalesce; |
| else |
| q->state &= ~Qcoalesce; |
| spin_unlock_irqsave(&q->lock); |
| } |
| |
| /* |
| * flush the output queue |
| */ |
| void qflush(struct queue *q) |
| { |
| struct block *bfirst; |
| |
| /* mark it */ |
| spin_lock_irqsave(&q->lock); |
| bfirst = q->bfirst; |
| q->bfirst = 0; |
| q->dlen = 0; |
| spin_unlock_irqsave(&q->lock); |
| |
| /* free queued blocks */ |
| freeblist(bfirst); |
| |
| /* wake up writers */ |
| rendez_wakeup(&q->wr); |
| qwake_cb(q, FDTAP_FILT_WRITABLE); |
| } |
| |
| int qfull(struct queue *q) |
| { |
| return !qwritable(q); |
| } |
| |
| int qstate(struct queue *q) |
| { |
| return q->state; |
| } |
| |
| void qdump(struct queue *q) |
| { |
| if (q) |
| printk("q=%p bfirst=%p blast=%p dlen=%d limit=%d state=#%x\n", |
| q, q->bfirst, q->blast, q->dlen, q->limit, q->state); |
| } |
| |
| /* On certain wakeup events, qio will call func(q, data, filter), where filter |
| * marks the type of wakeup event (flags from FDTAP). |
| * |
| * There's no sync protection. If you change the CB while the qio is running, |
| * you might get a CB with the data or func from a previous set_wake_cb. You |
| * should set this once per queue and forget it. |
| * |
| * You can remove the CB by passing in 0 for the func. Alternatively, you can |
| * just make sure that the func(data) pair are valid until the queue is freed or |
| * reopened. */ |
| void qio_set_wake_cb(struct queue *q, qio_wake_cb_t func, void *data) |
| { |
| q->wake_data = data; |
| wmb(); /* if we see func, we'll also see the data for it */ |
| q->wake_cb = func; |
| } |
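| |
| /* Usage sketch (illustrative, not from the original code): wiring a queue's |
| * wakeups into an FD-tap style notifier. 'my_tap_cb' and 'tap' are |
| * hypothetical; 'filter' will be one of the FDTAP_FILT_* values fired |
| * elsewhere in this file: |
| * |
| * static void my_tap_cb(struct queue *q, void *data, int filter) |
| * { |
| * struct my_tap *tap = data; |
| * // post an event for 'filter' to whoever owns 'tap' |
| * } |
| * |
| * qio_set_wake_cb(q, my_tap_cb, tap); |
| */ |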
| |
| /* Helper for detecting whether we'll block on a read at this instant. */ |
| bool qreadable(struct queue *q) |
| { |
| return qlen(q) > 0; |
| } |
| |
| /* Helper for detecting whether we'll block on a write at this instant. */ |
| bool qwritable(struct queue *q) |
| { |
| return !q->limit || qwindow(q) > 0; |
| } |