slab: fix alignment issues
This clarifies many of the issues around alignment and source quantum.
Previously, there were a lot of assumptions about source alignment
(assumed PGSIZE, but it was actually quantum), object size (assumed big
enough for a pointer), etc. If you had an arena with quantum > PGSIZE
and made a slab / KC from it (e.g. a qcache), you'd trip the old
align > PGSIZE panic too.
We also didn't have any guarantees about carrying a source's
quantum-multiple-alignment through to the slab, which matters for
non-power-of-two sources that want to use qcaches. We use the "if
obj_size is a multiple of quantum, you'll get quantum-multiple-aligned
allocations" guarantee to solve the problem for qcaches.
Slab align is a separate item from both arena quantum and arena align.
The object we get from a source gets aligned up (or is already the
right alignment, for the pro-touch/non-bufctl case), which requires us
to track the original address from the arena in the slab (source_obj).
That's fine, and we might as well use that tracking for the pro-touch
case too.
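In code, the bufctl path now does the equivalent of the sketch below.
slab_carve() is a made-up name for illustration; the patch does this
inline when growing a cache:

	/* Carve an aligned run of objects out of a source allocation.
	 * ALIGN() rounds a pointer up to a power-of-two boundary, and
	 * obj_size was already rounded up to a multiple of align. */
	static size_t slab_carve(void *source_obj, size_t import_amt,
	                         size_t obj_size, size_t align,
	                         void **first_obj)
	{
		void *buf = ALIGN(source_obj, align);
		uintptr_t delta = buf - source_obj;

		if (delta >= import_amt)
			return 0;	/* no room for even one object */
		*first_obj = buf;
		/* obj_size is a multiple of align, so every object after
		 * the first one stays aligned too. */
		return (import_amt - delta) / obj_size;
	}

The slab keeps source_obj so the eventual arena_free() passes back the
exact address the arena handed out, not the aligned-up one.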
I considered getting rid of PGSIZE, but its usage in obj->slab lookups
is pretty handy.
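Relatedly, import_amt is now rounded to the source's quantum. A quick
worked example (PGSIZE = 4096; assume NUM_BUF_PER_SLAB is 16 for the
arithmetic): for 45-byte objects pulled from a quantum = 15 source,

	ROUNDUP(PGSIZE, quantum) = ROUNDUP(4096, 15) = 4110
	import_amt = ROUNDUP(16 * 45, 4110) = ROUNDUP(720, 4110) = 4110

so each import is 4110 bytes: a multiple of the source's quantum (which
the source can alloc and free exactly) with room for 91 objects.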
Signed-off-by: Barret Rhoden <brho@cs.berkeley.edu>
diff --git a/kern/include/slab.h b/kern/include/slab.h
index 182b2bc..d625d92 100644
--- a/kern/include/slab.h
+++ b/kern/include/slab.h
@@ -94,6 +94,7 @@
TAILQ_ENTRY(kmem_slab) link;
size_t num_busy_obj;
size_t num_total_obj;
+ void *source_obj;
union {
struct kmem_bufctl_list bufctl_freelist;
void *free_small_obj;
diff --git a/kern/src/arena.c b/kern/src/arena.c
index ff53d92..3dd0084 100644
--- a/kern/src/arena.c
+++ b/kern/src/arena.c
@@ -127,8 +127,14 @@
for (int i = 0; i < nr_qcaches; i++) {
qc_size = (i + 1) * quantum;
snprintf(kc_name, KMC_NAME_SZ, "%s_%d", arena->name, qc_size);
- __kmem_cache_create(&arena->qcaches[i], kc_name, qc_size,
- quantum, KMC_NOTOUCH | KMC_QCACHE, arena,
+ /* Alignment == 1. These QCs will give out quantum-aligned
+ * (actually quantum-multiple) objects, even without an
+ * alignment request. The reason is that the QCs pull their
+ * slabs from us, and we give out quantum-aligned objects (i.e.
+ * the slabs). Those slabs are made up of objects whose sizes
+ * are multiples of quantum. */
+ __kmem_cache_create(&arena->qcaches[i], kc_name, qc_size, 1,
+ KMC_NOTOUCH | KMC_QCACHE, arena,
NULL, NULL, NULL);
}
}
diff --git a/kern/src/slab.c b/kern/src/slab.c
index 55b7776..602cf2d 100644
--- a/kern/src/slab.c
+++ b/kern/src/slab.c
@@ -293,22 +293,74 @@
void (*dtor)(void *, void *), void *priv)
{
assert(kc);
- assert(align);
+ /* Our alignment is independent of our source's quantum. We pull from
+ * our source, which gives us quantum-multiple/aligned chunks, but our
+ * alignment and object size are our own business. Mostly.
+ *
+ * There is one guarantee we must make:
+ * - If aligned-obj_size (ALIGN(obj_size, align)) is a multiple of our
+ * source's quantum, then all objects we return are
+ * quantum-multiple-aligned (addresses are multiples of quantum).
+ *
+ * The main restriction for us is that when we get a slab from our
+ * source, we need to hand out objects at the beginning of the slab
+ * (where we are source quantum-aligned).
+ *
+ * As an example, if our source quantum is 15, and we give out 45-byte
+ * objects, we must give out e.g. [15,60), but not [10,55). This really
+ * only comes up for qcaches for arenas that aren't memory, since all
+ * memory users will be using power-of-two alignments. And typically
+ * the slabs will have their own alignment, e.g. alignof(struct foo),
+ * with a PGSIZE-quantum source.
+ *
+ * Our objects are always aligned to 'align', regardless of our source's
+ * alignment/quantum. Similarly, if our source's quantum is a multiple
+ * of aligned-obj_size, then all objects we return are
+ * obj_size-multiple-aligned. */
+ assert(IS_PWR2(align));
+ /* Every allocation is aligned, and every allocation is the same
+ * size, so we might as well align-up obj_size. */
+ obj_size = ALIGN(obj_size, align);
spinlock_init_irqsave(&kc->cache_lock);
strlcpy(kc->name, name, KMC_NAME_SZ);
- kc->obj_size = ROUNDUP(obj_size, align);
+ /* We might want some sort of per-call site NUMA-awareness in the
+ * future. */
+ source = source ?: kpages_arena;
+ kc->source = source;
+ kc->obj_size = obj_size;
+ kc->align = align;
+ kc->flags = flags;
+ /* No-touch must use bufctls, even for small objects, so that it does
+ * not use the object as memory. Pro-touch objects need enough space
+ * for a pointer to form the linked list of free objects. */
+ if (obj_size < sizeof(void*) || obj_size > SLAB_LARGE_CUTOFF
+ || flags & KMC_NOTOUCH) {
+ kc->flags |= __KMC_USE_BUFCTL;
+ } else {
+ /* pro-touch (non-bufctl) slabs must get a page-aligned slab
+ * from the source. quantum < PGSIZE won't guarantee that.
+ * quantum > PGSIZE is a waste and a programmer error. */
+ if (kc->source->quantum != PGSIZE) {
+ warn("KC %s is 'pro-touch', but source arena %s has non-PGSIZE quantum %d",
+ kc->name, source->name, source->quantum);
+ kc->flags |= __KMC_USE_BUFCTL;
+ }
+ }
+ /* Note that import_amt is only used for bufctls. The alternative puts
+ * the slab at the end of a PGSIZE chunk and fills the page with
+ * objects. We also rely on PGSIZE when finding the slab for a given
+ * buffer.
+ *
+ * Also note that import_amt can be ignored for qcaches too. If the
+ * object is small and pro-touch, we'll still try to get a page from
+ * the source, even if that is very large. Consider a source with
+ * qcache_max = 5, quantum = 1. It's actually fine - we may waste a
+ * little (unused allocations), but we save on not having bufctls. */
if (flags & KMC_QCACHE)
kc->import_amt = ROUNDUPPWR2(3 * source->qcache_max);
else
- kc->import_amt = ROUNDUP(NUM_BUF_PER_SLAB * obj_size, PGSIZE);
- kc->align = align;
- if (align > PGSIZE)
- panic("Cache %s object alignment is actually MIN(PGSIZE, align (%p))",
- name, align);
- kc->flags = flags;
- /* We might want some sort of per-call site NUMA-awareness in the
- * future. */
- kc->source = source ? source : kpages_arena;
+ kc->import_amt = ROUNDUP(NUM_BUF_PER_SLAB * obj_size,
+ ROUNDUP(PGSIZE, source->quantum));
TAILQ_INIT(&kc->full_slab_list);
TAILQ_INIT(&kc->partial_slab_list);
TAILQ_INIT(&kc->empty_slab_list);
@@ -321,13 +373,6 @@
hash_init_hh(&kc->hh);
for (int i = 0; i < kc->hh.nr_hash_lists; i++)
BSD_LIST_INIT(&kc->static_hash[i]);
- /* No touch must use bufctls, even for small objects, so that it does
- * not use the object as memory. Note that if we have an arbitrary
- * source, small objects, and we're 'pro-touch', the small allocation
- * path will assume we're importing from a PGSIZE-aligned source arena.
- */
- if ((obj_size > SLAB_LARGE_CUTOFF) || (flags & KMC_NOTOUCH))
- kc->flags |= __KMC_USE_BUFCTL;
depot_init(&kc->depot);
kmem_trace_ht_init(&kc->trace_ht);
/* We do this last, since this will all into the magazine cache - which
@@ -429,20 +474,16 @@
static void kmem_slab_destroy(struct kmem_cache *cp, struct kmem_slab *a_slab)
{
if (!__use_bufctls(cp)) {
- arena_free(cp->source, ROUNDDOWN(a_slab, PGSIZE), PGSIZE);
+ arena_free(cp->source, a_slab->source_obj, PGSIZE);
} else {
struct kmem_bufctl *i, *temp;
- void *buf_start = (void*)SIZE_MAX;
BSD_LIST_FOREACH_SAFE(i, &a_slab->bufctl_freelist, link, temp) {
- // Track the lowest buffer address, which is the start
- // of the buffer
- buf_start = MIN(buf_start, i->buf_addr);
/* This is a little dangerous, but we can skip removing,
* since we init the freelist when we reuse the slab. */
kmem_cache_free(kmem_bufctl_cache, i);
}
- arena_free(cp->source, buf_start, cp->import_amt);
+ arena_free(cp->source, a_slab->source_obj, cp->import_amt);
kmem_cache_free(kmem_slab_cache, a_slab);
}
}
@@ -768,20 +809,19 @@
if (!__use_bufctls(cp)) {
void *a_page;
- /* Careful, this assumes our source is a PGSIZE-aligned
- * allocator. We could use xalloc to enforce the alignment, but
- * that'll bypass the qcaches, which we don't want. Caller
- * beware. */
a_page = arena_alloc(cp->source, PGSIZE, MEM_ATOMIC);
if (!a_page)
return FALSE;
- // the slab struct is stored at the end of the page
+ /* The slab struct is stored at the end of the page. Keep it
+ * there, so that our first object is page aligned, and thus
+ * aligned to all smaller alignments. If align > PGSIZE, then
+ * obj_size > PGSIZE too, and we'd be using bufctls. */
a_slab = (struct kmem_slab*)(a_page + PGSIZE
- sizeof(struct kmem_slab));
+ a_slab->source_obj = a_page;
a_slab->num_busy_obj = 0;
a_slab->num_total_obj = (PGSIZE - sizeof(struct kmem_slab)) /
cp->obj_size;
- // TODO: consider staggering this IAW section 4.3
a_slab->free_small_obj = a_page;
/* Walk and create the free list, which is circular. Each item
* stores the location of the next one at the beginning of the
@@ -795,17 +835,29 @@
*((uintptr_t**)buf) = NULL;
} else {
void *buf;
+ uintptr_t delta;
a_slab = kmem_cache_alloc(kmem_slab_cache, 0);
if (!a_slab)
return FALSE;
buf = arena_alloc(cp->source, cp->import_amt, MEM_ATOMIC);
- if (!buf) {
- kmem_cache_free(kmem_slab_cache, a_slab);
- return FALSE;
+ if (!buf)
+ goto err_slab;
+ a_slab->source_obj = buf;
+ buf = ALIGN(buf, cp->align);
+ delta = buf - a_slab->source_obj;
+ if (delta >= cp->import_amt) {
+ /* Shouldn't happen - the import_amt should always be
+ * enough for at least two objects, with obj_size >=
+ * align. Maybe if a qcache had an alignment (which
+ * they don't). */
+ warn("Delta %p >= import_amt %p! (buf %p align %p)",
+ delta, cp->import_amt, a_slab->source_obj,
+ cp->align);
+ goto err_source_obj;
}
a_slab->num_busy_obj = 0;
- a_slab->num_total_obj = cp->import_amt / cp->obj_size;
+ a_slab->num_total_obj = (cp->import_amt - delta) / cp->obj_size;
BSD_LIST_INIT(&a_slab->bufctl_freelist);
/* for each buffer, set up a bufctl and point to the buffer */
for (int i = 0; i < a_slab->num_total_obj; i++) {
@@ -821,6 +873,12 @@
TAILQ_INSERT_HEAD(&cp->empty_slab_list, a_slab, link);
return TRUE;
+
+err_source_obj:
+ arena_free(cp->source, a_slab->source_obj, cp->import_amt);
+err_slab:
+ kmem_cache_free(kmem_slab_cache, a_slab);
+ return FALSE;
}
/* This deallocs every slab from the empty list. TODO: think a bit more about