slab: fix alignment issues

This clarifies many of the issues around alignment and source quantum.

Previously, there were a lot of assumptions about source alignment
(assumed PGSIZE, but it was actually quantum), object size (assumed big
enough for a pointer), etc.  If you had an arena with quantum > PGSIZE
and made a slab / KC from it (e.g. a qcache), you'd trip the assertion
too.

We also didn't have any guarantees about carrying a source's
quantum-multiple-alignment through to the slab, which matters for
non-power-of-two sources that want to use qcaches.  We use the "if
obj_size is a multiple of quantum, you'll get quantum-multiple-aligned
allocations" guarantee to solve the problem for qcaches.
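
As a rough sketch of what that guarantee buys us (hypothetical,
non-memory arena "pci_space" with quantum == 15 and qcache_max >= 45;
arena creation elided):

	/* The qcaches are built with align == 1 and obj_size == 15, 30, and
	 * 45 (multiples of quantum), so their slabs start on quantum-aligned
	 * source objects and are carved into quantum-multiple-sized pieces. */
	void *obj = arena_alloc(pci_space, 45, MEM_ATOMIC);

	/* Anything we hand back is a multiple of the arena's quantum. */
	assert((uintptr_t)obj % 15 == 0);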

Slab align is a separate item from both arena quantum and arena align.
The object we get from a source gets aligned up (or is already at the
right alignment, in the pro-touch/non-bufctl case), which requires us to
track the original address from the arena in the slab.  That's fine.  We
might as well use that tracked address for the pro-touch case too.
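
A quick sketch of that bookkeeping, with made-up numbers (source quantum
15, slab align 8, import_amt 64, obj_size 24):

	uintptr_t source_obj = 75;		/* from the arena; multiple of 15 */
	uintptr_t first_obj = ALIGN(source_obj, 8);	/* objects start at 80 */
	uintptr_t delta = first_obj - source_obj;	/* 5 bytes skipped */
	size_t nr_obj = (64 - delta) / 24;	/* 2 objects fit; the tail is unused */

	/* source_obj, not first_obj, is what we pass back to arena_free(). */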

I considered getting rid of PGSIZE, but its usage in obj->slab lookups
is pretty handy.
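
For the record, the pro-touch obj->slab lookup leans on that: the slab
header sits at the end of the PGSIZE chunk holding the object, so the
lookup is roughly this (a sketch, with 'buf' standing in for the object
address):

	struct kmem_slab *a_slab;

	a_slab = (struct kmem_slab*)(ROUNDDOWN(buf, PGSIZE) + PGSIZE
	                             - sizeof(struct kmem_slab));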

Signed-off-by: Barret Rhoden <brho@cs.berkeley.edu>
diff --git a/kern/include/slab.h b/kern/include/slab.h
index 182b2bc..d625d92 100644
--- a/kern/include/slab.h
+++ b/kern/include/slab.h
@@ -94,6 +94,7 @@
 	TAILQ_ENTRY(kmem_slab) link;
 	size_t num_busy_obj;
 	size_t num_total_obj;
+	void *source_obj;
 	union {
 		struct kmem_bufctl_list bufctl_freelist;
 		void *free_small_obj;
diff --git a/kern/src/arena.c b/kern/src/arena.c
index ff53d92..3dd0084 100644
--- a/kern/src/arena.c
+++ b/kern/src/arena.c
@@ -127,8 +127,14 @@
 	for (int i = 0; i < nr_qcaches; i++) {
 		qc_size = (i + 1) * quantum;
 		snprintf(kc_name, KMC_NAME_SZ, "%s_%d", arena->name, qc_size);
-		__kmem_cache_create(&arena->qcaches[i], kc_name, qc_size,
-				    quantum, KMC_NOTOUCH | KMC_QCACHE, arena,
+		/* Alignment == 1.  These QCs will give out quantum-aligned
+		 * (actually multiples) objects, even without an alignment
+		 * request.  The reason is that the QCs pull their slabs from
+		 * us, and we give out quantum-aligned objects (i.e. the slabs).
+		 * Those slabs are made up of objects that are multiples
+		 * of quantum. */
+		__kmem_cache_create(&arena->qcaches[i], kc_name, qc_size, 1,
+				    KMC_NOTOUCH | KMC_QCACHE, arena,
 				    NULL, NULL, NULL);
 	}
 }
diff --git a/kern/src/slab.c b/kern/src/slab.c
index 55b7776..602cf2d 100644
--- a/kern/src/slab.c
+++ b/kern/src/slab.c
@@ -293,22 +293,74 @@
                          void (*dtor)(void *, void *), void *priv)
 {
 	assert(kc);
-	assert(align);
+	/* Our alignment is independent of our source's quantum.  We pull from
+	 * our source, which gives us quantum-multiple/aligned chunks, but our
+	 * alignment and object size are our own business.  Mostly.
+	 *
+	 * There is one guarantee we must make:
+	 * - If aligned-obj_size (ALIGN(obj_size, align)) is a multiple of our
+	 *   source's quantum, then all objects we return are
+	 *   quantum-multiple-aligned (addresses are multiples of quantum).
+	 *
+	 * The main restriction for us is that when we get a slab from our
+	 * source, we need to hand out objects at the beginning of the slab
+	 * (where we are source quantum-aligned).
+	 *
+	 * As an example, if our source quantum is 15, and we give out 45-byte
+	 * objects, we must give out e.g. [15,60), but not [10,55).  This really
+	 * only comes up for qcaches for arenas that aren't memory, since all
+	 * memory users will be going with power-of-two alignment.  And
+	 * typically the slabs will have their own alignment, e.g.
+	 * alignof(struct foo), with a PGSIZE-quantum source.
+	 *
+	 * Our objects are always aligned to 'align', regardless of our source's
+	 * alignment/quantum.  Similarly, if our source's quantum is a multiple
+	 * of aligned-obj_size, then all objects we return are
+	 * obj_size-multiple-aligned. */
+	assert(IS_PWR2(align));
+	/* Every allocation is aligned, and every allocation is the same
+	 * size, so we might as well align-up obj_size. */
+	obj_size = ALIGN(obj_size, align);
 	spinlock_init_irqsave(&kc->cache_lock);
 	strlcpy(kc->name, name, KMC_NAME_SZ);
-	kc->obj_size = ROUNDUP(obj_size, align);
+	/* We might want some sort of per-call site NUMA-awareness in the
+	 * future. */
+	source = source ?: kpages_arena;
+	kc->source = source;
+	kc->obj_size = obj_size;
+	kc->align = align;
+	kc->flags = flags;
+	/* No touch must use bufctls, even for small objects, so that it does
+	 * not use the object as memory.  RAM objects need enough space for a
+	 * pointer to form the linked list of objects. */
+	if (obj_size < sizeof(void*) || obj_size > SLAB_LARGE_CUTOFF
+	    || flags & KMC_NOTOUCH) {
+		kc->flags |= __KMC_USE_BUFCTL;
+	} else {
+		/* pro-touch (non-bufctl) slabs must get a page-aligned slab
+		 * from the source.  quantum < PGSIZE won't guarantee that.
+		 * quantum > PGSIZE is a waste and a programmer error. */
+		if (kc->source->quantum != PGSIZE) {
+			warn("KC %s is 'pro-touch', but source arena %s has non-PGSIZE quantum %d",
+			     kc->name, source->name, source->quantum);
+			kc->flags |= __KMC_USE_BUFCTL;
+		}
+	}
+	/* Note that import_amt is only used for bufctls.  The alternative puts
+	 * the slab at the end of a PGSIZE chunk, and fills the page with
+	 * objects.  The reliance on PGSIZE is used when finding a slab for a
+	 * given buffer.
+	 *
+	 * Also note that import_amt can be ignored for qcaches too.  If the
+	 * object is small and pro-touch, we'll still try to get a page from
+	 * the source, even if that is very large.  Consider a source with
+	 * qcache_max = 5, quantum = 1.  It's actually fine - we may waste a
+	 * little (unused allocations), but we save on not having bufctls. */
 	if (flags & KMC_QCACHE)
 		kc->import_amt = ROUNDUPPWR2(3 * source->qcache_max);
 	else
-		kc->import_amt = ROUNDUP(NUM_BUF_PER_SLAB * obj_size, PGSIZE);
-	kc->align = align;
-	if (align > PGSIZE)
-		panic("Cache %s object alignment is actually MIN(PGSIZE, align (%p))",
-		      name, align);
-	kc->flags = flags;
-	/* We might want some sort of per-call site NUMA-awareness in the
-	 * future. */
-	kc->source = source ? source : kpages_arena;
+		kc->import_amt = ROUNDUP(NUM_BUF_PER_SLAB * obj_size,
+					 ROUNDUP(PGSIZE, source->quantum));
 	TAILQ_INIT(&kc->full_slab_list);
 	TAILQ_INIT(&kc->partial_slab_list);
 	TAILQ_INIT(&kc->empty_slab_list);
@@ -321,13 +373,6 @@
 	hash_init_hh(&kc->hh);
 	for (int i = 0; i < kc->hh.nr_hash_lists; i++)
 		BSD_LIST_INIT(&kc->static_hash[i]);
-	/* No touch must use bufctls, even for small objects, so that it does
-	 * not use the object as memory.  Note that if we have an arbitrary
-	 * source, small objects, and we're 'pro-touch', the small allocation
-	 * path will assume we're importing from a PGSIZE-aligned source arena.
-	 */
-	if ((obj_size > SLAB_LARGE_CUTOFF) || (flags & KMC_NOTOUCH))
-		kc->flags |= __KMC_USE_BUFCTL;
 	depot_init(&kc->depot);
 	kmem_trace_ht_init(&kc->trace_ht);
 	/* We do this last, since this will call into the magazine cache - which
@@ -429,20 +474,16 @@
 static void kmem_slab_destroy(struct kmem_cache *cp, struct kmem_slab *a_slab)
 {
 	if (!__use_bufctls(cp)) {
-		arena_free(cp->source, ROUNDDOWN(a_slab, PGSIZE), PGSIZE);
+		arena_free(cp->source, a_slab->source_obj, PGSIZE);
 	} else {
 		struct kmem_bufctl *i, *temp;
-		void *buf_start = (void*)SIZE_MAX;
 
 		BSD_LIST_FOREACH_SAFE(i, &a_slab->bufctl_freelist, link, temp) {
-			// Track the lowest buffer address, which is the start
-			// of the buffer
-			buf_start = MIN(buf_start, i->buf_addr);
 			/* This is a little dangerous, but we can skip removing,
 			 * since we init the freelist when we reuse the slab. */
 			kmem_cache_free(kmem_bufctl_cache, i);
 		}
-		arena_free(cp->source, buf_start, cp->import_amt);
+		arena_free(cp->source, a_slab->source_obj, cp->import_amt);
 		kmem_cache_free(kmem_slab_cache, a_slab);
 	}
 }
@@ -768,20 +809,19 @@
 	if (!__use_bufctls(cp)) {
 		void *a_page;
 
-		/* Careful, this assumes our source is a PGSIZE-aligned
-		 * allocator.  We could use xalloc to enforce the alignment, but
-		 * that'll bypass the qcaches, which we don't want.  Caller
-		 * beware. */
 		a_page = arena_alloc(cp->source, PGSIZE, MEM_ATOMIC);
 		if (!a_page)
 			return FALSE;
-		// the slab struct is stored at the end of the page
+		/* The slab struct is stored at the end of the page.  Keep it
+		 * there, so that our first object is page aligned, and thus
+		 * aligned to all smaller alignments.  If align > PGSIZE, then
+		 * obj_size > PGSIZE too, and we'd be using bufctls. */
 		a_slab = (struct kmem_slab*)(a_page + PGSIZE
 		                             - sizeof(struct kmem_slab));
+		a_slab->source_obj = a_page;
 		a_slab->num_busy_obj = 0;
 		a_slab->num_total_obj = (PGSIZE - sizeof(struct kmem_slab)) /
 		                        cp->obj_size;
-		// TODO: consider staggering this IAW section 4.3
 		a_slab->free_small_obj = a_page;
 		/* Walk and create the free list, which is circular.  Each item
 		 * stores the location of the next one at the beginning of the
@@ -795,17 +835,29 @@
 		*((uintptr_t**)buf) = NULL;
 	} else {
 		void *buf;
+		uintptr_t delta;
 
 		a_slab = kmem_cache_alloc(kmem_slab_cache, 0);
 		if (!a_slab)
 			return FALSE;
 		buf = arena_alloc(cp->source, cp->import_amt, MEM_ATOMIC);
-		if (!buf) {
-			kmem_cache_free(kmem_slab_cache, a_slab);
-			return FALSE;
+		if (!buf)
+			goto err_slab;
+		a_slab->source_obj = buf;
+		buf = ALIGN(buf, cp->align);
+		delta = buf - a_slab->source_obj;
+		if (delta >= cp->import_amt) {
+			/* Shouldn't happen - the import_amt should always be
+			 * enough for at least two objects, with obj_size >=
+			 * align.  Maybe if a qcache had an alignment (which
+			 * they don't). */
+			warn("Delta %p >= import_amt %p! (buf %p align %p)",
+			     delta, cp->import_amt, a_slab->source_obj,
+			     cp->align);
+			goto err_source_obj;
 		}
 		a_slab->num_busy_obj = 0;
-		a_slab->num_total_obj = cp->import_amt / cp->obj_size;
+		a_slab->num_total_obj = (cp->import_amt - delta) / cp->obj_size;
 		BSD_LIST_INIT(&a_slab->bufctl_freelist);
 		/* for each buffer, set up a bufctl and point to the buffer */
 		for (int i = 0; i < a_slab->num_total_obj; i++) {
@@ -821,6 +873,12 @@
 	TAILQ_INSERT_HEAD(&cp->empty_slab_list, a_slab, link);
 
 	return TRUE;
+
+err_source_obj:
+	arena_free(cp->source, a_slab->source_obj, cp->import_amt);
+err_slab:
+	kmem_cache_free(kmem_slab_cache, a_slab);
+	return FALSE;
 }
 
 /* This deallocs every slab from the empty list.  TODO: think a bit more about