/* Copyright (c) 2009 The Regents of the University of California
* See LICENSE for details.
*
* Slab allocator, based on the SunOS 5.4 allocator paper.
*
* Barret Rhoden <brho@cs.berkeley.edu>
* Kevin Klues <klueska@cs.berkeley.edu>
*
* Copyright (c) 2016 Google Inc
*
* Upgraded and extended to support magazines, based on Bonwick and Adams's
* "Magazines and Vmem" paper.
*
* Barret Rhoden <brho@cs.berkeley.edu>
*
* FAQ:
* - What sort of allocator do we need for the kmem_pcpu_caches? In general,
* the base allocator. All slabs/caches depend on the pcpu_caches for any
* allocation, so we need something that does not rely on slabs. We could use
 * generic kpages, if we knew the cache wasn't one of: a qcache for a
 * kpages_arena, the slab kcache, or the bufctl kcache. The same set of
 * restrictions applies to the hash table allocations.
* - Why doesn't the magazine cache deadlock on itself? Because magazines are
* only allocated during the free path of another cache. There are no
* magazine allocations during a cache's allocation.
* - Does the magazine cache need to be statically allocated? Maybe not, but it
* doesn't hurt. We need to set it up at some point. We can use other caches
* for allocations before the mag cache is initialized, but we can't free.
* - Does the magazine cache need to pull from the base arena? Similar to the
* static allocation question - by default, maybe not, but it is safer. And
* yes, due to other design choices. We could initialize it after kpages is
* allocated and use a kpages_arena, but that would require us to not free a
* page before or during kpages_arena_init(). A related note is where the
* first magazines in a pcpu_cache come from. I'm currently going with "raw
* slab alloc from the magazine cache", which means magazines need to work
* when we're setting up the qcache's for kpages_arena. That creates a
* dependency, which means kpages depends on mags, which means mags can only
* depend on base. If we ever use slabs for non-base arena btags, we'll also
* have this dependency between kpages and mags.
* - The paper talks about full and empty magazines. Why does our code talk
* about not_empty and empty? The way we'll do our magazine resizing is to
 * just increment the pcpu_cache's magsize. Then we'll eventually start
* filling the magazines to their new capacity (during frees, btw). During
* this time, a mag that was previously full will technically be not-empty,
* but not full. The correctness of the magazine code is still OK, I think,
* since when they say 'full', they require 'not empty' in most cases. In
 * short, 'not empty' is more accurate, though it makes sense to say 'full'
 * when explaining the basic idea of their paper.
* - Due to a resize, what happens when the depot gives a pcpu cache a magazine
* with *more* rounds than ppc->magsize? The allocation path doesn't care
* about magsize - it just looks at nr_rounds. So that's fine. On the free
* path, we might mistakenly think that a mag has no more room. In that case,
* we'll just hand it to the depot and it'll be a 'not-empty' mag. Eventually
* it'll get filled up, or it just won't matter. 'magsize' is basically an
* instruction to the pcpu_cache: "fill to X, please."
* - Why is nr_rounds tracked in the magazine and not the pcpu cache? The paper
* uses the pcpu cache, but doesn't say whether or not the mag tracks it too.
* We track it in the mag since not all mags have the same size (e.g. during
 * a resize operation). For performance, we could also track it in the
 * pcpu_cache; that might save an occasional cache miss.
* - Why do we just disable IRQs for the pcpu_cache? The paper explicitly talks
* about using locks instead of disabling IRQs, since disabling IRQs can be
 * expensive. First off, we can get away with just disabling IRQs because
 * there is one pcc per core. If we were to use a spinlock, we'd be disabling
 * IRQs anyway, since we do allocations from IRQ context. The other reason to
 * lock is when changing the pcpu state during a magazine resize. There are
 * two ways to do this: racily set pcc->magsize, or have the pccs poll when
 * they check the depot during free. Neither approach requires anyone else to
 * grab a pcc lock.
*
* TODO:
* - Add reclaim function.
* - When resizing, do we want to go through the depot and consolidate
* magazines? (probably not a big deal. maybe we'd deal with it when we
* clean up our excess mags.)
* - Could do some working set tracking. Like max/min over an interval, with
* resetting (in the depot, used for reclaim and maybe aggressive freeing).
* - Debugging info
*/
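/* A minimal usage sketch, for orientation only.  'struct foo', foo_ctor(), and
 * foo_cache are hypothetical; everything else is the real API from this file.
 * Passing NULL as the source arena defaults to kpages_arena.
 *
 *	static struct kmem_cache *foo_cache;
 *
 *	static int foo_ctor(void *obj, void *priv, int flags)
 *	{
 *		memset(obj, 0, sizeof(struct foo));
 *		return 0;
 *	}
 *
 *	foo_cache = kmem_cache_create("foo", sizeof(struct foo),
 *	                              __alignof__(struct foo), 0, NULL,
 *	                              foo_ctor, NULL, NULL);
 *	struct foo *f = kmem_cache_alloc(foo_cache, MEM_WAIT);
 *	...
 *	kmem_cache_free(foo_cache, f);
 *	kmem_cache_destroy(foo_cache);
 */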
#include <slab.h>
#include <stdio.h>
#include <assert.h>
#include <pmap.h>
#include <kmalloc.h>
#include <hash.h>
#include <arena.h>
#include <hashtable.h>
#define SLAB_POISON ((void*)0xdead1111)
/* Tunables. I don't know which numbers to pick yet; maybe we'll tune them at
 * runtime. Note that once a mag size increases, it never decreases. */
uint64_t resize_timeout_ns = 1000000000;
unsigned int resize_threshold = 1;
/* Protected by the arenas_and_slabs_lock. */
struct kmem_cache_tailq all_kmem_caches =
TAILQ_HEAD_INITIALIZER(all_kmem_caches);
static void kmc_track(struct kmem_cache *kc)
{
struct kmem_cache *kc_i;
qlock(&arenas_and_slabs_lock);
TAILQ_FOREACH(kc_i, &all_kmem_caches, all_kmc_link) {
if (!strcmp(kc->name, kc_i->name))
warn("Creatgin KMC %s, but one with that name exists!",
kc->name);
}
TAILQ_INSERT_TAIL(&all_kmem_caches, kc, all_kmc_link);
qunlock(&arenas_and_slabs_lock);
}
static void kmc_untrack(struct kmem_cache *kc)
{
qlock(&arenas_and_slabs_lock);
TAILQ_REMOVE(&all_kmem_caches, kc, all_kmc_link);
qunlock(&arenas_and_slabs_lock);
}
/* Backend/internal functions, defined later. Grab the lock before calling
* these. */
static bool kmem_cache_grow(struct kmem_cache *cp);
static void *__kmem_alloc_from_slab(struct kmem_cache *cp, int flags);
static void __kmem_free_to_slab(struct kmem_cache *cp, void *buf);
/* Forward declarations for trace hooks */
static void kmem_trace_ht_init(struct kmem_trace_ht *ht);
static void kmem_trace_free(struct kmem_cache *kc, void *obj);
static void kmem_trace_alloc(struct kmem_cache *kc, void *obj);
static void kmem_trace_warn_notempty(struct kmem_cache *kc);
/* Cache of the kmem_cache objects, needed for bootstrapping */
struct kmem_cache kmem_cache_cache[1];
struct kmem_cache kmem_slab_cache[1];
struct kmem_cache kmem_bufctl_cache[1];
struct kmem_cache kmem_magazine_cache[1];
struct kmem_cache kmem_trace_cache[1];
static bool __use_bufctls(struct kmem_cache *cp)
{
return cp->flags & __KMC_USE_BUFCTL;
}
/* Using a layer of indirection for the pcpu caches, in case we want to use
* clustered objects, only per-NUMA-domain caches, or something like that. */
unsigned int kmc_nr_pcpu_caches(void)
{
return num_cores;
}
static struct kmem_pcpu_cache *get_my_pcpu_cache(struct kmem_cache *kc)
{
return &kc->pcpu_caches[core_id()];
}
/* In our current model, there is one pcc per core. If we had multiple cores
* that could use the pcc, such as with per-NUMA caches, then we'd need a
* spinlock. Since we do allocations from IRQ context, we still need to disable
* IRQs. */
static void lock_pcu_cache(struct kmem_pcpu_cache *pcc)
{
disable_irqsave(&pcc->irq_state);
}
static void unlock_pcu_cache(struct kmem_pcpu_cache *pcc)
{
enable_irqsave(&pcc->irq_state);
}
static void lock_depot(struct kmem_depot *depot)
{
uint64_t time;
if (spin_trylock_irqsave(&depot->lock))
return;
/* The lock is contended. When we finally get the lock, we'll up the
* contention count and see if we've had too many contentions over time.
*
* The idea is that if there are bursts of contention worse than X
* contended acquisitions in Y nsec, then we'll grow the magazines.
* This might not be that great of an approach - every thread gets one
* count, regardless of how long they take.
*
* We read the time before locking so that we don't artificially grow
* the window too much. Say the lock is heavily contended and we take a
* long time to get it. Perhaps X threads try to lock it immediately,
	 * but it takes over Y nsec for the Xth thread to actually get the
* lock. We might then think the burst wasn't big enough. */
time = nsec();
spin_lock_irqsave(&depot->lock);
/* If there are no not-empty mags, we're probably fighting for the lock
* not because the magazines aren't big enough, but because there aren't
* enough mags in the system yet. */
if (!depot->nr_not_empty)
return;
if (time - depot->busy_start > resize_timeout_ns) {
depot->busy_count = 0;
depot->busy_start = time;
}
depot->busy_count++;
if (depot->busy_count > resize_threshold) {
depot->busy_count = 0;
depot->magsize = MIN(KMC_MAG_MAX_SZ, depot->magsize + 1);
/* That's all we do - the pccs will eventually notice and up
* their magazine sizes. */
}
}
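/* Concretely, with the defaults above (resize_threshold = 1, resize_timeout_ns
 * = 1 sec): if the depot has at least one not-empty magazine, the second
 * contended lock_depot() within a second of the first pushes busy_count over
 * the threshold and grows depot->magsize by one, up to KMC_MAG_MAX_SZ. */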
static void unlock_depot(struct kmem_depot *depot)
{
spin_unlock_irqsave(&depot->lock);
}
static void depot_init(struct kmem_depot *depot)
{
spinlock_init_irqsave(&depot->lock);
SLIST_INIT(&depot->not_empty);
SLIST_INIT(&depot->empty);
depot->magsize = KMC_MAG_MIN_SZ;
depot->nr_not_empty = 0;
depot->nr_empty = 0;
depot->busy_count = 0;
depot->busy_start = 0;
}
static bool mag_is_empty(struct kmem_magazine *mag)
{
return mag->nr_rounds == 0;
}
/* Helper, swaps the loaded and previous mags. Hold the pcc lock. */
static void __swap_mags(struct kmem_pcpu_cache *pcc)
{
struct kmem_magazine *temp;
temp = pcc->prev;
pcc->prev = pcc->loaded;
pcc->loaded = temp;
}
/* Helper, returns a magazine to the depot. Hold the depot lock. */
static void __return_to_depot(struct kmem_cache *kc, struct kmem_magazine *mag)
{
struct kmem_depot *depot = &kc->depot;
if (mag_is_empty(mag)) {
SLIST_INSERT_HEAD(&depot->empty, mag, link);
depot->nr_empty++;
} else {
SLIST_INSERT_HEAD(&depot->not_empty, mag, link);
depot->nr_not_empty++;
}
}
/* Helper, removes the contents of the magazine, giving them back to the slab
* layer. */
static void drain_mag(struct kmem_cache *kc, struct kmem_magazine *mag)
{
for (int i = 0; i < mag->nr_rounds; i++) {
if (kc->dtor)
kc->dtor(mag->rounds[i], kc->priv);
__kmem_free_to_slab(kc, mag->rounds[i]);
}
mag->nr_rounds = 0;
}
static struct kmem_pcpu_cache *build_pcpu_caches(void)
{
struct kmem_pcpu_cache *pcc;
pcc = base_alloc(NULL,
sizeof(struct kmem_pcpu_cache) * kmc_nr_pcpu_caches(),
MEM_WAIT);
for (int i = 0; i < kmc_nr_pcpu_caches(); i++) {
pcc[i].irq_state = 0;
pcc[i].magsize = KMC_MAG_MIN_SZ;
pcc[i].loaded = __kmem_alloc_from_slab(kmem_magazine_cache,
MEM_WAIT);
pcc[i].prev = __kmem_alloc_from_slab(kmem_magazine_cache,
MEM_WAIT);
pcc[i].nr_allocs_ever = 0;
}
return pcc;
}
void __kmem_cache_create(struct kmem_cache *kc, const char *name,
size_t obj_size, int align, int flags,
struct arena *source,
int (*ctor)(void *, void *, int),
void (*dtor)(void *, void *), void *priv)
{
assert(kc);
assert(align);
spinlock_init_irqsave(&kc->cache_lock);
strlcpy(kc->name, name, KMC_NAME_SZ);
kc->obj_size = ROUNDUP(obj_size, align);
if (flags & KMC_QCACHE)
kc->import_amt = ROUNDUPPWR2(3 * source->qcache_max);
else
kc->import_amt = ROUNDUP(NUM_BUF_PER_SLAB * obj_size, PGSIZE);
kc->align = align;
if (align > PGSIZE)
panic("Cache %s object alignment is actually MIN(PGSIZE, align (%p))",
name, align);
kc->flags = flags;
/* We might want some sort of per-call site NUMA-awareness in the
* future. */
kc->source = source ? source : kpages_arena;
TAILQ_INIT(&kc->full_slab_list);
TAILQ_INIT(&kc->partial_slab_list);
TAILQ_INIT(&kc->empty_slab_list);
kc->ctor = ctor;
kc->dtor = dtor;
kc->priv = priv;
kc->nr_cur_alloc = 0;
kc->nr_direct_allocs_ever = 0;
kc->alloc_hash = kc->static_hash;
hash_init_hh(&kc->hh);
for (int i = 0; i < kc->hh.nr_hash_lists; i++)
BSD_LIST_INIT(&kc->static_hash[i]);
	/* No-touch caches must use bufctls, even for small objects, so that we
	 * do not use the object itself as memory. Note that if we have an
	 * arbitrary source, small objects, and we're 'pro-touch', the small
	 * allocation path will assume we're importing from a PGSIZE-aligned
	 * source arena. */
if ((obj_size > SLAB_LARGE_CUTOFF) || (flags & KMC_NOTOUCH))
kc->flags |= __KMC_USE_BUFCTL;
depot_init(&kc->depot);
kmem_trace_ht_init(&kc->trace_ht);
	/* We do this last, since this will call into the magazine cache - which
* we could be creating on this call! */
kc->pcpu_caches = build_pcpu_caches();
add_importing_slab(kc->source, kc);
kmc_track(kc);
}
static int __mag_ctor(void *obj, void *priv, int flags)
{
struct kmem_magazine *mag = (struct kmem_magazine*)obj;
mag->nr_rounds = 0;
return 0;
}
void kmem_cache_init(void)
{
/* magazine must be first - all caches, including mags, will do a slab
* alloc from the mag cache. */
static_assert(sizeof(struct kmem_magazine) <= SLAB_LARGE_CUTOFF);
__kmem_cache_create(kmem_magazine_cache, "kmem_magazine",
sizeof(struct kmem_magazine),
__alignof__(struct kmem_magazine), KMC_NOTRACE,
base_arena, __mag_ctor, NULL, NULL);
__kmem_cache_create(kmem_cache_cache, "kmem_cache",
sizeof(struct kmem_cache),
__alignof__(struct kmem_cache), 0, base_arena,
NULL, NULL, NULL);
__kmem_cache_create(kmem_slab_cache, "kmem_slab",
sizeof(struct kmem_slab),
__alignof__(struct kmem_slab), KMC_NOTRACE,
base_arena, NULL, NULL, NULL);
__kmem_cache_create(kmem_bufctl_cache, "kmem_bufctl",
sizeof(struct kmem_bufctl),
__alignof__(struct kmem_bufctl), KMC_NOTRACE,
base_arena, NULL, NULL, NULL);
__kmem_cache_create(kmem_trace_cache, "kmem_trace",
sizeof(struct kmem_trace),
__alignof__(struct kmem_trace), KMC_NOTRACE,
base_arena, NULL, NULL, NULL);
}
/* Cache management */
struct kmem_cache *kmem_cache_create(const char *name, size_t obj_size,
int align, int flags,
struct arena *source,
int (*ctor)(void *, void *, int),
void (*dtor)(void *, void *),
void *priv)
{
struct kmem_cache *kc = kmem_cache_alloc(kmem_cache_cache, 0);
__kmem_cache_create(kc, name, obj_size, align, flags, source, ctor,
dtor, priv);
return kc;
}
/* Helper during destruction. No one should be touching the allocator anymore.
* We just need to hand objects back to the depot, which will hand them to the
* slab. Locking is just a formality here. */
static void drain_pcpu_caches(struct kmem_cache *kc)
{
struct kmem_pcpu_cache *pcc;
for (int i = 0; i < kmc_nr_pcpu_caches(); i++) {
pcc = &kc->pcpu_caches[i];
lock_pcu_cache(pcc);
lock_depot(&kc->depot);
__return_to_depot(kc, pcc->loaded);
__return_to_depot(kc, pcc->prev);
unlock_depot(&kc->depot);
pcc->loaded = SLAB_POISON;
pcc->prev = SLAB_POISON;
unlock_pcu_cache(pcc);
}
}
static void depot_destroy(struct kmem_cache *kc)
{
struct kmem_magazine *mag_i;
struct kmem_depot *depot = &kc->depot;
lock_depot(depot);
while ((mag_i = SLIST_FIRST(&depot->not_empty))) {
drain_mag(kc, mag_i);
kmem_cache_free(kmem_magazine_cache, mag_i);
}
while ((mag_i = SLIST_FIRST(&depot->empty)))
kmem_cache_free(kmem_magazine_cache, mag_i);
unlock_depot(depot);
}
static void kmem_slab_destroy(struct kmem_cache *cp, struct kmem_slab *a_slab)
{
if (!__use_bufctls(cp)) {
arena_free(cp->source, ROUNDDOWN(a_slab, PGSIZE), PGSIZE);
} else {
struct kmem_bufctl *i, *temp;
void *buf_start = (void*)SIZE_MAX;
BSD_LIST_FOREACH_SAFE(i, &a_slab->bufctl_freelist, link, temp) {
// Track the lowest buffer address, which is the start
// of the buffer
buf_start = MIN(buf_start, i->buf_addr);
/* This is a little dangerous, but we can skip removing,
* since we init the freelist when we reuse the slab. */
kmem_cache_free(kmem_bufctl_cache, i);
}
arena_free(cp->source, buf_start, cp->import_amt);
kmem_cache_free(kmem_slab_cache, a_slab);
}
}
/* Once you call destroy, never use this cache again... o/w there may be weird
* races, and other serious issues. */
void kmem_cache_destroy(struct kmem_cache *cp)
{
struct kmem_slab *a_slab, *next;
kmc_untrack(cp);
del_importing_slab(cp->source, cp);
drain_pcpu_caches(cp);
depot_destroy(cp);
spin_lock_irqsave(&cp->cache_lock);
assert(TAILQ_EMPTY(&cp->full_slab_list));
assert(TAILQ_EMPTY(&cp->partial_slab_list));
/* Clean out the empty list. We can't use a regular FOREACH here, since
* the link element is stored in the slab struct, which is stored on the
* page that we are freeing. */
a_slab = TAILQ_FIRST(&cp->empty_slab_list);
while (a_slab) {
next = TAILQ_NEXT(a_slab, link);
kmem_slab_destroy(cp, a_slab);
a_slab = next;
}
spin_unlock_irqsave(&cp->cache_lock);
kmem_trace_warn_notempty(cp);
kmem_cache_free(kmem_cache_cache, cp);
}
static void __try_hash_resize(struct kmem_cache *cp)
{
struct kmem_bufctl_list *new_tbl, *old_tbl;
struct kmem_bufctl *bc_i;
unsigned int new_tbl_nr_lists, old_tbl_nr_lists;
size_t new_tbl_sz, old_tbl_sz;
size_t hash_idx;
if (!hash_needs_more(&cp->hh))
return;
new_tbl_nr_lists = hash_next_nr_lists(&cp->hh);
new_tbl_sz = new_tbl_nr_lists * sizeof(struct kmem_bufctl_list);
/* TODO: we only need to pull from base if our arena is a base or we are
* inside a kpages arena (keep in mind there could be more than one of
* those, depending on how we do NUMA allocs). This might help with
* fragmentation. To know this, we'll need the caller to pass us a
* flag. */
new_tbl = base_zalloc(NULL, new_tbl_sz, ARENA_INSTANTFIT | MEM_ATOMIC);
if (!new_tbl)
return;
old_tbl = cp->alloc_hash;
old_tbl_nr_lists = cp->hh.nr_hash_lists;
old_tbl_sz = old_tbl_nr_lists * sizeof(struct kmem_bufctl_list);
cp->alloc_hash = new_tbl;
hash_incr_nr_lists(&cp->hh);
for (int i = 0; i < old_tbl_nr_lists; i++) {
while ((bc_i = BSD_LIST_FIRST(&old_tbl[i]))) {
BSD_LIST_REMOVE(bc_i, link);
hash_idx = hash_ptr(bc_i->buf_addr,
cp->hh.nr_hash_bits);
BSD_LIST_INSERT_HEAD(&cp->alloc_hash[hash_idx], bc_i,
link);
}
}
hash_reset_load_limit(&cp->hh);
if (old_tbl != cp->static_hash)
base_free(NULL, old_tbl, old_tbl_sz);
}
/* Helper, tracks the allocation of @bc in the hash table */
static void __track_alloc(struct kmem_cache *cp, struct kmem_bufctl *bc)
{
size_t hash_idx;
hash_idx = hash_ptr(bc->buf_addr, cp->hh.nr_hash_bits);
BSD_LIST_INSERT_HEAD(&cp->alloc_hash[hash_idx], bc, link);
cp->hh.nr_items++;
__try_hash_resize(cp);
}
/* Helper, looks up and removes the bufctl corresponding to buf. */
static struct kmem_bufctl *__yank_bufctl(struct kmem_cache *cp, void *buf)
{
struct kmem_bufctl *bc_i;
size_t hash_idx;
hash_idx = hash_ptr(buf, cp->hh.nr_hash_bits);
BSD_LIST_FOREACH(bc_i, &cp->alloc_hash[hash_idx], link) {
if (bc_i->buf_addr == buf) {
BSD_LIST_REMOVE(bc_i, link);
break;
}
}
if (!bc_i)
panic("Could not find buf %p in cache %s!", buf, cp->name);
return bc_i;
}
/* Alloc, bypassing the magazines and depot */
static void *__kmem_alloc_from_slab(struct kmem_cache *cp, int flags)
{
void *retval = NULL;
spin_lock_irqsave(&cp->cache_lock);
// look at partial list
struct kmem_slab *a_slab = TAILQ_FIRST(&cp->partial_slab_list);
// if none, go to empty list and get an empty and make it partial
if (!a_slab) {
// TODO: think about non-sleeping flags
if (TAILQ_EMPTY(&cp->empty_slab_list) &&
!kmem_cache_grow(cp)) {
spin_unlock_irqsave(&cp->cache_lock);
if (flags & MEM_ERROR)
error(ENOMEM, ERROR_FIXME);
else
panic("[German Accent]: OOM for a small slab growth!!!");
}
// move to partial list
a_slab = TAILQ_FIRST(&cp->empty_slab_list);
TAILQ_REMOVE(&cp->empty_slab_list, a_slab, link);
TAILQ_INSERT_HEAD(&cp->partial_slab_list, a_slab, link);
}
// have a partial now (a_slab), get an item, return item
if (!__use_bufctls(cp)) {
retval = a_slab->free_small_obj;
/* the next free_small_obj address is stored at the beginning of
* the current free_small_obj. */
a_slab->free_small_obj = *(uintptr_t**)(a_slab->free_small_obj);
} else {
// rip the first bufctl out of the partial slab's buf list
struct kmem_bufctl *a_bufctl =
BSD_LIST_FIRST(&a_slab->bufctl_freelist);
BSD_LIST_REMOVE(a_bufctl, link);
__track_alloc(cp, a_bufctl);
retval = a_bufctl->buf_addr;
}
a_slab->num_busy_obj++;
// Check if we are full, if so, move to the full list
if (a_slab->num_busy_obj == a_slab->num_total_obj) {
TAILQ_REMOVE(&cp->partial_slab_list, a_slab, link);
TAILQ_INSERT_HEAD(&cp->full_slab_list, a_slab, link);
}
cp->nr_cur_alloc++;
cp->nr_direct_allocs_ever++;
spin_unlock_irqsave(&cp->cache_lock);
if (cp->ctor) {
if (cp->ctor(retval, cp->priv, flags)) {
warn("Ctor %p failed, probably a bug!");
__kmem_free_to_slab(cp, retval);
return NULL;
}
}
return retval;
}
void *kmem_cache_alloc(struct kmem_cache *kc, int flags)
{
struct kmem_pcpu_cache *pcc = get_my_pcpu_cache(kc);
struct kmem_depot *depot = &kc->depot;
struct kmem_magazine *mag;
void *ret;
lock_pcu_cache(pcc);
try_alloc:
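	/* Fast path: pop the most recently freed round off the loaded
	 * magazine. */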
if (pcc->loaded->nr_rounds) {
ret = pcc->loaded->rounds[pcc->loaded->nr_rounds - 1];
pcc->loaded->nr_rounds--;
pcc->nr_allocs_ever++;
unlock_pcu_cache(pcc);
kmem_trace_alloc(kc, ret);
return ret;
}
if (!mag_is_empty(pcc->prev)) {
__swap_mags(pcc);
goto try_alloc;
}
/* Note the lock ordering: pcc -> depot */
lock_depot(depot);
mag = SLIST_FIRST(&depot->not_empty);
if (mag) {
SLIST_REMOVE_HEAD(&depot->not_empty, link);
depot->nr_not_empty--;
__return_to_depot(kc, pcc->prev);
unlock_depot(depot);
pcc->prev = pcc->loaded;
pcc->loaded = mag;
goto try_alloc;
}
unlock_depot(depot);
unlock_pcu_cache(pcc);
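	/* The depot had no not-empty magazines, so bypass the magazine layer
	 * and allocate directly from the slab layer. */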
ret = __kmem_alloc_from_slab(kc, flags);
kmem_trace_alloc(kc, ret);
return ret;
}
/* Returns an object to the slab layer. The caller must have already run the
 * destructor, since objects stored in the slab layer are unconstructed. */
static void __kmem_free_to_slab(struct kmem_cache *cp, void *buf)
{
struct kmem_slab *a_slab;
struct kmem_bufctl *a_bufctl;
spin_lock_irqsave(&cp->cache_lock);
if (!__use_bufctls(cp)) {
// find its slab
a_slab = (struct kmem_slab*)(ROUNDDOWN((uintptr_t)buf, PGSIZE) +
PGSIZE - sizeof(struct kmem_slab));
/* write location of next free small obj to the space at the
* beginning of the buffer, then list buf as the next free small
* obj */
*(uintptr_t**)buf = a_slab->free_small_obj;
a_slab->free_small_obj = buf;
} else {
/* Give the bufctl back to the parent slab */
a_bufctl = __yank_bufctl(cp, buf);
a_slab = a_bufctl->my_slab;
BSD_LIST_INSERT_HEAD(&a_slab->bufctl_freelist, a_bufctl, link);
}
a_slab->num_busy_obj--;
cp->nr_cur_alloc--;
// if it was full, move it to partial
if (a_slab->num_busy_obj + 1 == a_slab->num_total_obj) {
TAILQ_REMOVE(&cp->full_slab_list, a_slab, link);
TAILQ_INSERT_HEAD(&cp->partial_slab_list, a_slab, link);
} else if (!a_slab->num_busy_obj) {
		// if there are none in use, move from partial to empty
TAILQ_REMOVE(&cp->partial_slab_list, a_slab, link);
TAILQ_INSERT_HEAD(&cp->empty_slab_list, a_slab, link);
}
spin_unlock_irqsave(&cp->cache_lock);
}
void kmem_cache_free(struct kmem_cache *kc, void *buf)
{
struct kmem_pcpu_cache *pcc = get_my_pcpu_cache(kc);
struct kmem_depot *depot = &kc->depot;
struct kmem_magazine *mag;
assert(buf); /* catch bugs */
kmem_trace_free(kc, buf);
lock_pcu_cache(pcc);
try_free:
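	/* Fast path: stash the object as a round in the loaded magazine, if it
	 * has room. */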
if (pcc->loaded->nr_rounds < pcc->magsize) {
pcc->loaded->rounds[pcc->loaded->nr_rounds] = buf;
pcc->loaded->nr_rounds++;
unlock_pcu_cache(pcc);
return;
}
/* The paper checks 'is empty' here. But we actually just care if it
* has room left, not that prev is completely empty. This could be the
* case due to magazine resize. */
if (pcc->prev->nr_rounds < pcc->magsize) {
__swap_mags(pcc);
goto try_free;
}
lock_depot(depot);
	/* Here's where the resize magic happens: pick up the depot's current
	 * magsize, which we'll use for the next magazine. */
pcc->magsize = depot->magsize;
mag = SLIST_FIRST(&depot->empty);
if (mag) {
SLIST_REMOVE_HEAD(&depot->empty, link);
depot->nr_empty--;
__return_to_depot(kc, pcc->prev);
unlock_depot(depot);
pcc->prev = pcc->loaded;
pcc->loaded = mag;
goto try_free;
}
unlock_depot(depot);
/* Need to unlock, in case we end up calling back into ourselves. */
unlock_pcu_cache(pcc);
	/* Don't want to wait on a free. If this fails, we can still just give
	 * it to the slab layer. */
mag = kmem_cache_alloc(kmem_magazine_cache, MEM_ATOMIC);
if (mag) {
assert(mag->nr_rounds == 0);
lock_depot(depot);
SLIST_INSERT_HEAD(&depot->empty, mag, link);
depot->nr_empty++;
unlock_depot(depot);
lock_pcu_cache(pcc);
goto try_free;
}
if (kc->dtor)
kc->dtor(buf, kc->priv);
__kmem_free_to_slab(kc, buf);
}
/* Back end: internal functions */
/* When this returns, the cache has at least one slab in the empty list. If
 * the arena allocation fails, there are some serious issues. This only grows
 * by one slab at a time.
*
* Grab the cache lock before calling this.
*
* TODO: think about page colouring issues with kernel memory allocation. */
static bool kmem_cache_grow(struct kmem_cache *cp)
{
struct kmem_slab *a_slab;
struct kmem_bufctl *a_bufctl;
if (!__use_bufctls(cp)) {
void *a_page;
/* Careful, this assumes our source is a PGSIZE-aligned
* allocator. We could use xalloc to enforce the alignment, but
* that'll bypass the qcaches, which we don't want. Caller
* beware. */
a_page = arena_alloc(cp->source, PGSIZE, MEM_ATOMIC);
if (!a_page)
return FALSE;
// the slab struct is stored at the end of the page
a_slab = (struct kmem_slab*)(a_page + PGSIZE
- sizeof(struct kmem_slab));
a_slab->num_busy_obj = 0;
a_slab->num_total_obj = (PGSIZE - sizeof(struct kmem_slab)) /
cp->obj_size;
// TODO: consider staggering this IAW section 4.3
a_slab->free_small_obj = a_page;
		/* Walk and create the free list. Each item stores the location
		 * of the next one at the beginning of the block; the last item
		 * stores NULL. */
void *buf = a_slab->free_small_obj;
for (int i = 0; i < a_slab->num_total_obj - 1; i++) {
*(uintptr_t**)buf = buf + cp->obj_size;
buf += cp->obj_size;
}
*((uintptr_t**)buf) = NULL;
} else {
void *buf;
a_slab = kmem_cache_alloc(kmem_slab_cache, 0);
if (!a_slab)
return FALSE;
buf = arena_alloc(cp->source, cp->import_amt, MEM_ATOMIC);
if (!buf) {
kmem_cache_free(kmem_slab_cache, a_slab);
return FALSE;
}
a_slab->num_busy_obj = 0;
a_slab->num_total_obj = cp->import_amt / cp->obj_size;
BSD_LIST_INIT(&a_slab->bufctl_freelist);
/* for each buffer, set up a bufctl and point to the buffer */
for (int i = 0; i < a_slab->num_total_obj; i++) {
a_bufctl = kmem_cache_alloc(kmem_bufctl_cache, 0);
BSD_LIST_INSERT_HEAD(&a_slab->bufctl_freelist, a_bufctl,
link);
a_bufctl->buf_addr = buf;
a_bufctl->my_slab = a_slab;
buf += cp->obj_size;
}
}
// add a_slab to the empty_list
TAILQ_INSERT_HEAD(&cp->empty_slab_list, a_slab, link);
return TRUE;
}
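/* To illustrate the small-object free list built above (the object size and
 * page address here are hypothetical): with 64-byte objects in a page starting
 * at address P, right after growth we have
 *
 *	free_small_obj          == P
 *	*(uintptr_t **)P        == P + 64
 *	*(uintptr_t **)(P + 64) == P + 128
 *	...
 *	*(uintptr_t **)last     == NULL
 *
 * __kmem_alloc_from_slab() pops the head and follows the stored pointer;
 * __kmem_free_to_slab() pushes the object back on as the new head, storing the
 * old head in the object's first word. */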
/* This deallocs every slab from the empty list. TODO: think a bit more about
* this. We can do things like not free all of the empty lists to prevent
* thrashing. See 3.4 in the paper. */
void kmem_cache_reap(struct kmem_cache *cp)
{
struct kmem_slab *a_slab, *next;
	// Destroy all empty slabs; see kmem_cache_destroy() for why we can't
	// use a FOREACH here.
spin_lock_irqsave(&cp->cache_lock);
a_slab = TAILQ_FIRST(&cp->empty_slab_list);
while (a_slab) {
next = TAILQ_NEXT(a_slab, link);
kmem_slab_destroy(cp, a_slab);
a_slab = next;
}
spin_unlock_irqsave(&cp->cache_lock);
}
/* Tracing */
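/* Typical flow, using the public hooks below: kmem_trace_start(kc) begins
 * recording a backtrace and context for every allocation from kc (unless the
 * cache was created with KMC_NOTRACE).  After reproducing a suspected leak,
 * kmem_trace_print(kc) returns a sized_alloc listing every outstanding traced
 * object.  kmem_trace_stop(kc) stops recording, and kmem_trace_reset(kc) drops
 * any traces collected so far. */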
static void kmem_trace_ht_foreach(struct kmem_trace_ht *ht,
void (*f)(struct kmem_trace *, void *),
void *arg)
{
struct kmem_trace *tr;
struct hlist_node *temp;
spin_lock_irqsave(&ht->lock);
for (int i = 0; i < ht->hh.nr_hash_lists; i++)
hlist_for_each_entry_safe(tr, temp, &ht->ht[i], hash)
f(tr, arg);
spin_unlock_irqsave(&ht->lock);
}
static void __kmem_trace_print(struct kmem_trace *tr, void *arg)
{
struct sized_alloc *sza = arg;
sza_printf(sza, "Obj %p, from %s:\n----\n", tr->obj, tr->str);
sza_print_backtrace_list(sza, tr->pcs, tr->nr_pcs);
sza_printf(sza, "\n");
}
static void __kmem_trace_bytes_needed(struct kmem_trace *tr, void *arg)
{
size_t *x = arg;
/* Just a guess of how much room we'll need, plus fudge. */
*x += tr->nr_pcs * 80 + 100;
}
struct sized_alloc *kmem_trace_print(struct kmem_cache *kc)
{
struct sized_alloc *sza;
size_t amt = 100;
kmem_trace_ht_foreach(&kc->trace_ht, __kmem_trace_bytes_needed, &amt);
sza = sized_kzmalloc(amt, MEM_WAIT);
sza_printf(sza, "Dumping outstanding allocs from %s\n", kc->name);
sza_printf(sza, "-------------------------\n");
kmem_trace_ht_foreach(&kc->trace_ht, __kmem_trace_print, sza);
return sza;
}
static void __kmem_trace_drop(struct kmem_trace *tr, void *arg)
{
hlist_del(&tr->hash);
kmem_cache_free(kmem_trace_cache, tr);
}
void kmem_trace_reset(struct kmem_cache *kc)
{
kmem_trace_ht_foreach(&kc->trace_ht, __kmem_trace_drop, NULL);
}
/* It's a bug to ever have a left-over trace when we delete a KMC, and probably
* will never happen. If it does, we can expand the debugging info. */
static void __kmem_trace_warn_and_drop(struct kmem_trace *tr, void *arg)
{
warn("KMC had an object! (%p)", tr->obj);
__kmem_trace_drop(tr, NULL);
}
static void kmem_trace_warn_notempty(struct kmem_cache *kc)
{
kmem_trace_ht_foreach(&kc->trace_ht, __kmem_trace_warn_and_drop, NULL);
}
int kmem_trace_start(struct kmem_cache *kc)
{
spin_lock_irqsave(&kc->cache_lock);
if (kc->flags & KMC_NOTRACE) {
spin_unlock_irqsave(&kc->cache_lock);
return -1;
}
WRITE_ONCE(kc->flags, kc->flags | __KMC_TRACED | __KMC_EVER_TRACED);
spin_unlock_irqsave(&kc->cache_lock);
return 0;
}
/* Note that the tracers locklessly-peek at the flags, and we may have an
* allocation complete its trace after we stop. You could conceivably stop,
* reset/remove all traces, and then have a trace appear still. */
void kmem_trace_stop(struct kmem_cache *kc)
{
spin_lock_irqsave(&kc->cache_lock);
WRITE_ONCE(kc->flags, kc->flags & ~__KMC_TRACED);
spin_unlock_irqsave(&kc->cache_lock);
}
static void kmem_trace_ht_init(struct kmem_trace_ht *ht)
{
spinlock_init(&ht->lock);
ht->ht = ht->static_ht;
hash_init_hh(&ht->hh);
for (int i = 0; i < ht->hh.nr_hash_lists; i++)
INIT_HLIST_HEAD(&ht->ht[i]);
}
static void kmem_trace_ht_insert(struct kmem_trace_ht *ht,
struct kmem_trace *tr)
{
unsigned long hash_val = __generic_hash(tr->obj);
struct hlist_head *bucket;
spin_lock_irqsave(&ht->lock);
	bucket = &ht->ht[hash_val % ht->hh.nr_hash_lists];
hlist_add_head(&tr->hash, bucket);
spin_unlock_irqsave(&ht->lock);
}
static struct kmem_trace *kmem_trace_ht_remove(struct kmem_trace_ht *ht,
void *obj)
{
struct kmem_trace *tr;
unsigned long hash_val = __generic_hash(obj);
struct hlist_head *bucket;
spin_lock_irqsave(&ht->lock);
	bucket = &ht->ht[hash_val % ht->hh.nr_hash_lists];
hlist_for_each_entry(tr, bucket, hash) {
if (tr->obj == obj) {
hlist_del(&tr->hash);
break;
}
}
spin_unlock_irqsave(&ht->lock);
return tr;
}
static void kmem_trace_free(struct kmem_cache *kc, void *obj)
{
struct kmem_trace *tr;
/* Even if we turn off tracing, we still want to free traces that we may
* have collected earlier. Otherwise, those objects will never get
* removed from the trace, and could lead to confusion if they are
* reallocated and traced again. Still, we don't want to pay the cost
* on every free for untraced KCs. */
if (!(READ_ONCE(kc->flags) & __KMC_EVER_TRACED))
return;
tr = kmem_trace_ht_remove(&kc->trace_ht, obj);
if (tr)
kmem_cache_free(kmem_trace_cache, tr);
}
static void trace_context(struct kmem_trace *tr)
{
if (is_ktask(current_kthread)) {
snprintf(tr->str, sizeof(tr->str), "ktask %s",
current_kthread->name);
} else if (current) {
/* When we're handling a page fault, knowing the user PC helps
* determine the source. A backtrace is nice, but harder to do.
* Since we're deep in MM code and holding locks, we can't use
* copy_from_user, which uses the page-fault fixups. If you
* need to get the BT, stash it in the genbuf in
* handle_page_fault(). */
snprintf(tr->str, sizeof(tr->str), "PID %d %s: %s, %p",
current->pid, current->progname,
current_kthread->name,
current_ctx ? get_user_ctx_pc(current_ctx) : 0);
} else {
snprintf(tr->str, sizeof(tr->str), "(none)");
}
tr->str[sizeof(tr->str) - 1] = 0;
}
static void kmem_trace_alloc(struct kmem_cache *kc, void *obj)
{
struct kmem_trace *tr;
if (!(READ_ONCE(kc->flags) & __KMC_TRACED))
return;
tr = kmem_cache_alloc(kmem_trace_cache, MEM_ATOMIC);
if (!tr)
return;
tr->obj = obj;
tr->nr_pcs = backtrace_list(get_caller_pc(), get_caller_fp(), tr->pcs,
ARRAY_SIZE(tr->pcs));
trace_context(tr);
kmem_trace_ht_insert(&kc->trace_ht, tr);
}