| /* Copyright (c) 2009 The Regents of the University of California |
| * See LICENSE for details. |
| * |
| * Slab allocator, based on the SunOS 5.4 allocator paper. |
| * |
| * Barret Rhoden <brho@cs.berkeley.edu> |
| * Kevin Klues <klueska@cs.berkeley.edu> |
| * |
| * Copyright (c) 2016 Google Inc |
| * |
| * Upgraded and extended to support magazines, based on Bonwick and Adams's |
| * "Magazines and Vmem" paper. |
| * |
| * Barret Rhoden <brho@cs.berkeley.edu> |
| * |
| * FAQ: |
| * - What sort of allocator do we need for the kmem_pcpu_caches? In general, |
| * the base allocator. All slabs/caches depend on the pcpu_caches for any |
| * allocation, so we need something that does not rely on slabs. We could use |
| * generic kpages, if we knew that we weren't: qcaches for a kpages_arena, the |
| * slab kcache, or the bufctl kcache. This is the same set of restrictions |
| * for the hash table allocations. |
| * - Why doesn't the magazine cache deadlock on itself? Because magazines are |
| * only allocated during the free path of another cache. There are no |
| * magazine allocations during a cache's allocation. |
| * - Does the magazine cache need to be statically allocated? Maybe not, but it |
| * doesn't hurt. We need to set it up at some point. We can use other caches |
| * for allocations before the mag cache is initialized, but we can't free. |
| * - Does the magazine cache need to pull from the base arena? Similar to the |
| * static allocation question - by default, maybe not, but it is safer. And |
| * yes, due to other design choices. We could initialize it after kpages is |
| * allocated and use a kpages_arena, but that would require us to not free a |
| * page before or during kpages_arena_init(). A related note is where the |
| * first magazines in a pcpu_cache come from. I'm currently going with "raw |
| * slab alloc from the magazine cache", which means magazines need to work |
|  * when we're setting up the qcaches for kpages_arena. That creates a |
| * dependency, which means kpages depends on mags, which means mags can only |
| * depend on base. If we ever use slabs for non-base arena btags, we'll also |
| * have this dependency between kpages and mags. |
| * - The paper talks about full and empty magazines. Why does our code talk |
| * about not_empty and empty? The way we'll do our magazine resizing is to |
|  * just increment the pcpu_cache's magsize. Then we'll eventually start |
| * filling the magazines to their new capacity (during frees, btw). During |
| * this time, a mag that was previously full will technically be not-empty, |
| * but not full. The correctness of the magazine code is still OK, I think, |
| * since when they say 'full', they require 'not empty' in most cases. In |
| * short, 'not empty' is more accurate, though it makes sense to say 'full' |
| * when explaining the basic idea for their paper. |
| * - Due to a resize, what happens when the depot gives a pcpu cache a magazine |
|  *   with *more* rounds than pcc->magsize? The allocation path doesn't care |
| * about magsize - it just looks at nr_rounds. So that's fine. On the free |
| * path, we might mistakenly think that a mag has no more room. In that case, |
| * we'll just hand it to the depot and it'll be a 'not-empty' mag. Eventually |
| * it'll get filled up, or it just won't matter. 'magsize' is basically an |
| * instruction to the pcpu_cache: "fill to X, please." |
| * - Why is nr_rounds tracked in the magazine and not the pcpu cache? The paper |
| * uses the pcpu cache, but doesn't say whether or not the mag tracks it too. |
| * We track it in the mag since not all mags have the same size (e.g. during |
|  *   a resize operation). For performance, we could also keep a copy in the |
|  *   pcpu_cache; that might save an occasional cache miss. |
| * - Why do we just disable IRQs for the pcpu_cache? The paper explicitly talks |
| * about using locks instead of disabling IRQs, since disabling IRQs can be |
|  *   expensive. First off, we can get away with just disabling IRQs because |
|  *   there is a 1:1 mapping of cores to pccs. If we were to use a spinlock, |
|  *   we'd be disabling IRQs anyway, since we do allocations from IRQ |
|  *   context. The other reason to lock is when changing the pcpu state |
|  *   during a magazine resize. I have two ways to do this: racily set |
|  *   pcc->magsize, or have the pccs poll when they check the depot during a |
|  *   free. Neither approach requires anyone else to grab a pcc lock. |
| * |
| * TODO: |
| * - Add reclaim function. |
| * - When resizing, do we want to go through the depot and consolidate |
| * magazines? (probably not a big deal. maybe we'd deal with it when we |
| * clean up our excess mags.) |
| * - Could do some working set tracking. Like max/min over an interval, with |
| * resetting (in the depot, used for reclaim and maybe aggressive freeing). |
| * - Debugging info |
| */ |
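| |
| /* Example usage (a sketch, not code from this file): a client creates a |
|  * cache for a hypothetical 'struct foo', allocates from it, and tears it |
|  * down. The ctor/dtor pair is optional, and a NULL source means |
|  * kpages_arena: |
|  * |
|  *	static int foo_ctor(void *obj, void *priv, int flags) |
|  *	{ |
|  *		memset(obj, 0, sizeof(struct foo)); |
|  *		return 0; |
|  *	} |
|  * |
|  *	struct kmem_cache *foo_kc; |
|  *	struct foo *f; |
|  * |
|  *	foo_kc = kmem_cache_create("foo", sizeof(struct foo), |
|  *				   __alignof__(struct foo), 0, NULL, |
|  *				   foo_ctor, NULL, NULL); |
|  *	f = kmem_cache_alloc(foo_kc, MEM_WAIT); |
|  *	... |
|  *	kmem_cache_free(foo_kc, f); |
|  *	kmem_cache_destroy(foo_kc); |
|  */ |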
| |
| #include <slab.h> |
| #include <stdio.h> |
| #include <assert.h> |
| #include <pmap.h> |
| #include <kmalloc.h> |
| #include <hash.h> |
| #include <arena.h> |
| #include <hashtable.h> |
| |
| #define SLAB_POISON ((void*)0xdead1111) |
| |
| /* Tunables. I don't know which numbers to pick yet. Maybe we play with it at |
| * runtime. Though once a mag increases, it'll never decrease. */ |
| uint64_t resize_timeout_ns = 1000000000; |
| unsigned int resize_threshold = 1; |
| |
| /* Protected by the arenas_and_slabs_lock. */ |
| struct kmem_cache_tailq all_kmem_caches = |
| TAILQ_HEAD_INITIALIZER(all_kmem_caches); |
| |
| static void kmc_track(struct kmem_cache *kc) |
| { |
| struct kmem_cache *kc_i; |
| |
| qlock(&arenas_and_slabs_lock); |
| TAILQ_FOREACH(kc_i, &all_kmem_caches, all_kmc_link) { |
| if (!strcmp(kc->name, kc_i->name)) |
| warn("Creatgin KMC %s, but one with that name exists!", |
| kc->name); |
| } |
| TAILQ_INSERT_TAIL(&all_kmem_caches, kc, all_kmc_link); |
| qunlock(&arenas_and_slabs_lock); |
| } |
| |
| static void kmc_untrack(struct kmem_cache *kc) |
| { |
| qlock(&arenas_and_slabs_lock); |
| TAILQ_REMOVE(&all_kmem_caches, kc, all_kmc_link); |
| qunlock(&arenas_and_slabs_lock); |
| } |
| |
| /* Backend/internal functions, defined later. Grab the lock before calling |
| * these. */ |
| static bool kmem_cache_grow(struct kmem_cache *cp); |
| static void *__kmem_alloc_from_slab(struct kmem_cache *cp, int flags); |
| static void __kmem_free_to_slab(struct kmem_cache *cp, void *buf); |
| |
| /* Forward declarations for trace hooks */ |
| static void kmem_trace_ht_init(struct kmem_trace_ht *ht); |
| static void kmem_trace_free(struct kmem_cache *kc, void *obj); |
| static void kmem_trace_alloc(struct kmem_cache *kc, void *obj); |
| static void kmem_trace_warn_notempty(struct kmem_cache *kc); |
| |
| /* Cache of the kmem_cache objects, needed for bootstrapping */ |
| struct kmem_cache kmem_cache_cache[1]; |
| struct kmem_cache kmem_slab_cache[1]; |
| struct kmem_cache kmem_bufctl_cache[1]; |
| struct kmem_cache kmem_magazine_cache[1]; |
| struct kmem_cache kmem_trace_cache[1]; |
| |
| static bool __use_bufctls(struct kmem_cache *cp) |
| { |
| return cp->flags & __KMC_USE_BUFCTL; |
| } |
| |
| /* Using a layer of indirection for the pcpu caches, in case we want to use |
| * clustered objects, only per-NUMA-domain caches, or something like that. */ |
| unsigned int kmc_nr_pcpu_caches(void) |
| { |
| return num_cores; |
| } |
| |
| static struct kmem_pcpu_cache *get_my_pcpu_cache(struct kmem_cache *kc) |
| { |
| return &kc->pcpu_caches[core_id()]; |
| } |
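| |
| /* For example, a hypothetical per-NUMA-domain flavor would mostly just |
|  * change the two helpers above (nr_numa_domains() and numa_domain_id() are |
|  * made-up names for illustration): |
|  * |
|  *	unsigned int kmc_nr_pcpu_caches(void) |
|  *	{ |
|  *		return nr_numa_domains(); |
|  *	} |
|  * |
|  *	static struct kmem_pcpu_cache *get_my_pcpu_cache(struct kmem_cache *kc) |
|  *	{ |
|  *		return &kc->pcpu_caches[numa_domain_id()]; |
|  *	} |
|  * |
|  * Since multiple cores would then share a pcc, lock_pcu_cache() would also |
|  * need a real spinlock, as the next comment explains. */ |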
| |
| /* In our current model, there is one pcc per core. If we had multiple cores |
| * that could use the pcc, such as with per-NUMA caches, then we'd need a |
| * spinlock. Since we do allocations from IRQ context, we still need to disable |
| * IRQs. */ |
| static void lock_pcu_cache(struct kmem_pcpu_cache *pcc) |
| { |
| disable_irqsave(&pcc->irq_state); |
| } |
| |
| static void unlock_pcu_cache(struct kmem_pcpu_cache *pcc) |
| { |
| enable_irqsave(&pcc->irq_state); |
| } |
| |
| static void lock_depot(struct kmem_depot *depot) |
| { |
| uint64_t time; |
| |
| if (spin_trylock_irqsave(&depot->lock)) |
| return; |
| /* The lock is contended. When we finally get the lock, we'll up the |
| * contention count and see if we've had too many contentions over time. |
| * |
| * The idea is that if there are bursts of contention worse than X |
| * contended acquisitions in Y nsec, then we'll grow the magazines. |
| * This might not be that great of an approach - every thread gets one |
| * count, regardless of how long they take. |
| * |
| * We read the time before locking so that we don't artificially grow |
| * the window too much. Say the lock is heavily contended and we take a |
| * long time to get it. Perhaps X threads try to lock it immediately, |
|  * but it takes over Y nsec for the Xth thread to actually get the |
| * lock. We might then think the burst wasn't big enough. */ |
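| 	/* Worked example with the defaults above (resize_threshold = 1, |
| 	 * resize_timeout_ns = 1 sec): the first contended acquisition in a |
| 	 * fresh window sets busy_count to 1, which doesn't exceed the |
| 	 * threshold. A second contended acquisition within that same window |
| 	 * bumps busy_count to 2 > 1, so we grow magsize by one (capped at |
| 	 * KMC_MAG_MAX_SZ) and reset the count. */ |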
| time = nsec(); |
| spin_lock_irqsave(&depot->lock); |
| /* If there are no not-empty mags, we're probably fighting for the lock |
| * not because the magazines aren't big enough, but because there aren't |
| * enough mags in the system yet. */ |
| if (!depot->nr_not_empty) |
| return; |
| if (time - depot->busy_start > resize_timeout_ns) { |
| depot->busy_count = 0; |
| depot->busy_start = time; |
| } |
| depot->busy_count++; |
| if (depot->busy_count > resize_threshold) { |
| depot->busy_count = 0; |
| depot->magsize = MIN(KMC_MAG_MAX_SZ, depot->magsize + 1); |
| /* That's all we do - the pccs will eventually notice and up |
| * their magazine sizes. */ |
| } |
| } |
| |
| static void unlock_depot(struct kmem_depot *depot) |
| { |
| spin_unlock_irqsave(&depot->lock); |
| } |
| |
| static void depot_init(struct kmem_depot *depot) |
| { |
| spinlock_init_irqsave(&depot->lock); |
| SLIST_INIT(&depot->not_empty); |
| SLIST_INIT(&depot->empty); |
| depot->magsize = KMC_MAG_MIN_SZ; |
| depot->nr_not_empty = 0; |
| depot->nr_empty = 0; |
| depot->busy_count = 0; |
| depot->busy_start = 0; |
| } |
| |
| static bool mag_is_empty(struct kmem_magazine *mag) |
| { |
| return mag->nr_rounds == 0; |
| } |
| |
| /* Helper, swaps the loaded and previous mags. Hold the pcc lock. */ |
| static void __swap_mags(struct kmem_pcpu_cache *pcc) |
| { |
| struct kmem_magazine *temp; |
| |
| temp = pcc->prev; |
| pcc->prev = pcc->loaded; |
| pcc->loaded = temp; |
| } |
| |
| /* Helper, returns a magazine to the depot. Hold the depot lock. */ |
| static void __return_to_depot(struct kmem_cache *kc, struct kmem_magazine *mag) |
| { |
| struct kmem_depot *depot = &kc->depot; |
| |
| if (mag_is_empty(mag)) { |
| SLIST_INSERT_HEAD(&depot->empty, mag, link); |
| depot->nr_empty++; |
| } else { |
| SLIST_INSERT_HEAD(&depot->not_empty, mag, link); |
| depot->nr_not_empty++; |
| } |
| } |
| |
| /* Helper, removes the contents of the magazine, giving them back to the slab |
| * layer. */ |
| static void drain_mag(struct kmem_cache *kc, struct kmem_magazine *mag) |
| { |
| for (int i = 0; i < mag->nr_rounds; i++) { |
| if (kc->dtor) |
| kc->dtor(mag->rounds[i], kc->priv); |
| __kmem_free_to_slab(kc, mag->rounds[i]); |
| } |
| mag->nr_rounds = 0; |
| } |
| |
| static struct kmem_pcpu_cache *build_pcpu_caches(void) |
| { |
| struct kmem_pcpu_cache *pcc; |
| |
| pcc = base_alloc(NULL, |
| sizeof(struct kmem_pcpu_cache) * kmc_nr_pcpu_caches(), |
| MEM_WAIT); |
| for (int i = 0; i < kmc_nr_pcpu_caches(); i++) { |
| pcc[i].irq_state = 0; |
| pcc[i].magsize = KMC_MAG_MIN_SZ; |
| pcc[i].loaded = __kmem_alloc_from_slab(kmem_magazine_cache, |
| MEM_WAIT); |
| pcc[i].prev = __kmem_alloc_from_slab(kmem_magazine_cache, |
| MEM_WAIT); |
| pcc[i].nr_allocs_ever = 0; |
| } |
| return pcc; |
| } |
| |
| void __kmem_cache_create(struct kmem_cache *kc, const char *name, |
| size_t obj_size, int align, int flags, |
| struct arena *source, |
| int (*ctor)(void *, void *, int), |
| void (*dtor)(void *, void *), void *priv) |
| { |
| assert(kc); |
| assert(align); |
| spinlock_init_irqsave(&kc->cache_lock); |
| strlcpy(kc->name, name, KMC_NAME_SZ); |
| kc->obj_size = ROUNDUP(obj_size, align); |
| if (flags & KMC_QCACHE) |
| kc->import_amt = ROUNDUPPWR2(3 * source->qcache_max); |
| else |
| kc->import_amt = ROUNDUP(NUM_BUF_PER_SLAB * obj_size, PGSIZE); |
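| 	/* E.g., with 4096-byte pages and NUM_BUF_PER_SLAB at an assumed 16 |
| 	 * (illustrative value): a cache of 3000-byte objects gets import_amt |
| 	 * = ROUNDUP(16 * 3000, 4096) = 49152, i.e. 12 pages per slab import. |
| 	 */ |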
| kc->align = align; |
| if (align > PGSIZE) |
| panic("Cache %s object alignment is actually MIN(PGSIZE, align (%p))", |
| name, align); |
| kc->flags = flags; |
| /* We might want some sort of per-call site NUMA-awareness in the |
| * future. */ |
| kc->source = source ? source : kpages_arena; |
| TAILQ_INIT(&kc->full_slab_list); |
| TAILQ_INIT(&kc->partial_slab_list); |
| TAILQ_INIT(&kc->empty_slab_list); |
| kc->ctor = ctor; |
| kc->dtor = dtor; |
| kc->priv = priv; |
| kc->nr_cur_alloc = 0; |
| kc->nr_direct_allocs_ever = 0; |
| kc->alloc_hash = kc->static_hash; |
| hash_init_hh(&kc->hh); |
| for (int i = 0; i < kc->hh.nr_hash_lists; i++) |
| BSD_LIST_INIT(&kc->static_hash[i]); |
| /* No touch must use bufctls, even for small objects, so that it does |
| * not use the object as memory. Note that if we have an arbitrary |
| * source, small objects, and we're 'pro-touch', the small allocation |
| * path will assume we're importing from a PGSIZE-aligned source arena. |
| */ |
| if ((obj_size > SLAB_LARGE_CUTOFF) || (flags & KMC_NOTOUCH)) |
| kc->flags |= __KMC_USE_BUFCTL; |
| depot_init(&kc->depot); |
| kmem_trace_ht_init(&kc->trace_ht); |
| 	/* We do this last, since this will call into the magazine cache - |
| 	 * which we could be creating on this call! */ |
| kc->pcpu_caches = build_pcpu_caches(); |
| add_importing_slab(kc->source, kc); |
| kmc_track(kc); |
| } |
| |
| static int __mag_ctor(void *obj, void *priv, int flags) |
| { |
| struct kmem_magazine *mag = (struct kmem_magazine*)obj; |
| |
| mag->nr_rounds = 0; |
| return 0; |
| } |
| |
| void kmem_cache_init(void) |
| { |
| /* magazine must be first - all caches, including mags, will do a slab |
| * alloc from the mag cache. */ |
| static_assert(sizeof(struct kmem_magazine) <= SLAB_LARGE_CUTOFF); |
| __kmem_cache_create(kmem_magazine_cache, "kmem_magazine", |
| sizeof(struct kmem_magazine), |
| __alignof__(struct kmem_magazine), KMC_NOTRACE, |
| base_arena, __mag_ctor, NULL, NULL); |
| __kmem_cache_create(kmem_cache_cache, "kmem_cache", |
| sizeof(struct kmem_cache), |
| __alignof__(struct kmem_cache), 0, base_arena, |
| NULL, NULL, NULL); |
| __kmem_cache_create(kmem_slab_cache, "kmem_slab", |
| sizeof(struct kmem_slab), |
| __alignof__(struct kmem_slab), KMC_NOTRACE, |
| base_arena, NULL, NULL, NULL); |
| __kmem_cache_create(kmem_bufctl_cache, "kmem_bufctl", |
| sizeof(struct kmem_bufctl), |
| __alignof__(struct kmem_bufctl), KMC_NOTRACE, |
| base_arena, NULL, NULL, NULL); |
| __kmem_cache_create(kmem_trace_cache, "kmem_trace", |
| sizeof(struct kmem_trace), |
| __alignof__(struct kmem_trace), KMC_NOTRACE, |
| base_arena, NULL, NULL, NULL); |
| } |
| |
| /* Cache management */ |
| struct kmem_cache *kmem_cache_create(const char *name, size_t obj_size, |
| int align, int flags, |
| struct arena *source, |
| int (*ctor)(void *, void *, int), |
| void (*dtor)(void *, void *), |
| void *priv) |
| { |
| struct kmem_cache *kc = kmem_cache_alloc(kmem_cache_cache, 0); |
| |
| __kmem_cache_create(kc, name, obj_size, align, flags, source, ctor, |
| dtor, priv); |
| return kc; |
| } |
| |
| /* Helper during destruction. No one should be touching the allocator anymore. |
| * We just need to hand objects back to the depot, which will hand them to the |
| * slab. Locking is just a formality here. */ |
| static void drain_pcpu_caches(struct kmem_cache *kc) |
| { |
| struct kmem_pcpu_cache *pcc; |
| |
| for (int i = 0; i < kmc_nr_pcpu_caches(); i++) { |
| pcc = &kc->pcpu_caches[i]; |
| lock_pcu_cache(pcc); |
| lock_depot(&kc->depot); |
| __return_to_depot(kc, pcc->loaded); |
| __return_to_depot(kc, pcc->prev); |
| unlock_depot(&kc->depot); |
| pcc->loaded = SLAB_POISON; |
| pcc->prev = SLAB_POISON; |
| unlock_pcu_cache(pcc); |
| } |
| } |
| |
| static void depot_destroy(struct kmem_cache *kc) |
| { |
| struct kmem_magazine *mag_i; |
| struct kmem_depot *depot = &kc->depot; |
| |
| lock_depot(depot); |
| while ((mag_i = SLIST_FIRST(&depot->not_empty))) { |
| drain_mag(kc, mag_i); |
| kmem_cache_free(kmem_magazine_cache, mag_i); |
| } |
| while ((mag_i = SLIST_FIRST(&depot->empty))) |
| kmem_cache_free(kmem_magazine_cache, mag_i); |
| unlock_depot(depot); |
| } |
| |
| static void kmem_slab_destroy(struct kmem_cache *cp, struct kmem_slab *a_slab) |
| { |
| if (!__use_bufctls(cp)) { |
| arena_free(cp->source, ROUNDDOWN(a_slab, PGSIZE), PGSIZE); |
| } else { |
| struct kmem_bufctl *i, *temp; |
| void *buf_start = (void*)SIZE_MAX; |
| |
| BSD_LIST_FOREACH_SAFE(i, &a_slab->bufctl_freelist, link, temp) { |
| // Track the lowest buffer address, which is the start |
| // of the buffer |
| buf_start = MIN(buf_start, i->buf_addr); |
| /* This is a little dangerous, but we can skip removing, |
| * since we init the freelist when we reuse the slab. */ |
| kmem_cache_free(kmem_bufctl_cache, i); |
| } |
| arena_free(cp->source, buf_start, cp->import_amt); |
| kmem_cache_free(kmem_slab_cache, a_slab); |
| } |
| } |
| |
| /* Once you call destroy, never use this cache again... o/w there may be weird |
| * races, and other serious issues. */ |
| void kmem_cache_destroy(struct kmem_cache *cp) |
| { |
| struct kmem_slab *a_slab, *next; |
| |
| kmc_untrack(cp); |
| del_importing_slab(cp->source, cp); |
| drain_pcpu_caches(cp); |
| depot_destroy(cp); |
| spin_lock_irqsave(&cp->cache_lock); |
| assert(TAILQ_EMPTY(&cp->full_slab_list)); |
| assert(TAILQ_EMPTY(&cp->partial_slab_list)); |
| /* Clean out the empty list. We can't use a regular FOREACH here, since |
| * the link element is stored in the slab struct, which is stored on the |
| * page that we are freeing. */ |
| a_slab = TAILQ_FIRST(&cp->empty_slab_list); |
| while (a_slab) { |
| next = TAILQ_NEXT(a_slab, link); |
| kmem_slab_destroy(cp, a_slab); |
| a_slab = next; |
| } |
| spin_unlock_irqsave(&cp->cache_lock); |
| kmem_trace_warn_notempty(cp); |
| kmem_cache_free(kmem_cache_cache, cp); |
| } |
| |
| static void __try_hash_resize(struct kmem_cache *cp) |
| { |
| struct kmem_bufctl_list *new_tbl, *old_tbl; |
| struct kmem_bufctl *bc_i; |
| unsigned int new_tbl_nr_lists, old_tbl_nr_lists; |
| size_t new_tbl_sz, old_tbl_sz; |
| size_t hash_idx; |
| |
| if (!hash_needs_more(&cp->hh)) |
| return; |
| new_tbl_nr_lists = hash_next_nr_lists(&cp->hh); |
| new_tbl_sz = new_tbl_nr_lists * sizeof(struct kmem_bufctl_list); |
| /* TODO: we only need to pull from base if our arena is a base or we are |
| * inside a kpages arena (keep in mind there could be more than one of |
| * those, depending on how we do NUMA allocs). This might help with |
| * fragmentation. To know this, we'll need the caller to pass us a |
| * flag. */ |
| new_tbl = base_zalloc(NULL, new_tbl_sz, ARENA_INSTANTFIT | MEM_ATOMIC); |
| if (!new_tbl) |
| return; |
| old_tbl = cp->alloc_hash; |
| old_tbl_nr_lists = cp->hh.nr_hash_lists; |
| old_tbl_sz = old_tbl_nr_lists * sizeof(struct kmem_bufctl_list); |
| cp->alloc_hash = new_tbl; |
| hash_incr_nr_lists(&cp->hh); |
| for (int i = 0; i < old_tbl_nr_lists; i++) { |
| while ((bc_i = BSD_LIST_FIRST(&old_tbl[i]))) { |
| BSD_LIST_REMOVE(bc_i, link); |
| hash_idx = hash_ptr(bc_i->buf_addr, |
| cp->hh.nr_hash_bits); |
| BSD_LIST_INSERT_HEAD(&cp->alloc_hash[hash_idx], bc_i, |
| link); |
| } |
| } |
| hash_reset_load_limit(&cp->hh); |
| if (old_tbl != cp->static_hash) |
| base_free(NULL, old_tbl, old_tbl_sz); |
| } |
| |
| /* Helper, tracks the allocation of @bc in the hash table */ |
| static void __track_alloc(struct kmem_cache *cp, struct kmem_bufctl *bc) |
| { |
| size_t hash_idx; |
| |
| hash_idx = hash_ptr(bc->buf_addr, cp->hh.nr_hash_bits); |
| BSD_LIST_INSERT_HEAD(&cp->alloc_hash[hash_idx], bc, link); |
| cp->hh.nr_items++; |
| __try_hash_resize(cp); |
| } |
| |
| /* Helper, looks up and removes the bufctl corresponding to buf. */ |
| static struct kmem_bufctl *__yank_bufctl(struct kmem_cache *cp, void *buf) |
| { |
| struct kmem_bufctl *bc_i; |
| size_t hash_idx; |
| |
| hash_idx = hash_ptr(buf, cp->hh.nr_hash_bits); |
| BSD_LIST_FOREACH(bc_i, &cp->alloc_hash[hash_idx], link) { |
| if (bc_i->buf_addr == buf) { |
| BSD_LIST_REMOVE(bc_i, link); |
| break; |
| } |
| } |
| if (!bc_i) |
| panic("Could not find buf %p in cache %s!", buf, cp->name); |
| return bc_i; |
| } |
| |
| /* Alloc, bypassing the magazines and depot */ |
| static void *__kmem_alloc_from_slab(struct kmem_cache *cp, int flags) |
| { |
| void *retval = NULL; |
| |
| spin_lock_irqsave(&cp->cache_lock); |
| // look at partial list |
| struct kmem_slab *a_slab = TAILQ_FIRST(&cp->partial_slab_list); |
| // if none, go to empty list and get an empty and make it partial |
| if (!a_slab) { |
| // TODO: think about non-sleeping flags |
| if (TAILQ_EMPTY(&cp->empty_slab_list) && |
| !kmem_cache_grow(cp)) { |
| spin_unlock_irqsave(&cp->cache_lock); |
| if (flags & MEM_ERROR) |
| error(ENOMEM, ERROR_FIXME); |
| else |
| panic("[German Accent]: OOM for a small slab growth!!!"); |
| } |
| // move to partial list |
| a_slab = TAILQ_FIRST(&cp->empty_slab_list); |
| TAILQ_REMOVE(&cp->empty_slab_list, a_slab, link); |
| TAILQ_INSERT_HEAD(&cp->partial_slab_list, a_slab, link); |
| } |
| // have a partial now (a_slab), get an item, return item |
| if (!__use_bufctls(cp)) { |
| retval = a_slab->free_small_obj; |
| /* the next free_small_obj address is stored at the beginning of |
| * the current free_small_obj. */ |
| a_slab->free_small_obj = *(uintptr_t**)(a_slab->free_small_obj); |
| } else { |
| // rip the first bufctl out of the partial slab's buf list |
| struct kmem_bufctl *a_bufctl = |
| BSD_LIST_FIRST(&a_slab->bufctl_freelist); |
| |
| BSD_LIST_REMOVE(a_bufctl, link); |
| __track_alloc(cp, a_bufctl); |
| retval = a_bufctl->buf_addr; |
| } |
| a_slab->num_busy_obj++; |
| // Check if we are full, if so, move to the full list |
| if (a_slab->num_busy_obj == a_slab->num_total_obj) { |
| TAILQ_REMOVE(&cp->partial_slab_list, a_slab, link); |
| TAILQ_INSERT_HEAD(&cp->full_slab_list, a_slab, link); |
| } |
| cp->nr_cur_alloc++; |
| cp->nr_direct_allocs_ever++; |
| spin_unlock_irqsave(&cp->cache_lock); |
| if (cp->ctor) { |
| if (cp->ctor(retval, cp->priv, flags)) { |
| warn("Ctor %p failed, probably a bug!"); |
| __kmem_free_to_slab(cp, retval); |
| return NULL; |
| } |
| } |
| return retval; |
| } |
| |
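| /* Frontend allocation path, per the magazine paper: take a round from the |
|  * loaded magazine, swap loaded with prev if prev has rounds, then trade |
|  * prev for a not-empty magazine from the depot, and only as a last resort |
|  * do a direct allocation from the slab layer. */ |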
| void *kmem_cache_alloc(struct kmem_cache *kc, int flags) |
| { |
| struct kmem_pcpu_cache *pcc = get_my_pcpu_cache(kc); |
| struct kmem_depot *depot = &kc->depot; |
| struct kmem_magazine *mag; |
| void *ret; |
| |
| lock_pcu_cache(pcc); |
| try_alloc: |
| if (pcc->loaded->nr_rounds) { |
| ret = pcc->loaded->rounds[pcc->loaded->nr_rounds - 1]; |
| pcc->loaded->nr_rounds--; |
| pcc->nr_allocs_ever++; |
| unlock_pcu_cache(pcc); |
| kmem_trace_alloc(kc, ret); |
| return ret; |
| } |
| if (!mag_is_empty(pcc->prev)) { |
| __swap_mags(pcc); |
| goto try_alloc; |
| } |
| /* Note the lock ordering: pcc -> depot */ |
| lock_depot(depot); |
| mag = SLIST_FIRST(&depot->not_empty); |
| if (mag) { |
| SLIST_REMOVE_HEAD(&depot->not_empty, link); |
| depot->nr_not_empty--; |
| __return_to_depot(kc, pcc->prev); |
| unlock_depot(depot); |
| pcc->prev = pcc->loaded; |
| pcc->loaded = mag; |
| goto try_alloc; |
| } |
| unlock_depot(depot); |
| unlock_pcu_cache(pcc); |
| ret = __kmem_alloc_from_slab(kc, flags); |
| kmem_trace_alloc(kc, ret); |
| return ret; |
| } |
| |
| /* Returns an object to the slab layer. Caller must deconstruct the objects. |
| * Note that objects in the slabs are unconstructed. */ |
| static void __kmem_free_to_slab(struct kmem_cache *cp, void *buf) |
| { |
| struct kmem_slab *a_slab; |
| struct kmem_bufctl *a_bufctl; |
| |
| spin_lock_irqsave(&cp->cache_lock); |
| if (!__use_bufctls(cp)) { |
| // find its slab |
| a_slab = (struct kmem_slab*)(ROUNDDOWN((uintptr_t)buf, PGSIZE) + |
| PGSIZE - sizeof(struct kmem_slab)); |
| /* write location of next free small obj to the space at the |
| * beginning of the buffer, then list buf as the next free small |
| * obj */ |
| *(uintptr_t**)buf = a_slab->free_small_obj; |
| a_slab->free_small_obj = buf; |
| } else { |
| /* Give the bufctl back to the parent slab */ |
| a_bufctl = __yank_bufctl(cp, buf); |
| a_slab = a_bufctl->my_slab; |
| BSD_LIST_INSERT_HEAD(&a_slab->bufctl_freelist, a_bufctl, link); |
| } |
| a_slab->num_busy_obj--; |
| cp->nr_cur_alloc--; |
| // if it was full, move it to partial |
| if (a_slab->num_busy_obj + 1 == a_slab->num_total_obj) { |
| TAILQ_REMOVE(&cp->full_slab_list, a_slab, link); |
| TAILQ_INSERT_HEAD(&cp->partial_slab_list, a_slab, link); |
| } else if (!a_slab->num_busy_obj) { |
| 		// if there are none, move from partial to empty |
| TAILQ_REMOVE(&cp->partial_slab_list, a_slab, link); |
| TAILQ_INSERT_HEAD(&cp->empty_slab_list, a_slab, link); |
| } |
| spin_unlock_irqsave(&cp->cache_lock); |
| } |
| |
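| /* Frontend free path, the mirror of the alloc path: stash the object in |
|  * the loaded magazine if it has room, swap loaded with prev if prev has |
|  * room, then trade prev for an empty magazine from the depot. If the depot |
|  * has none, try a non-waiting allocation of a new magazine; as a last |
|  * resort, deconstruct the object and hand it back to the slab layer. */ |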
| void kmem_cache_free(struct kmem_cache *kc, void *buf) |
| { |
| struct kmem_pcpu_cache *pcc = get_my_pcpu_cache(kc); |
| struct kmem_depot *depot = &kc->depot; |
| struct kmem_magazine *mag; |
| |
| assert(buf); /* catch bugs */ |
| kmem_trace_free(kc, buf); |
| lock_pcu_cache(pcc); |
| try_free: |
| if (pcc->loaded->nr_rounds < pcc->magsize) { |
| pcc->loaded->rounds[pcc->loaded->nr_rounds] = buf; |
| pcc->loaded->nr_rounds++; |
| unlock_pcu_cache(pcc); |
| return; |
| } |
| /* The paper checks 'is empty' here. But we actually just care if it |
| * has room left, not that prev is completely empty. This could be the |
| * case due to magazine resize. */ |
| if (pcc->prev->nr_rounds < pcc->magsize) { |
| __swap_mags(pcc); |
| goto try_free; |
| } |
| lock_depot(depot); |
| /* Here's where the resize magic happens. We'll start using it for the |
| * next magazine. */ |
| pcc->magsize = depot->magsize; |
| mag = SLIST_FIRST(&depot->empty); |
| if (mag) { |
| SLIST_REMOVE_HEAD(&depot->empty, link); |
| depot->nr_empty--; |
| __return_to_depot(kc, pcc->prev); |
| unlock_depot(depot); |
| pcc->prev = pcc->loaded; |
| pcc->loaded = mag; |
| goto try_free; |
| } |
| unlock_depot(depot); |
| /* Need to unlock, in case we end up calling back into ourselves. */ |
| unlock_pcu_cache(pcc); |
| /* don't want to wait on a free. if this fails, we can still just give |
| * it to the slab layer. */ |
| mag = kmem_cache_alloc(kmem_magazine_cache, MEM_ATOMIC); |
| if (mag) { |
| assert(mag->nr_rounds == 0); |
| lock_depot(depot); |
| SLIST_INSERT_HEAD(&depot->empty, mag, link); |
| depot->nr_empty++; |
| unlock_depot(depot); |
| lock_pcu_cache(pcc); |
| goto try_free; |
| } |
| if (kc->dtor) |
| kc->dtor(buf, kc->priv); |
| __kmem_free_to_slab(kc, buf); |
| } |
| |
| /* Back end: internal functions */ |
| /* When this returns, the cache has at least one slab in the empty list. If |
| * page_alloc fails, there are some serious issues. This only grows by one slab |
| * at a time. |
| * |
| * Grab the cache lock before calling this. |
| * |
| * TODO: think about page colouring issues with kernel memory allocation. */ |
| static bool kmem_cache_grow(struct kmem_cache *cp) |
| { |
| struct kmem_slab *a_slab; |
| struct kmem_bufctl *a_bufctl; |
| |
| if (!__use_bufctls(cp)) { |
| void *a_page; |
| |
| /* Careful, this assumes our source is a PGSIZE-aligned |
| * allocator. We could use xalloc to enforce the alignment, but |
| * that'll bypass the qcaches, which we don't want. Caller |
| * beware. */ |
| a_page = arena_alloc(cp->source, PGSIZE, MEM_ATOMIC); |
| if (!a_page) |
| return FALSE; |
| // the slab struct is stored at the end of the page |
| a_slab = (struct kmem_slab*)(a_page + PGSIZE |
| - sizeof(struct kmem_slab)); |
| a_slab->num_busy_obj = 0; |
| a_slab->num_total_obj = (PGSIZE - sizeof(struct kmem_slab)) / |
| cp->obj_size; |
| // TODO: consider staggering this IAW section 4.3 |
| a_slab->free_small_obj = a_page; |
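| 		/* The page layout is thus: objects at the front, slab struct |
| 		 * at the back: |
| 		 * |
| 		 *  [obj 0][obj 1] ... [obj N-1][slack][struct kmem_slab] |
| 		 *  ^ a_page                           ^ a_page + PGSIZE |
| 		 */ |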
| /* Walk and create the free list, which is circular. Each item |
| * stores the location of the next one at the beginning of the |
| * block. */ |
| void *buf = a_slab->free_small_obj; |
| |
| for (int i = 0; i < a_slab->num_total_obj - 1; i++) { |
| *(uintptr_t**)buf = buf + cp->obj_size; |
| buf += cp->obj_size; |
| } |
| *((uintptr_t**)buf) = NULL; |
| } else { |
| void *buf; |
| |
| a_slab = kmem_cache_alloc(kmem_slab_cache, 0); |
| if (!a_slab) |
| return FALSE; |
| buf = arena_alloc(cp->source, cp->import_amt, MEM_ATOMIC); |
| if (!buf) { |
| kmem_cache_free(kmem_slab_cache, a_slab); |
| return FALSE; |
| } |
| a_slab->num_busy_obj = 0; |
| a_slab->num_total_obj = cp->import_amt / cp->obj_size; |
| BSD_LIST_INIT(&a_slab->bufctl_freelist); |
| /* for each buffer, set up a bufctl and point to the buffer */ |
| for (int i = 0; i < a_slab->num_total_obj; i++) { |
| a_bufctl = kmem_cache_alloc(kmem_bufctl_cache, 0); |
| BSD_LIST_INSERT_HEAD(&a_slab->bufctl_freelist, a_bufctl, |
| link); |
| a_bufctl->buf_addr = buf; |
| a_bufctl->my_slab = a_slab; |
| buf += cp->obj_size; |
| } |
| } |
| // add a_slab to the empty_list |
| TAILQ_INSERT_HEAD(&cp->empty_slab_list, a_slab, link); |
| |
| return TRUE; |
| } |
| |
| /* This deallocs every slab from the empty list. TODO: think a bit more about |
| * this. We can do things like not free all of the empty lists to prevent |
| * thrashing. See 3.4 in the paper. */ |
| void kmem_cache_reap(struct kmem_cache *cp) |
| { |
| struct kmem_slab *a_slab, *next; |
| |
| 	// Destroy all empty slabs. See the note in kmem_cache_destroy about |
| 	// why we can't use a regular FOREACH here. |
| spin_lock_irqsave(&cp->cache_lock); |
| a_slab = TAILQ_FIRST(&cp->empty_slab_list); |
| while (a_slab) { |
| next = TAILQ_NEXT(a_slab, link); |
| kmem_slab_destroy(cp, a_slab); |
| a_slab = next; |
| } |
| spin_unlock_irqsave(&cp->cache_lock); |
| } |
| |
| |
| /* Tracing */ |
| |
| static void kmem_trace_ht_foreach(struct kmem_trace_ht *ht, |
| void (*f)(struct kmem_trace *, void *), |
| void *arg) |
| { |
| struct kmem_trace *tr; |
| struct hlist_node *temp; |
| |
| spin_lock_irqsave(&ht->lock); |
| for (int i = 0; i < ht->hh.nr_hash_lists; i++) |
| hlist_for_each_entry_safe(tr, temp, &ht->ht[i], hash) |
| f(tr, arg); |
| spin_unlock_irqsave(&ht->lock); |
| } |
| |
| static void __kmem_trace_print(struct kmem_trace *tr, void *arg) |
| { |
| struct sized_alloc *sza = arg; |
| |
| sza_printf(sza, "Obj %p, from %s:\n----\n", tr->obj, tr->str); |
| sza_print_backtrace_list(sza, tr->pcs, tr->nr_pcs); |
| sza_printf(sza, "\n"); |
| } |
| |
| static void __kmem_trace_bytes_needed(struct kmem_trace *tr, void *arg) |
| { |
| size_t *x = arg; |
| |
| /* Just a guess of how much room we'll need, plus fudge. */ |
| *x += tr->nr_pcs * 80 + 100; |
| } |
| |
| struct sized_alloc *kmem_trace_print(struct kmem_cache *kc) |
| { |
| struct sized_alloc *sza; |
| size_t amt = 100; |
| |
| kmem_trace_ht_foreach(&kc->trace_ht, __kmem_trace_bytes_needed, &amt); |
| sza = sized_kzmalloc(amt, MEM_WAIT); |
| sza_printf(sza, "Dumping outstanding allocs from %s\n", kc->name); |
| sza_printf(sza, "-------------------------\n"); |
| kmem_trace_ht_foreach(&kc->trace_ht, __kmem_trace_print, sza); |
| return sza; |
| } |
| |
| static void __kmem_trace_drop(struct kmem_trace *tr, void *arg) |
| { |
| hlist_del(&tr->hash); |
| kmem_cache_free(kmem_trace_cache, tr); |
| } |
| |
| void kmem_trace_reset(struct kmem_cache *kc) |
| { |
| kmem_trace_ht_foreach(&kc->trace_ht, __kmem_trace_drop, NULL); |
| } |
| |
| /* It's a bug to ever have a left-over trace when we delete a KMC, and probably |
| * will never happen. If it does, we can expand the debugging info. */ |
| static void __kmem_trace_warn_and_drop(struct kmem_trace *tr, void *arg) |
| { |
| warn("KMC had an object! (%p)", tr->obj); |
| __kmem_trace_drop(tr, NULL); |
| } |
| |
| static void kmem_trace_warn_notempty(struct kmem_cache *kc) |
| { |
| kmem_trace_ht_foreach(&kc->trace_ht, __kmem_trace_warn_and_drop, NULL); |
| } |
| |
| int kmem_trace_start(struct kmem_cache *kc) |
| { |
| spin_lock_irqsave(&kc->cache_lock); |
| if (kc->flags & KMC_NOTRACE) { |
| spin_unlock_irqsave(&kc->cache_lock); |
| return -1; |
| } |
| WRITE_ONCE(kc->flags, kc->flags | __KMC_TRACED | __KMC_EVER_TRACED); |
| spin_unlock_irqsave(&kc->cache_lock); |
| return 0; |
| } |
| |
| /* Note that the tracers locklessly-peek at the flags, and we may have an |
| * allocation complete its trace after we stop. You could conceivably stop, |
| * reset/remove all traces, and then have a trace appear still. */ |
| void kmem_trace_stop(struct kmem_cache *kc) |
| { |
| spin_lock_irqsave(&kc->cache_lock); |
| WRITE_ONCE(kc->flags, kc->flags & ~__KMC_TRACED); |
| spin_unlock_irqsave(&kc->cache_lock); |
| } |
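| |
| /* Example trace session (a sketch; how the resulting sized_alloc gets |
|  * printed and freed is up to the caller): |
|  * |
|  *	struct sized_alloc *sza; |
|  * |
|  *	if (!kmem_trace_start(kc)) { |
|  *		// ... exercise the cache ... |
|  *		sza = kmem_trace_print(kc);	// dump outstanding allocs |
|  *		// ... output and free the sza ... |
|  *		kmem_trace_stop(kc); |
|  *		kmem_trace_reset(kc);	// drop any leftover traces |
|  *	} |
|  */ |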
| |
| static void kmem_trace_ht_init(struct kmem_trace_ht *ht) |
| { |
| spinlock_init(&ht->lock); |
| ht->ht = ht->static_ht; |
| hash_init_hh(&ht->hh); |
| for (int i = 0; i < ht->hh.nr_hash_lists; i++) |
| INIT_HLIST_HEAD(&ht->ht[i]); |
| } |
| |
| static void kmem_trace_ht_insert(struct kmem_trace_ht *ht, |
| struct kmem_trace *tr) |
| { |
| unsigned long hash_val = __generic_hash(tr->obj); |
| struct hlist_head *bucket; |
| |
| spin_lock_irqsave(&ht->lock); |
| 	bucket = &ht->ht[hash_val % ht->hh.nr_hash_lists]; |
| hlist_add_head(&tr->hash, bucket); |
| spin_unlock_irqsave(&ht->lock); |
| } |
| |
| static struct kmem_trace *kmem_trace_ht_remove(struct kmem_trace_ht *ht, |
| void *obj) |
| { |
| struct kmem_trace *tr; |
| unsigned long hash_val = __generic_hash(obj); |
| struct hlist_head *bucket; |
| |
| spin_lock_irqsave(&ht->lock); |
| 	bucket = &ht->ht[hash_val % ht->hh.nr_hash_lists]; |
| hlist_for_each_entry(tr, bucket, hash) { |
| if (tr->obj == obj) { |
| hlist_del(&tr->hash); |
| break; |
| } |
| } |
| spin_unlock_irqsave(&ht->lock); |
| return tr; |
| } |
| |
| static void kmem_trace_free(struct kmem_cache *kc, void *obj) |
| { |
| struct kmem_trace *tr; |
| |
| /* Even if we turn off tracing, we still want to free traces that we may |
| * have collected earlier. Otherwise, those objects will never get |
| * removed from the trace, and could lead to confusion if they are |
| * reallocated and traced again. Still, we don't want to pay the cost |
| * on every free for untraced KCs. */ |
| if (!(READ_ONCE(kc->flags) & __KMC_EVER_TRACED)) |
| return; |
| tr = kmem_trace_ht_remove(&kc->trace_ht, obj); |
| if (tr) |
| kmem_cache_free(kmem_trace_cache, tr); |
| } |
| |
| static void trace_context(struct kmem_trace *tr) |
| { |
| if (is_ktask(current_kthread)) { |
| snprintf(tr->str, sizeof(tr->str), "ktask %s", |
| current_kthread->name); |
| } else if (current) { |
| /* When we're handling a page fault, knowing the user PC helps |
| * determine the source. A backtrace is nice, but harder to do. |
| * Since we're deep in MM code and holding locks, we can't use |
| * copy_from_user, which uses the page-fault fixups. If you |
| * need to get the BT, stash it in the genbuf in |
| * handle_page_fault(). */ |
| snprintf(tr->str, sizeof(tr->str), "PID %d %s: %s, %p", |
| current->pid, current->progname, |
| current_kthread->name, |
| current_ctx ? get_user_ctx_pc(current_ctx) : 0); |
| } else { |
| snprintf(tr->str, sizeof(tr->str), "(none)"); |
| } |
| tr->str[sizeof(tr->str) - 1] = 0; |
| } |
| |
| static void kmem_trace_alloc(struct kmem_cache *kc, void *obj) |
| { |
| struct kmem_trace *tr; |
| |
| if (!(READ_ONCE(kc->flags) & __KMC_TRACED)) |
| return; |
| tr = kmem_cache_alloc(kmem_trace_cache, MEM_ATOMIC); |
| if (!tr) |
| return; |
| tr->obj = obj; |
| tr->nr_pcs = backtrace_list(get_caller_pc(), get_caller_fp(), tr->pcs, |
| ARRAY_SIZE(tr->pcs)); |
| trace_context(tr); |
| kmem_trace_ht_insert(&kc->trace_ht, tr); |
| } |